diff options
author | Paul Mundt <lethal@linux-sh.org> | 2011-01-13 06:06:28 (GMT) |
---|---|---|
committer | Paul Mundt <lethal@linux-sh.org> | 2011-01-13 06:06:28 (GMT) |
commit | f43dc23d5ea91fca257be02138a255f02d98e806 (patch) | |
tree | b29722f6e965316e90ac97abf79923ced250dc21 /mm | |
parent | f8e53553f452dcbf67cb89c8cba63a1cd6eb4cc0 (diff) | |
parent | 4162cf64973df51fc885825bc9ca4d055891c49f (diff) | |
download | linux-fsl-qoriq-f43dc23d5ea91fca257be02138a255f02d98e806.tar.xz |
Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/torvalds/linux-2.6 into common/serial-rework
Conflicts:
arch/sh/kernel/cpu/sh2/setup-sh7619.c
arch/sh/kernel/cpu/sh2a/setup-mxg.c
arch/sh/kernel/cpu/sh2a/setup-sh7201.c
arch/sh/kernel/cpu/sh2a/setup-sh7203.c
arch/sh/kernel/cpu/sh2a/setup-sh7206.c
arch/sh/kernel/cpu/sh3/setup-sh7705.c
arch/sh/kernel/cpu/sh3/setup-sh770x.c
arch/sh/kernel/cpu/sh3/setup-sh7710.c
arch/sh/kernel/cpu/sh3/setup-sh7720.c
arch/sh/kernel/cpu/sh4/setup-sh4-202.c
arch/sh/kernel/cpu/sh4/setup-sh7750.c
arch/sh/kernel/cpu/sh4/setup-sh7760.c
arch/sh/kernel/cpu/sh4a/setup-sh7343.c
arch/sh/kernel/cpu/sh4a/setup-sh7366.c
arch/sh/kernel/cpu/sh4a/setup-sh7722.c
arch/sh/kernel/cpu/sh4a/setup-sh7723.c
arch/sh/kernel/cpu/sh4a/setup-sh7724.c
arch/sh/kernel/cpu/sh4a/setup-sh7763.c
arch/sh/kernel/cpu/sh4a/setup-sh7770.c
arch/sh/kernel/cpu/sh4a/setup-sh7780.c
arch/sh/kernel/cpu/sh4a/setup-sh7785.c
arch/sh/kernel/cpu/sh4a/setup-sh7786.c
arch/sh/kernel/cpu/sh4a/setup-shx3.c
arch/sh/kernel/cpu/sh5/setup-sh5.c
drivers/serial/sh-sci.c
drivers/serial/sh-sci.h
include/linux/serial_sci.h
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 99 | ||||
-rw-r--r-- | mm/Kconfig.debug | 12 | ||||
-rw-r--r-- | mm/Makefile | 20 | ||||
-rw-r--r-- | mm/allocpercpu.c | 149 | ||||
-rw-r--r-- | mm/backing-dev.c | 552 | ||||
-rw-r--r-- | mm/bootmem.c | 267 | ||||
-rw-r--r-- | mm/bounce.c | 3 | ||||
-rw-r--r-- | mm/compaction.c | 605 | ||||
-rw-r--r-- | mm/dmapool.c | 4 | ||||
-rw-r--r-- | mm/fadvise.c | 10 | ||||
-rw-r--r-- | mm/failslab.c | 19 | ||||
-rw-r--r-- | mm/filemap.c | 472 | ||||
-rw-r--r-- | mm/filemap_xip.c | 5 | ||||
-rw-r--r-- | mm/fremap.c | 9 | ||||
-rw-r--r-- | mm/highmem.c | 56 | ||||
-rw-r--r-- | mm/hugetlb.c | 1089 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 141 | ||||
-rw-r--r-- | mm/init-mm.c | 6 | ||||
-rw-r--r-- | mm/internal.h | 47 | ||||
-rw-r--r-- | mm/kmemleak-test.c | 6 | ||||
-rw-r--r-- | mm/kmemleak.c | 935 | ||||
-rw-r--r-- | mm/ksm.c | 1964 | ||||
-rw-r--r-- | mm/maccess.c | 13 | ||||
-rw-r--r-- | mm/madvise.c | 90 | ||||
-rw-r--r-- | mm/memblock.c | 842 | ||||
-rw-r--r-- | mm/memcontrol.c | 3280 | ||||
-rw-r--r-- | mm/memory-failure.c | 1467 | ||||
-rw-r--r-- | mm/memory.c | 711 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 176 | ||||
-rw-r--r-- | mm/mempolicy.c | 643 | ||||
-rw-r--r-- | mm/mempool.c | 9 | ||||
-rw-r--r-- | mm/migrate.c | 496 | ||||
-rw-r--r-- | mm/mincore.c | 232 | ||||
-rw-r--r-- | mm/mlock.c | 235 | ||||
-rw-r--r-- | mm/mmap.c | 501 | ||||
-rw-r--r-- | mm/mmu_context.c | 62 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 21 | ||||
-rw-r--r-- | mm/mmzone.c | 21 | ||||
-rw-r--r-- | mm/mprotect.c | 5 | ||||
-rw-r--r-- | mm/mremap.c | 271 | ||||
-rw-r--r-- | mm/msync.c | 2 | ||||
-rw-r--r-- | mm/nommu.c | 482 | ||||
-rw-r--r-- | mm/oom_kill.c | 778 | ||||
-rw-r--r-- | mm/page-writeback.c | 573 | ||||
-rw-r--r-- | mm/page_alloc.c | 1307 | ||||
-rw-r--r-- | mm/page_cgroup.c | 61 | ||||
-rw-r--r-- | mm/page_io.c | 20 | ||||
-rw-r--r-- | mm/page_isolation.c | 3 | ||||
-rw-r--r-- | mm/pagewalk.c | 60 | ||||
-rw-r--r-- | mm/pdflush.c | 269 | ||||
-rw-r--r-- | mm/percpu-km.c | 108 | ||||
-rw-r--r-- | mm/percpu-vm.c | 451 | ||||
-rw-r--r-- | mm/percpu.c | 1654 | ||||
-rw-r--r-- | mm/quicklist.c | 6 | ||||
-rw-r--r-- | mm/readahead.c | 21 | ||||
-rw-r--r-- | mm/rmap.c | 872 | ||||
-rw-r--r-- | mm/shmem.c | 428 | ||||
-rw-r--r-- | mm/shmem_acl.c | 197 | ||||
-rw-r--r-- | mm/slab.c | 522 | ||||
-rw-r--r-- | mm/slob.c | 38 | ||||
-rw-r--r-- | mm/slub.c | 1374 | ||||
-rw-r--r-- | mm/sparse-vmemmap.c | 74 | ||||
-rw-r--r-- | mm/sparse.c | 213 | ||||
-rw-r--r-- | mm/swap.c | 13 | ||||
-rw-r--r-- | mm/swap_state.c | 145 | ||||
-rw-r--r-- | mm/swapfile.c | 1013 | ||||
-rw-r--r-- | mm/thrash.c | 32 | ||||
-rw-r--r-- | mm/truncate.c | 167 | ||||
-rw-r--r-- | mm/util.c | 26 | ||||
-rw-r--r-- | mm/vmalloc.c | 830 | ||||
-rw-r--r-- | mm/vmscan.c | 1499 | ||||
-rw-r--r-- | mm/vmstat.c | 495 |
72 files changed, 21813 insertions, 7465 deletions
@@ -67,7 +67,7 @@ config DISCONTIGMEM config SPARSEMEM def_bool y - depends on SPARSEMEM_MANUAL + depends on (!SELECT_MEMORY_MODEL && ARCH_SPARSEMEM_ENABLE) || SPARSEMEM_MANUAL config FLATMEM def_bool y @@ -115,6 +115,10 @@ config SPARSEMEM_EXTREME config SPARSEMEM_VMEMMAP_ENABLE bool +config SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + def_bool y + depends on SPARSEMEM && X86_64 + config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" depends on SPARSEMEM && SPARSEMEM_VMEMMAP_ENABLE @@ -124,15 +128,15 @@ config SPARSEMEM_VMEMMAP pfn_to_page and page_to_pfn operations. This is the most efficient option when sufficient kernel resources are available. +config HAVE_MEMBLOCK + boolean + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" depends on SPARSEMEM || X86_64_ACPI_NUMA - depends on HOTPLUG && !(HIBERNATION && !S390) && ARCH_ENABLE_MEMORY_HOTPLUG - depends on (IA64 || X86 || PPC64 || SUPERH || S390) - -comment "Memory hotplug is currently incompatible with Software Suspend" - depends on SPARSEMEM && HOTPLUG && HIBERNATION && !S390 + depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG + depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) config MEMORY_HOTPLUG_SPARSE def_bool y @@ -153,7 +157,7 @@ config MEMORY_HOTREMOVE # config PAGEFLAGS_EXTENDED def_bool y - depends on 64BIT || SPARSEMEM_VMEMMAP || !NUMA || !SPARSEMEM + depends on 64BIT || SPARSEMEM_VMEMMAP || !SPARSEMEM # Heavily threaded applications may benefit from splitting the mm-wide # page_table_lock, so that faults on different parts of the user address @@ -161,25 +165,38 @@ config PAGEFLAGS_EXTENDED # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. # PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. +# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page. # config SPLIT_PTLOCK_CPUS int - default "4096" if ARM && !CPU_CACHE_VIPT - default "4096" if PARISC && !PA20 + default "999999" if ARM && !CPU_CACHE_VIPT + default "999999" if PARISC && !PA20 + default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" # +# support for memory compaction +config COMPACTION + bool "Allow for memory compaction" + select MIGRATION + depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + help + Allows the compaction of memory for the allocation of huge pages. + +# # support for page migration # config MIGRATION bool "Page migration" def_bool y - depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE + depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION help Allows the migration of the physical location of pages of processes - while the virtual addresses are not changed. This is useful for - example on NUMA systems to put pages nearer to the processors accessing - the page. + while the virtual addresses are not changed. This is useful in + two situations. The first is on NUMA systems to put pages nearer + to the processors accessing. The second is when allocating huge + pages as migration can relocate pages to satisfy a huge page + allocation instead of reclaiming. config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT @@ -196,26 +213,33 @@ config BOUNCE config NR_QUICK int depends on QUICKLIST - default "2" if SUPERH || AVR32 + default "2" if AVR32 default "1" config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS -config HAVE_MLOCK - bool - default y if MMU=y - -config HAVE_MLOCKED_PAGE_BIT - bool - default y if HAVE_MLOCK=y - config MMU_NOTIFIER bool +config KSM + bool "Enable KSM for page merging" + depends on MMU + help + Enable Kernel Samepage Merging: KSM periodically scans those areas + of an application's address space that an app has advised may be + mergeable. When it finds pages of identical content, it replaces + the many instances by a single page with that content, so + saving memory until one or another app needs to modify the content. + Recommended for use with KVM, or with other duplicative applications. + See Documentation/vm/ksm.txt for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). + config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" + depends on MMU default 4096 help This is the portion of low virtual memory which should be protected @@ -225,13 +249,30 @@ config DEFAULT_MMAP_MIN_ADDR For most ia64, ppc64 and x86 users with lots of address space a value of 65536 is reasonable and should cause no problems. On arm and other archs it should not be higher than 32768. - Programs which use vm86 functionality would either need additional - permissions from either the LSM or the capabilities module or have - this protection disabled. + Programs which use vm86 functionality or have some need to map + this low address space will need CAP_SYS_RAWIO or disable this + protection by setting the value to 0. This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. +config ARCH_SUPPORTS_MEMORY_FAILURE + bool + +config MEMORY_FAILURE + depends on MMU + depends on ARCH_SUPPORTS_MEMORY_FAILURE + bool "Enable recovery from hardware memory errors" + help + Enables code to recover from some memory failures on systems + with MCA recovery. This allows a system to continue running + even when some of its memory has uncorrected errors. This requires + special hardware support and typically ECC memory. + +config HWPOISON_INJECT + tristate "HWPoison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL && PROC_FS + select PROC_PAGE_MONITOR config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" @@ -260,3 +301,11 @@ config NOMMU_INITIAL_TRIM_EXCESS of 1 says that all excess pages should be trimmed. See Documentation/nommu-mmap.txt for more information. + +# +# UP and nommu archs use km based percpu allocator +# +config NEED_PER_CPU_KM + depends on !SMP + bool + default y diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index aa99fd1..af7cfb4 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -6,7 +6,7 @@ config DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types - of memory corruptions. + of memory corruption. config WANT_PAGE_DEBUG_FLAGS bool @@ -17,11 +17,11 @@ config PAGE_POISONING depends on !HIBERNATION select DEBUG_PAGEALLOC select WANT_PAGE_DEBUG_FLAGS - help + ---help--- Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages(). This results in a large slowdown, - but helps to find certain types of memory corruptions. + but helps to find certain types of memory corruption. - This option cannot enalbe with hibernation. Otherwise, it will get - wrong messages for memory corruption because the free pages are not - saved to the suspend image. + This option cannot be enabled in combination with hibernation as + that would result in incorrect warnings of memory corruption after + a resume because free pages are not saved to the suspend image. diff --git a/mm/Makefile b/mm/Makefile index 5e0bd64..f73f75a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,16 +5,18 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o + vmalloc.o pagewalk.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ - maccess.o page_alloc.o page-writeback.o pdflush.o \ + maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o $(mmu-y) + page_isolation.o mm_init.o mmu_context.o percpu.o \ + $(mmu-y) obj-y += init-mm.o -obj-$(CONFIG_PROC_PAGE_MONITOR) += pagewalk.o +obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o + obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o thrash.o obj-$(CONFIG_HAS_DMA) += dmapool.o @@ -22,9 +24,10 @@ obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o obj-$(CONFIG_SPARSEMEM) += sparse.o obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o -obj-$(CONFIG_TMPFS_POSIX_ACL) += shmem_acl.o obj-$(CONFIG_SLOB) += slob.o +obj-$(CONFIG_COMPACTION) += compaction.o obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +obj-$(CONFIG_KSM) += ksm.o obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o @@ -33,12 +36,9 @@ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o -ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA -obj-$(CONFIG_SMP) += percpu.o -else -obj-$(CONFIG_SMP) += allocpercpu.o -endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o +obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c deleted file mode 100644 index dfdee6a..0000000 --- a/mm/allocpercpu.c +++ /dev/null @@ -1,149 +0,0 @@ -/* - * linux/mm/allocpercpu.c - * - * Separated from slab.c August 11, 2006 Christoph Lameter - */ -#include <linux/mm.h> -#include <linux/module.h> - -#ifndef cache_line_size -#define cache_line_size() L1_CACHE_BYTES -#endif - -/** - * percpu_depopulate - depopulate per-cpu data for given cpu - * @__pdata: per-cpu data to depopulate - * @cpu: depopulate per-cpu data for this cpu - * - * Depopulating per-cpu data for a cpu going offline would be a typical - * use case. You need to register a cpu hotplug handler for that purpose. - */ -static void percpu_depopulate(void *__pdata, int cpu) -{ - struct percpu_data *pdata = __percpu_disguise(__pdata); - - kfree(pdata->ptrs[cpu]); - pdata->ptrs[cpu] = NULL; -} - -/** - * percpu_depopulate_mask - depopulate per-cpu data for some cpu's - * @__pdata: per-cpu data to depopulate - * @mask: depopulate per-cpu data for cpu's selected through mask bits - */ -static void __percpu_depopulate_mask(void *__pdata, const cpumask_t *mask) -{ - int cpu; - for_each_cpu_mask_nr(cpu, *mask) - percpu_depopulate(__pdata, cpu); -} - -#define percpu_depopulate_mask(__pdata, mask) \ - __percpu_depopulate_mask((__pdata), &(mask)) - -/** - * percpu_populate - populate per-cpu data for given cpu - * @__pdata: per-cpu data to populate further - * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @cpu: populate per-data for this cpu - * - * Populating per-cpu data for a cpu coming online would be a typical - * use case. You need to register a cpu hotplug handler for that purpose. - * Per-cpu object is populated with zeroed buffer. - */ -static void *percpu_populate(void *__pdata, size_t size, gfp_t gfp, int cpu) -{ - struct percpu_data *pdata = __percpu_disguise(__pdata); - int node = cpu_to_node(cpu); - - /* - * We should make sure each CPU gets private memory. - */ - size = roundup(size, cache_line_size()); - - BUG_ON(pdata->ptrs[cpu]); - if (node_online(node)) - pdata->ptrs[cpu] = kmalloc_node(size, gfp|__GFP_ZERO, node); - else - pdata->ptrs[cpu] = kzalloc(size, gfp); - return pdata->ptrs[cpu]; -} - -/** - * percpu_populate_mask - populate per-cpu data for more cpu's - * @__pdata: per-cpu data to populate further - * @size: size of per-cpu object - * @gfp: may sleep or not etc. - * @mask: populate per-cpu data for cpu's selected through mask bits - * - * Per-cpu objects are populated with zeroed buffers. - */ -static int __percpu_populate_mask(void *__pdata, size_t size, gfp_t gfp, - cpumask_t *mask) -{ - cpumask_t populated; - int cpu; - - cpus_clear(populated); - for_each_cpu_mask_nr(cpu, *mask) - if (unlikely(!percpu_populate(__pdata, size, gfp, cpu))) { - __percpu_depopulate_mask(__pdata, &populated); - return -ENOMEM; - } else - cpu_set(cpu, populated); - return 0; -} - -#define percpu_populate_mask(__pdata, size, gfp, mask) \ - __percpu_populate_mask((__pdata), (size), (gfp), &(mask)) - -/** - * alloc_percpu - initial setup of per-cpu data - * @size: size of per-cpu object - * @align: alignment - * - * Allocate dynamic percpu area. Percpu objects are populated with - * zeroed buffers. - */ -void *__alloc_percpu(size_t size, size_t align) -{ - /* - * We allocate whole cache lines to avoid false sharing - */ - size_t sz = roundup(nr_cpu_ids * sizeof(void *), cache_line_size()); - void *pdata = kzalloc(sz, GFP_KERNEL); - void *__pdata = __percpu_disguise(pdata); - - /* - * Can't easily make larger alignment work with kmalloc. WARN - * on it. Larger alignment should only be used for module - * percpu sections on SMP for which this path isn't used. - */ - WARN_ON_ONCE(align > SMP_CACHE_BYTES); - - if (unlikely(!pdata)) - return NULL; - if (likely(!__percpu_populate_mask(__pdata, size, GFP_KERNEL, - &cpu_possible_map))) - return __pdata; - kfree(pdata); - return NULL; -} -EXPORT_SYMBOL_GPL(__alloc_percpu); - -/** - * free_percpu - final cleanup of per-cpu data - * @__pdata: object to clean up - * - * We simply clean up any per-cpu object left. No need for the client to - * track and specify through a bis mask which per-cpu objects are to free. - */ -void free_percpu(void *__pdata) -{ - if (unlikely(!__pdata)) - return; - __percpu_depopulate_mask(__pdata, cpu_possible_mask); - kfree(__percpu_disguise(__pdata)); -} -EXPORT_SYMBOL_GPL(free_percpu); diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 493b468..027100d 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,12 +1,18 @@ #include <linux/wait.h> #include <linux/backing-dev.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include <linux/fs.h> #include <linux/pagemap.h> +#include <linux/mm.h> #include <linux/sched.h> #include <linux/module.h> #include <linux/writeback.h> #include <linux/device.h> +#include <trace/events/writeback.h> + +static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0); void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) { @@ -14,6 +20,7 @@ void default_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) EXPORT_SYMBOL(default_unplug_io_fn); struct backing_dev_info default_backing_dev_info = { + .name = "default", .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE, .state = 0, .capabilities = BDI_CAP_MAP_COPY, @@ -21,8 +28,29 @@ struct backing_dev_info default_backing_dev_info = { }; EXPORT_SYMBOL_GPL(default_backing_dev_info); +struct backing_dev_info noop_backing_dev_info = { + .name = "noop", + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, +}; +EXPORT_SYMBOL_GPL(noop_backing_dev_info); + static struct class *bdi_class; +/* + * bdi_lock protects updates to bdi_list and bdi_pending_list, as well as + * reader side protection for bdi_pending_list. bdi_list has RCU reader side + * locking. + */ +DEFINE_SPINLOCK(bdi_lock); +LIST_HEAD(bdi_list); +LIST_HEAD(bdi_pending_list); + +static struct task_struct *sync_supers_tsk; +static struct timer_list sync_supers_timer; + +static int bdi_sync_supers(void *); +static void sync_supers_timer_fn(unsigned long); + #ifdef CONFIG_DEBUG_FS #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -37,11 +65,25 @@ static void bdi_debug_init(void) static int bdi_debug_stats_show(struct seq_file *m, void *v) { struct backing_dev_info *bdi = m->private; + struct bdi_writeback *wb = &bdi->wb; unsigned long background_thresh; unsigned long dirty_thresh; unsigned long bdi_thresh; - - get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi); + unsigned long nr_dirty, nr_io, nr_more_io, nr_wb; + struct inode *inode; + + nr_wb = nr_dirty = nr_io = nr_more_io = 0; + spin_lock(&inode_lock); + list_for_each_entry(inode, &wb->b_dirty, i_wb_list) + nr_dirty++; + list_for_each_entry(inode, &wb->b_io, i_wb_list) + nr_io++; + list_for_each_entry(inode, &wb->b_more_io, i_wb_list) + nr_more_io++; + spin_unlock(&inode_lock); + + global_dirty_limits(&background_thresh, &dirty_thresh); + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); #define K(x) ((x) << (PAGE_SHIFT - 10)) seq_printf(m, @@ -49,12 +91,17 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiReclaimable: %8lu kB\n" "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" - "BackgroundThresh: %8lu kB\n", + "BackgroundThresh: %8lu kB\n" + "b_dirty: %8lu\n" + "b_io: %8lu\n" + "b_more_io: %8lu\n" + "bdi_list: %8u\n" + "state: %8lx\n", (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)), (unsigned long) K(bdi_stat(bdi, BDI_RECLAIMABLE)), - K(bdi_thresh), - K(dirty_thresh), - K(background_thresh)); + K(bdi_thresh), K(dirty_thresh), + K(background_thresh), nr_dirty, nr_io, nr_more_io, + !list_empty(&bdi->bdi_list), bdi->state); #undef K return 0; @@ -175,6 +222,9 @@ static struct device_attribute bdi_dev_attrs[] = { static __init int bdi_class_init(void) { bdi_class = class_create(THIS_MODULE, "bdi"); + if (IS_ERR(bdi_class)) + return PTR_ERR(bdi_class); + bdi_class->dev_attrs = bdi_dev_attrs; bdi_debug_init(); return 0; @@ -185,37 +235,325 @@ static int __init default_bdi_init(void) { int err; + sync_supers_tsk = kthread_run(bdi_sync_supers, NULL, "sync_supers"); + BUG_ON(IS_ERR(sync_supers_tsk)); + + setup_timer(&sync_supers_timer, sync_supers_timer_fn, 0); + bdi_arm_supers_timer(); + err = bdi_init(&default_backing_dev_info); if (!err) bdi_register(&default_backing_dev_info, NULL, "default"); + err = bdi_init(&noop_backing_dev_info); return err; } subsys_initcall(default_bdi_init); +int bdi_has_dirty_io(struct backing_dev_info *bdi) +{ + return wb_has_dirty_io(&bdi->wb); +} + +static void bdi_flush_io(struct backing_dev_info *bdi) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .range_cyclic = 1, + .nr_to_write = 1024, + }; + + writeback_inodes_wb(&bdi->wb, &wbc); +} + +/* + * kupdated() used to do this. We cannot do it from the bdi_forker_thread() + * or we risk deadlocking on ->s_umount. The longer term solution would be + * to implement sync_supers_bdi() or similar and simply do it from the + * bdi writeback thread individually. + */ +static int bdi_sync_supers(void *unused) +{ + set_user_nice(current, 0); + + while (!kthread_should_stop()) { + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + + /* + * Do this periodically, like kupdated() did before. + */ + sync_supers(); + } + + return 0; +} + +void bdi_arm_supers_timer(void) +{ + unsigned long next; + + if (!dirty_writeback_interval) + return; + + next = msecs_to_jiffies(dirty_writeback_interval * 10) + jiffies; + mod_timer(&sync_supers_timer, round_jiffies_up(next)); +} + +static void sync_supers_timer_fn(unsigned long unused) +{ + wake_up_process(sync_supers_tsk); + bdi_arm_supers_timer(); +} + +static void wakeup_timer_fn(unsigned long data) +{ + struct backing_dev_info *bdi = (struct backing_dev_info *)data; + + spin_lock_bh(&bdi->wb_lock); + if (bdi->wb.task) { + trace_writeback_wake_thread(bdi); + wake_up_process(bdi->wb.task); + } else { + /* + * When bdi tasks are inactive for long time, they are killed. + * In this case we have to wake-up the forker thread which + * should create and run the bdi thread. + */ + trace_writeback_wake_forker_thread(bdi); + wake_up_process(default_backing_dev_info.wb.task); + } + spin_unlock_bh(&bdi->wb_lock); +} + +/* + * This function is used when the first inode for this bdi is marked dirty. It + * wakes-up the corresponding bdi thread which should then take care of the + * periodic background write-out of dirty inodes. Since the write-out would + * starts only 'dirty_writeback_interval' centisecs from now anyway, we just + * set up a timer which wakes the bdi thread up later. + * + * Note, we wouldn't bother setting up the timer, but this function is on the + * fast-path (used by '__mark_inode_dirty()'), so we save few context switches + * by delaying the wake-up. + */ +void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi) +{ + unsigned long timeout; + + timeout = msecs_to_jiffies(dirty_writeback_interval * 10); + mod_timer(&bdi->wb.wakeup_timer, jiffies + timeout); +} + +/* + * Calculate the longest interval (jiffies) bdi threads are allowed to be + * inactive. + */ +static unsigned long bdi_longest_inactive(void) +{ + unsigned long interval; + + interval = msecs_to_jiffies(dirty_writeback_interval * 10); + return max(5UL * 60 * HZ, interval); +} + +static int bdi_forker_thread(void *ptr) +{ + struct bdi_writeback *me = ptr; + + current->flags |= PF_SWAPWRITE; + set_freezable(); + + /* + * Our parent may run at a different priority, just set us to normal + */ + set_user_nice(current, 0); + + for (;;) { + struct task_struct *task = NULL; + struct backing_dev_info *bdi; + enum { + NO_ACTION, /* Nothing to do */ + FORK_THREAD, /* Fork bdi thread */ + KILL_THREAD, /* Kill inactive bdi thread */ + } action = NO_ACTION; + + /* + * Temporary measure, we want to make sure we don't see + * dirty data on the default backing_dev_info + */ + if (wb_has_dirty_io(me) || !list_empty(&me->bdi->work_list)) { + del_timer(&me->wakeup_timer); + wb_do_writeback(me, 0); + } + + spin_lock_bh(&bdi_lock); + set_current_state(TASK_INTERRUPTIBLE); + + list_for_each_entry(bdi, &bdi_list, bdi_list) { + bool have_dirty_io; + + if (!bdi_cap_writeback_dirty(bdi) || + bdi_cap_flush_forker(bdi)) + continue; + + WARN(!test_bit(BDI_registered, &bdi->state), + "bdi %p/%s is not registered!\n", bdi, bdi->name); + + have_dirty_io = !list_empty(&bdi->work_list) || + wb_has_dirty_io(&bdi->wb); + + /* + * If the bdi has work to do, but the thread does not + * exist - create it. + */ + if (!bdi->wb.task && have_dirty_io) { + /* + * Set the pending bit - if someone will try to + * unregister this bdi - it'll wait on this bit. + */ + set_bit(BDI_pending, &bdi->state); + action = FORK_THREAD; + break; + } + + spin_lock(&bdi->wb_lock); + + /* + * If there is no work to do and the bdi thread was + * inactive long enough - kill it. The wb_lock is taken + * to make sure no-one adds more work to this bdi and + * wakes the bdi thread up. + */ + if (bdi->wb.task && !have_dirty_io && + time_after(jiffies, bdi->wb.last_active + + bdi_longest_inactive())) { + task = bdi->wb.task; + bdi->wb.task = NULL; + spin_unlock(&bdi->wb_lock); + set_bit(BDI_pending, &bdi->state); + action = KILL_THREAD; + break; + } + spin_unlock(&bdi->wb_lock); + } + spin_unlock_bh(&bdi_lock); + + /* Keep working if default bdi still has things to do */ + if (!list_empty(&me->bdi->work_list)) + __set_current_state(TASK_RUNNING); + + switch (action) { + case FORK_THREAD: + __set_current_state(TASK_RUNNING); + task = kthread_create(bdi_writeback_thread, &bdi->wb, + "flush-%s", dev_name(bdi->dev)); + if (IS_ERR(task)) { + /* + * If thread creation fails, force writeout of + * the bdi from the thread. + */ + bdi_flush_io(bdi); + } else { + /* + * The spinlock makes sure we do not lose + * wake-ups when racing with 'bdi_queue_work()'. + * And as soon as the bdi thread is visible, we + * can start it. + */ + spin_lock_bh(&bdi->wb_lock); + bdi->wb.task = task; + spin_unlock_bh(&bdi->wb_lock); + wake_up_process(task); + } + break; + + case KILL_THREAD: + __set_current_state(TASK_RUNNING); + kthread_stop(task); + break; + + case NO_ACTION: + if (!wb_has_dirty_io(me) || !dirty_writeback_interval) + /* + * There are no dirty data. The only thing we + * should now care about is checking for + * inactive bdi threads and killing them. Thus, + * let's sleep for longer time, save energy and + * be friendly for battery-driven devices. + */ + schedule_timeout(bdi_longest_inactive()); + else + schedule_timeout(msecs_to_jiffies(dirty_writeback_interval * 10)); + try_to_freeze(); + /* Back to the main loop */ + continue; + } + + /* + * Clear pending bit and wakeup anybody waiting to tear us down. + */ + clear_bit(BDI_pending, &bdi->state); + smp_mb__after_clear_bit(); + wake_up_bit(&bdi->state, BDI_pending); + } + + return 0; +} + +/* + * Remove bdi from bdi_list, and ensure that it is no longer visible + */ +static void bdi_remove_from_list(struct backing_dev_info *bdi) +{ + spin_lock_bh(&bdi_lock); + list_del_rcu(&bdi->bdi_list); + spin_unlock_bh(&bdi_lock); + + synchronize_rcu(); +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) { va_list args; - int ret = 0; struct device *dev; if (bdi->dev) /* The driver needs to use separate queues per device */ - goto exit; + return 0; va_start(args, fmt); dev = device_create_vargs(bdi_class, parent, MKDEV(0, 0), bdi, fmt, args); va_end(args); - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - goto exit; - } + if (IS_ERR(dev)) + return PTR_ERR(dev); bdi->dev = dev; + + /* + * Just start the forker thread for our default backing_dev_info, + * and add other bdi's to the list. They will get a thread created + * on-demand when they need it. + */ + if (bdi_cap_flush_forker(bdi)) { + struct bdi_writeback *wb = &bdi->wb; + + wb->task = kthread_run(bdi_forker_thread, wb, "bdi-%s", + dev_name(dev)); + if (IS_ERR(wb->task)) + return PTR_ERR(wb->task); + } + bdi_debug_register(bdi, dev_name(dev)); + set_bit(BDI_registered, &bdi->state); -exit: - return ret; + spin_lock_bh(&bdi_lock); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); + + trace_writeback_bdi_register(bdi); + return 0; } EXPORT_SYMBOL(bdi_register); @@ -225,9 +563,61 @@ int bdi_register_dev(struct backing_dev_info *bdi, dev_t dev) } EXPORT_SYMBOL(bdi_register_dev); +/* + * Remove bdi from the global list and shutdown any threads we have running + */ +static void bdi_wb_shutdown(struct backing_dev_info *bdi) +{ + if (!bdi_cap_writeback_dirty(bdi)) + return; + + /* + * Make sure nobody finds us on the bdi_list anymore + */ + bdi_remove_from_list(bdi); + + /* + * If setup is pending, wait for that to complete first + */ + wait_on_bit(&bdi->state, BDI_pending, bdi_sched_wait, + TASK_UNINTERRUPTIBLE); + + /* + * Finally, kill the kernel thread. We don't need to be RCU + * safe anymore, since the bdi is gone from visibility. Force + * unfreeze of the thread before calling kthread_stop(), otherwise + * it would never exet if it is currently stuck in the refrigerator. + */ + if (bdi->wb.task) { + thaw_process(bdi->wb.task); + kthread_stop(bdi->wb.task); + } +} + +/* + * This bdi is going away now, make sure that no super_blocks point to it + */ +static void bdi_prune_sb(struct backing_dev_info *bdi) +{ + struct super_block *sb; + + spin_lock(&sb_lock); + list_for_each_entry(sb, &super_blocks, s_list) { + if (sb->s_bdi == bdi) + sb->s_bdi = NULL; + } + spin_unlock(&sb_lock); +} + void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + trace_writeback_bdi_unregister(bdi); + bdi_prune_sb(bdi); + del_timer_sync(&bdi->wb.wakeup_timer); + + if (!bdi_cap_flush_forker(bdi)) + bdi_wb_shutdown(bdi); bdi_debug_unregister(bdi); device_unregister(bdi->dev); bdi->dev = NULL; @@ -235,16 +625,32 @@ void bdi_unregister(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_unregister); +static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi) +{ + memset(wb, 0, sizeof(*wb)); + + wb->bdi = bdi; + wb->last_old_flush = jiffies; + INIT_LIST_HEAD(&wb->b_dirty); + INIT_LIST_HEAD(&wb->b_io); + INIT_LIST_HEAD(&wb->b_more_io); + setup_timer(&wb->wakeup_timer, wakeup_timer_fn, (unsigned long)bdi); +} + int bdi_init(struct backing_dev_info *bdi) { - int i; - int err; + int i, err; bdi->dev = NULL; bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = PROP_FRAC_BASE; + spin_lock_init(&bdi->wb_lock); + INIT_LIST_HEAD(&bdi->bdi_list); + INIT_LIST_HEAD(&bdi->work_list); + + bdi_wb_init(&bdi->wb, bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) { err = percpu_counter_init(&bdi->bdi_stat[i], 0); @@ -269,6 +675,20 @@ void bdi_destroy(struct backing_dev_info *bdi) { int i; + /* + * Splice our entries to the default_backing_dev_info, if this + * bdi disappears + */ + if (bdi_has_dirty_io(bdi)) { + struct bdi_writeback *dst = &default_backing_dev_info.wb; + + spin_lock(&inode_lock); + list_splice(&bdi->wb.b_dirty, &dst->b_dirty); + list_splice(&bdi->wb.b_io, &dst->b_io); + list_splice(&bdi->wb.b_more_io, &dst->b_more_io); + spin_unlock(&inode_lock); + } + bdi_unregister(bdi); for (i = 0; i < NR_BDI_STAT_ITEMS; i++) @@ -278,11 +698,38 @@ void bdi_destroy(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_destroy); +/* + * For use from filesystems to quickly init and register a bdi associated + * with dirty writeback + */ +int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, + unsigned int cap) +{ + char tmp[32]; + int err; + + bdi->name = name; + bdi->capabilities = cap; + err = bdi_init(bdi); + if (err) + return err; + + sprintf(tmp, "%.28s%s", name, "-%d"); + err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); + if (err) { + bdi_destroy(bdi); + return err; + } + + return 0; +} +EXPORT_SYMBOL(bdi_setup_and_register); + static wait_queue_head_t congestion_wqh[2] = { __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[0]), __WAIT_QUEUE_HEAD_INITIALIZER(congestion_wqh[1]) }; - +static atomic_t nr_bdi_congested[2]; void clear_bdi_congested(struct backing_dev_info *bdi, int sync) { @@ -290,7 +737,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync) wait_queue_head_t *wqh = &congestion_wqh[sync]; bit = sync ? BDI_sync_congested : BDI_async_congested; - clear_bit(bit, &bdi->state); + if (test_and_clear_bit(bit, &bdi->state)) + atomic_dec(&nr_bdi_congested[sync]); smp_mb__after_clear_bit(); if (waitqueue_active(wqh)) wake_up(wqh); @@ -302,29 +750,89 @@ void set_bdi_congested(struct backing_dev_info *bdi, int sync) enum bdi_state bit; bit = sync ? BDI_sync_congested : BDI_async_congested; - set_bit(bit, &bdi->state); + if (!test_and_set_bit(bit, &bdi->state)) + atomic_inc(&nr_bdi_congested[sync]); } EXPORT_SYMBOL(set_bdi_congested); /** * congestion_wait - wait for a backing_dev to become uncongested - * @rw: READ or WRITE + * @sync: SYNC or ASYNC IO * @timeout: timeout in jiffies * * Waits for up to @timeout jiffies for a backing_dev (any backing_dev) to exit * write congestion. If no backing_devs are congested then just wait for the * next write to be completed. */ -long congestion_wait(int rw, long timeout) +long congestion_wait(int sync, long timeout) { long ret; + unsigned long start = jiffies; DEFINE_WAIT(wait); - wait_queue_head_t *wqh = &congestion_wqh[rw]; + wait_queue_head_t *wqh = &congestion_wqh[sync]; prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); ret = io_schedule_timeout(timeout); finish_wait(wqh, &wait); + + trace_writeback_congestion_wait(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + return ret; } EXPORT_SYMBOL(congestion_wait); +/** + * wait_iff_congested - Conditionally wait for a backing_dev to become uncongested or a zone to complete writes + * @zone: A zone to check if it is heavily congested + * @sync: SYNC or ASYNC IO + * @timeout: timeout in jiffies + * + * In the event of a congested backing_dev (any backing_dev) and the given + * @zone has experienced recent congestion, this waits for up to @timeout + * jiffies for either a BDI to exit congestion of the given @sync queue + * or a write to complete. + * + * In the absense of zone congestion, cond_resched() is called to yield + * the processor if necessary but otherwise does not sleep. + * + * The return value is 0 if the sleep is for the full timeout. Otherwise, + * it is the number of jiffies that were still remaining when the function + * returned. return_value == timeout implies the function did not sleep. + */ +long wait_iff_congested(struct zone *zone, int sync, long timeout) +{ + long ret; + unsigned long start = jiffies; + DEFINE_WAIT(wait); + wait_queue_head_t *wqh = &congestion_wqh[sync]; + + /* + * If there is no congestion, or heavy congestion is not being + * encountered in the current zone, yield if necessary instead + * of sleeping on the congestion queue + */ + if (atomic_read(&nr_bdi_congested[sync]) == 0 || + !zone_is_reclaim_congested(zone)) { + cond_resched(); + + /* In case we scheduled, work out time remaining */ + ret = timeout - (jiffies - start); + if (ret < 0) + ret = 0; + + goto out; + } + + /* Sleep until uncongested or a write happens */ + prepare_to_wait(wqh, &wait, TASK_UNINTERRUPTIBLE); + ret = io_schedule_timeout(timeout); + finish_wait(wqh, &wait); + +out: + trace_writeback_wait_iff_congested(jiffies_to_usecs(timeout), + jiffies_to_usecs(jiffies - start)); + + return ret; +} +EXPORT_SYMBOL(wait_iff_congested); diff --git a/mm/bootmem.c b/mm/bootmem.c index d2a9ce9..13b0caa 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -10,8 +10,12 @@ */ #include <linux/init.h> #include <linux/pfn.h> +#include <linux/slab.h> #include <linux/bootmem.h> #include <linux/module.h> +#include <linux/kmemleak.h> +#include <linux/range.h> +#include <linux/memblock.h> #include <asm/bug.h> #include <asm/io.h> @@ -31,6 +35,7 @@ unsigned long max_pfn; unsigned long saved_max_pfn; #endif +#ifndef CONFIG_NO_BOOTMEM bootmem_data_t bootmem_node_data[MAX_NUMNODES] __initdata; static struct list_head bdata_list __initdata = LIST_HEAD_INIT(bdata_list); @@ -141,7 +146,78 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) min_low_pfn = start; return init_bootmem_core(NODE_DATA(0)->bdata, start, 0, pages); } +#endif +/* + * free_bootmem_late - free bootmem pages directly to page allocator + * @addr: starting address of the range + * @size: size of the range in bytes + * + * This is only useful when the bootmem allocator has already been torn + * down, but we are still initializing the system. Pages are given directly + * to the page allocator, no bootmem metadata is updated because it is gone. + */ +void __init free_bootmem_late(unsigned long addr, unsigned long size) +{ + unsigned long cursor, end; + + kmemleak_free_part(__va(addr), size); + + cursor = PFN_UP(addr); + end = PFN_DOWN(addr + size); + + for (; cursor < end; cursor++) { + __free_pages_bootmem(pfn_to_page(cursor), 0); + totalram_pages++; + } +} + +#ifdef CONFIG_NO_BOOTMEM +static void __init __free_pages_memory(unsigned long start, unsigned long end) +{ + int i; + unsigned long start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); + + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + return; + } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); +} + +unsigned long __init free_all_memory_core_early(int nodeid) +{ + int i; + u64 start, end; + unsigned long count = 0; + struct range *range = NULL; + int nr_range; + + nr_range = get_free_all_memory_range(&range, nodeid); + + for (i = 0; i < nr_range; i++) { + start = range[i].start; + end = range[i].end; + count += end - start; + __free_pages_memory(start, end); + } + + return count; +} +#else static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { int aligned; @@ -202,6 +278,7 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } +#endif /** * free_all_bootmem_node - release a node's free pages to the buddy allocator @@ -212,7 +289,12 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) { register_page_bootmem_info_node(pgdat); +#ifdef CONFIG_NO_BOOTMEM + /* free_all_memory_core_early(MAX_NUMNODES) will be called later */ + return 0; +#else return free_all_bootmem_core(pgdat->bdata); +#endif } /** @@ -222,9 +304,27 @@ unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { - return free_all_bootmem_core(NODE_DATA(0)->bdata); +#ifdef CONFIG_NO_BOOTMEM + /* + * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id + * because in some case like Node0 doesnt have RAM installed + * low ram will be on Node1 + * Use MAX_NUMNODES will make sure all ranges in early_node_map[] + * will be used instead of only Node0 related + */ + return free_all_memory_core_early(MAX_NUMNODES); +#else + unsigned long total_pages = 0; + bootmem_data_t *bdata; + + list_for_each_entry(bdata, &bdata_list, list) + total_pages += free_all_bootmem_core(bdata); + + return total_pages; +#endif } +#ifndef CONFIG_NO_BOOTMEM static void __init __free(bootmem_data_t *bdata, unsigned long sidx, unsigned long eidx) { @@ -319,6 +419,7 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, } BUG(); } +#endif /** * free_bootmem_node - mark a page range as usable @@ -333,12 +434,19 @@ static int __init mark_bootmem(unsigned long start, unsigned long end, void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + kmemleak_free_part(__va(physaddr), size); + memblock_x86_free_range(physaddr, physaddr + size); +#else unsigned long start, end; + kmemleak_free_part(__va(physaddr), size); + start = PFN_UP(physaddr); end = PFN_DOWN(physaddr + size); mark_bootmem_node(pgdat->bdata, start, end, 0, 0); +#endif } /** @@ -352,12 +460,19 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, */ void __init free_bootmem(unsigned long addr, unsigned long size) { +#ifdef CONFIG_NO_BOOTMEM + kmemleak_free_part(__va(addr), size); + memblock_x86_free_range(addr, addr + size); +#else unsigned long start, end; + kmemleak_free_part(__va(addr), size); + start = PFN_UP(addr); end = PFN_DOWN(addr + size); mark_bootmem(start, end, 0, 0); +#endif } /** @@ -374,12 +489,17 @@ void __init free_bootmem(unsigned long addr, unsigned long size) int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(physaddr); end = PFN_UP(physaddr + size); return mark_bootmem_node(pgdat->bdata, start, end, 1, flags); +#endif } /** @@ -395,16 +515,28 @@ int __init reserve_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, int __init reserve_bootmem(unsigned long addr, unsigned long size, int flags) { +#ifdef CONFIG_NO_BOOTMEM + panic("no bootmem"); + return 0; +#else unsigned long start, end; start = PFN_DOWN(addr); end = PFN_UP(addr + size); return mark_bootmem(start, end, 1, flags); +#endif +} + +#ifndef CONFIG_NO_BOOTMEM +int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len, + int flags) +{ + return reserve_bootmem(phys, len, flags); } -static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, - unsigned long step) +static unsigned long __init align_idx(struct bootmem_data *bdata, + unsigned long idx, unsigned long step) { unsigned long base = bdata->node_min_pfn; @@ -416,8 +548,8 @@ static unsigned long align_idx(struct bootmem_data *bdata, unsigned long idx, return ALIGN(base + idx, step) - base; } -static unsigned long align_off(struct bootmem_data *bdata, unsigned long off, - unsigned long align) +static unsigned long __init align_off(struct bootmem_data *bdata, + unsigned long off, unsigned long align) { unsigned long base = PFN_PHYS(bdata->node_min_pfn); @@ -516,6 +648,11 @@ find_block: region = phys_to_virt(PFN_PHYS(bdata->node_min_pfn) + start_off); memset(region, 0, size); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(region, size, 0, 0); return region; } @@ -548,12 +685,33 @@ static void * __init alloc_arch_preferred_bootmem(bootmem_data_t *bdata, #endif return NULL; } +#endif static void * __init ___alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { +#ifdef CONFIG_NO_BOOTMEM + void *ptr; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc(size, GFP_NOWAIT); + +restart: + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, goal, limit); + + if (ptr) + return ptr; + + if (goal != 0) { + goal = 0; + goto restart; + } + + return NULL; +#else bootmem_data_t *bdata; void *region; @@ -579,6 +737,7 @@ restart: } return NULL; +#endif } /** @@ -597,7 +756,13 @@ restart: void * __init __alloc_bootmem_nopanic(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem_nopanic(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem_nopanic(size, align, goal, limit); } static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, @@ -631,9 +796,16 @@ static void * __init ___alloc_bootmem(unsigned long size, unsigned long align, void * __init __alloc_bootmem(unsigned long size, unsigned long align, unsigned long goal) { - return ___alloc_bootmem(size, align, goal, 0); + unsigned long limit = 0; + +#ifdef CONFIG_NO_BOOTMEM + limit = -1UL; +#endif + + return ___alloc_bootmem(size, align, goal, limit); } +#ifndef CONFIG_NO_BOOTMEM static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) @@ -650,6 +822,7 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, return ___alloc_bootmem(size, align, goal, limit); } +#endif /** * __alloc_bootmem_node - allocate boot memory from a specific node @@ -669,10 +842,58 @@ static void * __init ___alloc_bootmem_node(bootmem_data_t *bdata, void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + void *ptr; + if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); + if (ptr) + return ptr; + + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, -1ULL); +#else + ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, goal, 0); +#endif + + return ptr; +} + +void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, + unsigned long align, unsigned long goal) +{ +#ifdef MAX_DMA32_PFN + unsigned long end_pfn; + + if (WARN_ON_ONCE(slab_is_available())) + return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); + + /* update goal according ...MAX_DMA32_PFN */ + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; + + if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && + (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { + void *ptr; + unsigned long new_goal; + + new_goal = MAX_DMA32_PFN << PAGE_SHIFT; +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + new_goal, -1ULL); +#else + ptr = alloc_bootmem_core(pgdat->bdata, size, align, + new_goal, 0); +#endif + if (ptr) + return ptr; + } +#endif + + return __alloc_bootmem_node(pgdat, size, align, goal); + } #ifdef CONFIG_SPARSEMEM @@ -686,6 +907,16 @@ void * __init __alloc_bootmem_node(pg_data_t *pgdat, unsigned long size, void * __init alloc_bootmem_section(unsigned long size, unsigned long section_nr) { +#ifdef CONFIG_NO_BOOTMEM + unsigned long pfn, goal, limit; + + pfn = section_nr_to_pfn(section_nr); + goal = pfn << PAGE_SHIFT; + limit = section_nr_to_pfn(section_nr + 1) << PAGE_SHIFT; + + return __alloc_memory_core_early(early_pfn_to_nid(pfn), size, + SMP_CACHE_BYTES, goal, limit); +#else bootmem_data_t *bdata; unsigned long pfn, goal, limit; @@ -695,6 +926,7 @@ void * __init alloc_bootmem_section(unsigned long size, bdata = &bootmem_node_data[early_pfn_to_nid(pfn)]; return alloc_bootmem_core(bdata, size, SMP_CACHE_BYTES, goal, limit); +#endif } #endif @@ -706,11 +938,16 @@ void * __init __alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, + goal, -1ULL); +#else ptr = alloc_arch_preferred_bootmem(pgdat->bdata, size, align, goal, 0); if (ptr) return ptr; ptr = alloc_bootmem_core(pgdat->bdata, size, align, goal, 0); +#endif if (ptr) return ptr; @@ -758,9 +995,21 @@ void * __init __alloc_bootmem_low(unsigned long size, unsigned long align, void * __init __alloc_bootmem_low_node(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal) { + void *ptr; + if (WARN_ON_ONCE(slab_is_available())) return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); - return ___alloc_bootmem_node(pgdat->bdata, size, align, +#ifdef CONFIG_NO_BOOTMEM + ptr = __alloc_memory_core_early(pgdat->node_id, size, align, goal, ARCH_LOW_ADDRESS_LIMIT); + if (ptr) + return ptr; + ptr = __alloc_memory_core_early(MAX_NUMNODES, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +#else + ptr = ___alloc_bootmem_node(pgdat->bdata, size, align, + goal, ARCH_LOW_ADDRESS_LIMIT); +#endif + return ptr; } diff --git a/mm/bounce.c b/mm/bounce.c index a2b76a5..1481de6 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -6,6 +6,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/swap.h> +#include <linux/gfp.h> #include <linux/bio.h> #include <linux/pagemap.h> #include <linux/mempool.h> @@ -115,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) */ vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; - flush_dcache_page(tovec->bv_page); bounce_copy_vec(tovec, vfrom); + flush_dcache_page(tovec->bv_page); } } diff --git a/mm/compaction.c b/mm/compaction.c new file mode 100644 index 0000000..1a8894e --- /dev/null +++ b/mm/compaction.c @@ -0,0 +1,605 @@ +/* + * linux/mm/compaction.c + * + * Memory compaction for the reduction of external fragmentation. Note that + * this heavily depends upon page migration to do all the real heavy + * lifting + * + * Copyright IBM Corp. 2007-2010 Mel Gorman <mel@csn.ul.ie> + */ +#include <linux/swap.h> +#include <linux/migrate.h> +#include <linux/compaction.h> +#include <linux/mm_inline.h> +#include <linux/backing-dev.h> +#include <linux/sysctl.h> +#include <linux/sysfs.h> +#include "internal.h" + +/* + * compact_control is used to track pages being migrated and the free pages + * they are being migrated to during memory compaction. The free_pfn starts + * at the end of a zone and migrate_pfn begins at the start. Movable pages + * are moved to the end of a zone during a compaction run and the run + * completes when free_pfn <= migrate_pfn + */ +struct compact_control { + struct list_head freepages; /* List of free pages to migrate to */ + struct list_head migratepages; /* List of pages being migrated */ + unsigned long nr_freepages; /* Number of isolated free pages */ + unsigned long nr_migratepages; /* Number of pages to migrate */ + unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long migrate_pfn; /* isolate_migratepages search base */ + + /* Account for isolated anon and file pages */ + unsigned long nr_anon; + unsigned long nr_file; + + unsigned int order; /* order a direct compactor needs */ + int migratetype; /* MOVABLE, RECLAIMABLE etc */ + struct zone *zone; +}; + +static unsigned long release_freepages(struct list_head *freelist) +{ + struct page *page, *next; + unsigned long count = 0; + + list_for_each_entry_safe(page, next, freelist, lru) { + list_del(&page->lru); + __free_page(page); + count++; + } + + return count; +} + +/* Isolate free pages onto a private freelist. Must hold zone->lock */ +static unsigned long isolate_freepages_block(struct zone *zone, + unsigned long blockpfn, + struct list_head *freelist) +{ + unsigned long zone_end_pfn, end_pfn; + int total_isolated = 0; + struct page *cursor; + + /* Get the last PFN we should scan for free pages at */ + zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; + end_pfn = min(blockpfn + pageblock_nr_pages, zone_end_pfn); + + /* Find the first usable PFN in the block to initialse page cursor */ + for (; blockpfn < end_pfn; blockpfn++) { + if (pfn_valid_within(blockpfn)) + break; + } + cursor = pfn_to_page(blockpfn); + + /* Isolate free pages. This assumes the block is valid */ + for (; blockpfn < end_pfn; blockpfn++, cursor++) { + int isolated, i; + struct page *page = cursor; + + if (!pfn_valid_within(blockpfn)) + continue; + + if (!PageBuddy(page)) + continue; + + /* Found a free page, break it into order-0 pages */ + isolated = split_free_page(page); + total_isolated += isolated; + for (i = 0; i < isolated; i++) { + list_add(&page->lru, freelist); + page++; + } + + /* If a page was split, advance to the end of it */ + if (isolated) { + blockpfn += isolated - 1; + cursor += isolated - 1; + } + } + + return total_isolated; +} + +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) +{ + + int migratetype = get_pageblock_migratetype(page); + + /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ + if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) + return false; + + /* If the page is a large free page, then allow migration */ + if (PageBuddy(page) && page_order(page) >= pageblock_order) + return true; + + /* If the block is MIGRATE_MOVABLE, allow migration */ + if (migratetype == MIGRATE_MOVABLE) + return true; + + /* Otherwise skip the block */ + return false; +} + +/* + * Based on information in the current compact_control, find blocks + * suitable for isolating free pages from and then isolate them. + */ +static void isolate_freepages(struct zone *zone, + struct compact_control *cc) +{ + struct page *page; + unsigned long high_pfn, low_pfn, pfn; + unsigned long flags; + int nr_freepages = cc->nr_freepages; + struct list_head *freelist = &cc->freepages; + + pfn = cc->free_pfn; + low_pfn = cc->migrate_pfn + pageblock_nr_pages; + high_pfn = low_pfn; + + /* + * Isolate free pages until enough are available to migrate the + * pages on cc->migratepages. We stop searching if the migrate + * and free page scanners meet or enough free pages are isolated. + */ + spin_lock_irqsave(&zone->lock, flags); + for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; + pfn -= pageblock_nr_pages) { + unsigned long isolated; + + if (!pfn_valid(pfn)) + continue; + + /* + * Check for overlapping nodes/zones. It's possible on some + * configurations to have a setup like + * node0 node1 node0 + * i.e. it's possible that all pages within a zones range of + * pages do not belong to a single zone. + */ + page = pfn_to_page(pfn); + if (page_zone(page) != zone) + continue; + + /* Check the block is suitable for migration */ + if (!suitable_migration_target(page)) + continue; + + /* Found a block suitable for isolating free pages from */ + isolated = isolate_freepages_block(zone, pfn, freelist); + nr_freepages += isolated; + + /* + * Record the highest PFN we isolated pages from. When next + * looking for free pages, the search will restart here as + * page migration may have returned some pages to the allocator + */ + if (isolated) + high_pfn = max(high_pfn, pfn); + } + spin_unlock_irqrestore(&zone->lock, flags); + + /* split_free_page does not map the pages */ + list_for_each_entry(page, freelist, lru) { + arch_alloc_page(page, 0); + kernel_map_pages(page, 1, 1); + } + + cc->free_pfn = high_pfn; + cc->nr_freepages = nr_freepages; +} + +/* Update the number of anon and file isolated pages in the zone */ +static void acct_isolated(struct zone *zone, struct compact_control *cc) +{ + struct page *page; + unsigned int count[NR_LRU_LISTS] = { 0, }; + + list_for_each_entry(page, &cc->migratepages, lru) { + int lru = page_lru_base_type(page); + count[lru]++; + } + + cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file); +} + +/* Similar to reclaim, but different enough that they don't share logic */ +static bool too_many_isolated(struct zone *zone) +{ + unsigned long active, inactive, isolated; + + inactive = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_ANON); + active = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_FILE) + + zone_page_state(zone, NR_ISOLATED_ANON); + + return isolated > (inactive + active) / 2; +} + +/* + * Isolate all pages that can be migrated from the block pointed to by + * the migrate scanner within compact_control. + */ +static unsigned long isolate_migratepages(struct zone *zone, + struct compact_control *cc) +{ + unsigned long low_pfn, end_pfn; + struct list_head *migratelist = &cc->migratepages; + + /* Do not scan outside zone boundaries */ + low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn); + + /* Only scan within a pageblock boundary */ + end_pfn = ALIGN(low_pfn + pageblock_nr_pages, pageblock_nr_pages); + + /* Do not cross the free scanner or scan within a memory hole */ + if (end_pfn > cc->free_pfn || !pfn_valid(low_pfn)) { + cc->migrate_pfn = end_pfn; + return 0; + } + + /* + * Ensure that there are not too many pages isolated from the LRU + * list by either parallel reclaimers or compaction. If there are, + * delay for some time until fewer pages are isolated + */ + while (unlikely(too_many_isolated(zone))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + if (fatal_signal_pending(current)) + return 0; + } + + /* Time to isolate some pages for migration */ + spin_lock_irq(&zone->lru_lock); + for (; low_pfn < end_pfn; low_pfn++) { + struct page *page; + if (!pfn_valid_within(low_pfn)) + continue; + + /* Get the page and skip if free */ + page = pfn_to_page(low_pfn); + if (PageBuddy(page)) + continue; + + /* Try isolate the page */ + if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) + continue; + + /* Successfully isolated */ + del_page_from_lru_list(zone, page, page_lru(page)); + list_add(&page->lru, migratelist); + cc->nr_migratepages++; + + /* Avoid isolating too much */ + if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) + break; + } + + acct_isolated(zone, cc); + + spin_unlock_irq(&zone->lru_lock); + cc->migrate_pfn = low_pfn; + + return cc->nr_migratepages; +} + +/* + * This is a migrate-callback that "allocates" freepages by taking pages + * from the isolated freelists in the block we are migrating to. + */ +static struct page *compaction_alloc(struct page *migratepage, + unsigned long data, + int **result) +{ + struct compact_control *cc = (struct compact_control *)data; + struct page *freepage; + + /* Isolate free pages if necessary */ + if (list_empty(&cc->freepages)) { + isolate_freepages(cc->zone, cc); + + if (list_empty(&cc->freepages)) + return NULL; + } + + freepage = list_entry(cc->freepages.next, struct page, lru); + list_del(&freepage->lru); + cc->nr_freepages--; + + return freepage; +} + +/* + * We cannot control nr_migratepages and nr_freepages fully when migration is + * running as migrate_pages() has no knowledge of compact_control. When + * migration is complete, we count the number of pages on the lists by hand. + */ +static void update_nr_listpages(struct compact_control *cc) +{ + int nr_migratepages = 0; + int nr_freepages = 0; + struct page *page; + + list_for_each_entry(page, &cc->migratepages, lru) + nr_migratepages++; + list_for_each_entry(page, &cc->freepages, lru) + nr_freepages++; + + cc->nr_migratepages = nr_migratepages; + cc->nr_freepages = nr_freepages; +} + +static int compact_finished(struct zone *zone, + struct compact_control *cc) +{ + unsigned int order; + unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + + if (fatal_signal_pending(current)) + return COMPACT_PARTIAL; + + /* Compaction run completes if the migrate and free scanner meet */ + if (cc->free_pfn <= cc->migrate_pfn) + return COMPACT_COMPLETE; + + /* Compaction run is not finished if the watermark is not met */ + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) + return COMPACT_CONTINUE; + + if (cc->order == -1) + return COMPACT_CONTINUE; + + /* Direct compactor: Is a suitable page free? */ + for (order = cc->order; order < MAX_ORDER; order++) { + /* Job done if page is free of the right migratetype */ + if (!list_empty(&zone->free_area[order].free_list[cc->migratetype])) + return COMPACT_PARTIAL; + + /* Job done if allocation would set block type */ + if (order >= pageblock_order && zone->free_area[order].nr_free) + return COMPACT_PARTIAL; + } + + return COMPACT_CONTINUE; +} + +static int compact_zone(struct zone *zone, struct compact_control *cc) +{ + int ret; + + /* Setup to move all movable pages to the end of the zone */ + cc->migrate_pfn = zone->zone_start_pfn; + cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; + cc->free_pfn &= ~(pageblock_nr_pages-1); + + migrate_prep_local(); + + while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { + unsigned long nr_migrate, nr_remaining; + + if (!isolate_migratepages(zone, cc)) + continue; + + nr_migrate = cc->nr_migratepages; + migrate_pages(&cc->migratepages, compaction_alloc, + (unsigned long)cc, 0); + update_nr_listpages(cc); + nr_remaining = cc->nr_migratepages; + + count_vm_event(COMPACTBLOCKS); + count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); + if (nr_remaining) + count_vm_events(COMPACTPAGEFAILED, nr_remaining); + + /* Release LRU pages not migrated */ + if (!list_empty(&cc->migratepages)) { + putback_lru_pages(&cc->migratepages); + cc->nr_migratepages = 0; + } + + } + + /* Release free pages and check accounting */ + cc->nr_freepages -= release_freepages(&cc->freepages); + VM_BUG_ON(cc->nr_freepages != 0); + + return ret; +} + +static unsigned long compact_zone_order(struct zone *zone, + int order, gfp_t gfp_mask) +{ + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + .order = order, + .migratetype = allocflags_to_migratetype(gfp_mask), + .zone = zone, + }; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + return compact_zone(zone, &cc); +} + +int sysctl_extfrag_threshold = 500; + +/** + * try_to_compact_pages - Direct compact to satisfy a high-order allocation + * @zonelist: The zonelist used for the current allocation + * @order: The order of the current allocation + * @gfp_mask: The GFP mask of the current allocation + * @nodemask: The allowed nodes to allocate from + * + * This is the main entry point for direct page compaction. + */ +unsigned long try_to_compact_pages(struct zonelist *zonelist, + int order, gfp_t gfp_mask, nodemask_t *nodemask) +{ + enum zone_type high_zoneidx = gfp_zone(gfp_mask); + int may_enter_fs = gfp_mask & __GFP_FS; + int may_perform_io = gfp_mask & __GFP_IO; + unsigned long watermark; + struct zoneref *z; + struct zone *zone; + int rc = COMPACT_SKIPPED; + + /* + * Check whether it is worth even starting compaction. The order check is + * made because an assumption is made that the page allocator can satisfy + * the "cheaper" orders without taking special steps + */ + if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + return rc; + + count_vm_event(COMPACTSTALL); + + /* Compact each zone in the list */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + nodemask) { + int fragindex; + int status; + + /* + * Watermarks for order-0 must be met for compaction. Note + * the 2UL. This is because during migration, copies of + * pages need to be allocated and for a short time, the + * footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + continue; + + /* + * fragmentation index determines if allocation failures are + * due to low memory or external fragmentation + * + * index of -1 implies allocations might succeed depending + * on watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + continue; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { + rc = COMPACT_PARTIAL; + break; + } + + status = compact_zone_order(zone, order, gfp_mask); + rc = max(status, rc); + + if (zone_watermark_ok(zone, order, watermark, 0, 0)) + break; + } + + return rc; +} + + +/* Compact all zones within a node */ +static int compact_node(int nid) +{ + int zoneid; + pg_data_t *pgdat; + struct zone *zone; + + if (nid < 0 || nid >= nr_node_ids || !node_online(nid)) + return -EINVAL; + pgdat = NODE_DATA(nid); + + /* Flush pending updates to the LRU lists */ + lru_add_drain_all(); + + for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) { + struct compact_control cc = { + .nr_freepages = 0, + .nr_migratepages = 0, + .order = -1, + }; + + zone = &pgdat->node_zones[zoneid]; + if (!populated_zone(zone)) + continue; + + cc.zone = zone; + INIT_LIST_HEAD(&cc.freepages); + INIT_LIST_HEAD(&cc.migratepages); + + compact_zone(zone, &cc); + + VM_BUG_ON(!list_empty(&cc.freepages)); + VM_BUG_ON(!list_empty(&cc.migratepages)); + } + + return 0; +} + +/* Compact all nodes in the system */ +static int compact_nodes(void) +{ + int nid; + + for_each_online_node(nid) + compact_node(nid); + + return COMPACT_COMPLETE; +} + +/* The written value is actually unused, all memory is compacted */ +int sysctl_compact_memory; + +/* This is the entry point for compacting all nodes via /proc/sys/vm */ +int sysctl_compaction_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + if (write) + return compact_nodes(); + + return 0; +} + +int sysctl_extfrag_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + proc_dointvec_minmax(table, write, buffer, length, ppos); + + return 0; +} + +#if defined(CONFIG_SYSFS) && defined(CONFIG_NUMA) +ssize_t sysfs_compact_node(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + compact_node(dev->id); + + return count; +} +static SYSDEV_ATTR(compact, S_IWUSR, NULL, sysfs_compact_node); + +int compaction_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_compact); +} + +void compaction_unregister_node(struct node *node) +{ + return sysdev_remove_file(&node->sysdev, &attr_compact); +} +#endif /* CONFIG_SYSFS && CONFIG_NUMA */ diff --git a/mm/dmapool.c b/mm/dmapool.c index b1f0885..4df2de7 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -86,10 +86,12 @@ show_pools(struct device *dev, struct device_attribute *attr, char *buf) unsigned pages = 0; unsigned blocks = 0; + spin_lock_irq(&pool->lock); list_for_each_entry(page, &pool->page_list, page_list) { pages++; blocks += page->in_use; } + spin_unlock_irq(&pool->lock); /* per-pool info, no real statistics yet */ temp = scnprintf(next, size, "%-16s %4u %4Zu %4Zu %2u\n", @@ -309,6 +311,8 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, size_t offset; void *retval; + might_sleep_if(mem_flags & __GFP_WAIT); + spin_lock_irqsave(&pool->lock, flags); restart: list_for_each_entry(page, &pool->page_list, page_list) { diff --git a/mm/fadvise.c b/mm/fadvise.c index e433592..8d723c9 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -77,12 +77,20 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) switch (advice) { case POSIX_FADV_NORMAL: file->f_ra.ra_pages = bdi->ra_pages; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_RANDOM: - file->f_ra.ra_pages = 0; + spin_lock(&file->f_lock); + file->f_mode |= FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_SEQUENTIAL: file->f_ra.ra_pages = bdi->ra_pages * 2; + spin_lock(&file->f_lock); + file->f_mode &= ~FMODE_RANDOM; + spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: if (!mapping->a_ops->readpage) { diff --git a/mm/failslab.c b/mm/failslab.c index 9339de5..c5f88f2 100644 --- a/mm/failslab.c +++ b/mm/failslab.c @@ -1,18 +1,21 @@ #include <linux/fault-inject.h> -#include <linux/gfp.h> +#include <linux/slab.h> static struct { struct fault_attr attr; u32 ignore_gfp_wait; + int cache_filter; #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS struct dentry *ignore_gfp_wait_file; + struct dentry *cache_filter_file; #endif } failslab = { .attr = FAULT_ATTR_INITIALIZER, .ignore_gfp_wait = 1, + .cache_filter = 0, }; -bool should_failslab(size_t size, gfp_t gfpflags) +bool should_failslab(size_t size, gfp_t gfpflags, unsigned long cache_flags) { if (gfpflags & __GFP_NOFAIL) return false; @@ -20,6 +23,9 @@ bool should_failslab(size_t size, gfp_t gfpflags) if (failslab.ignore_gfp_wait && (gfpflags & __GFP_WAIT)) return false; + if (failslab.cache_filter && !(cache_flags & SLAB_FAILSLAB)) + return false; + return should_fail(&failslab.attr, size); } @@ -30,7 +36,6 @@ static int __init setup_failslab(char *str) __setup("failslab=", setup_failslab); #ifdef CONFIG_FAULT_INJECTION_DEBUG_FS - static int __init failslab_debugfs_init(void) { mode_t mode = S_IFREG | S_IRUSR | S_IWUSR; @@ -46,8 +51,14 @@ static int __init failslab_debugfs_init(void) debugfs_create_bool("ignore-gfp-wait", mode, dir, &failslab.ignore_gfp_wait); - if (!failslab.ignore_gfp_wait_file) { + failslab.cache_filter_file = + debugfs_create_bool("cache-filter", mode, dir, + &failslab.cache_filter); + + if (!failslab.ignore_gfp_wait_file || + !failslab.cache_filter_file) { err = -ENOMEM; + debugfs_remove(failslab.cache_filter_file); debugfs_remove(failslab.ignore_gfp_wait_file); cleanup_fault_attr_dentries(&failslab.attr); } diff --git a/mm/filemap.c b/mm/filemap.c index 2239671..ca38939 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -10,13 +10,13 @@ * the NFS filesystem used to do this differently, for example) */ #include <linux/module.h> -#include <linux/slab.h> #include <linux/compiler.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/aio.h> #include <linux/capability.h> #include <linux/kernel_stat.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/mman.h> @@ -39,11 +39,10 @@ /* * FIXME: remove all knowledge of the buffer layer from the core VM */ -#include <linux/buffer_head.h> /* for generic_osync_inode */ +#include <linux/buffer_head.h> /* for try_to_free_buffers */ #include <asm/mman.h> - /* * Shared mappings implemented 30.11.1994. It's not fully working yet, * though. @@ -59,7 +58,7 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_lock (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock @@ -103,8 +102,9 @@ * ->inode_lock (zap_pte_range->set_page_dirty) * ->private_lock (zap_pte_range->__set_page_dirty_buffers) * - * ->task->proc_lock - * ->dcache_lock (proc_pid_lookup) + * (code doesn't rely on that order, so you could switch it around) + * ->tasklist_lock (memory_failure, collect_procs_ao) + * ->i_mmap_lock */ /* @@ -120,6 +120,8 @@ void __remove_from_page_cache(struct page *page) page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __dec_zone_page_state(page, NR_SHMEM); BUG_ON(page_mapped(page)); /* @@ -138,14 +140,20 @@ void __remove_from_page_cache(struct page *page) void remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; + void (*freepage)(struct page *); BUG_ON(!PageLocked(page)); + freepage = mapping->a_ops->freepage; spin_lock_irq(&mapping->tree_lock); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage) + freepage(page); } +EXPORT_SYMBOL(remove_from_page_cache); static int sync_page(void *word) { @@ -255,27 +263,27 @@ int filemap_flush(struct address_space *mapping) EXPORT_SYMBOL(filemap_flush); /** - * wait_on_page_writeback_range - wait for writeback to complete - * @mapping: target address_space - * @start: beginning page index - * @end: ending page index + * filemap_fdatawait_range - wait for writeback to complete + * @mapping: address space structure to wait for + * @start_byte: offset in bytes where the range starts + * @end_byte: offset in bytes where the range ends (inclusive) * - * Wait for writeback to complete against pages indexed by start->end - * inclusive + * Walk the list of under-writeback pages of the given address space + * in the given range and wait for all of them. */ -int wait_on_page_writeback_range(struct address_space *mapping, - pgoff_t start, pgoff_t end) +int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, + loff_t end_byte) { + pgoff_t index = start_byte >> PAGE_CACHE_SHIFT; + pgoff_t end = end_byte >> PAGE_CACHE_SHIFT; struct pagevec pvec; int nr_pages; int ret = 0; - pgoff_t index; - if (end < start) + if (end_byte < start_byte) return 0; pagevec_init(&pvec, 0); - index = start; while ((index <= end) && (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_WRITEBACK, @@ -305,70 +313,7 @@ int wait_on_page_writeback_range(struct address_space *mapping, return ret; } - -/** - * sync_page_range - write and wait on all pages in the passed range - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Write and wait upon all the pages in the passed range. This is a "data - * integrity" operation. It waits upon in-flight writeout before starting and - * waiting upon new writeout. If there was an IO error, return it. - * - * We need to re-take i_mutex during the generic_osync_inode list walk because - * it is otherwise livelockable. - */ -int sync_page_range(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) { - mutex_lock(&inode->i_mutex); - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - mutex_unlock(&inode->i_mutex); - } - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range); - -/** - * sync_page_range_nolock - write & wait on all pages in the passed range without locking - * @inode: target inode - * @mapping: target address_space - * @pos: beginning offset in pages to write - * @count: number of bytes to write - * - * Note: Holding i_mutex across sync_page_range_nolock() is not a good idea - * as it forces O_SYNC writers to different parts of the same file - * to be serialised right until io completion. - */ -int sync_page_range_nolock(struct inode *inode, struct address_space *mapping, - loff_t pos, loff_t count) -{ - pgoff_t start = pos >> PAGE_CACHE_SHIFT; - pgoff_t end = (pos + count - 1) >> PAGE_CACHE_SHIFT; - int ret; - - if (!mapping_cap_writeback_dirty(mapping) || !count) - return 0; - ret = filemap_fdatawrite_range(mapping, pos, pos + count - 1); - if (ret == 0) - ret = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (ret == 0) - ret = wait_on_page_writeback_range(mapping, start, end); - return ret; -} -EXPORT_SYMBOL(sync_page_range_nolock); +EXPORT_SYMBOL(filemap_fdatawait_range); /** * filemap_fdatawait - wait for all under-writeback pages to complete @@ -384,8 +329,7 @@ int filemap_fdatawait(struct address_space *mapping) if (i_size == 0) return 0; - return wait_on_page_writeback_range(mapping, 0, - (i_size - 1) >> PAGE_CACHE_SHIFT); + return filemap_fdatawait_range(mapping, 0, i_size - 1); } EXPORT_SYMBOL(filemap_fdatawait); @@ -432,9 +376,8 @@ int filemap_write_and_wait_range(struct address_space *mapping, WB_SYNC_ALL); /* See comment of filemap_write_and_wait() */ if (err != -EIO) { - int err2 = wait_on_page_writeback_range(mapping, - lstart >> PAGE_CACHE_SHIFT, - lend >> PAGE_CACHE_SHIFT); + int err2 = filemap_fdatawait_range(mapping, + lstart, lend); if (!err) err = err2; } @@ -476,6 +419,8 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, if (likely(!error)) { mapping->nrpages++; __inc_zone_page_state(page, NR_FILE_PAGES); + if (PageSwapBacked(page)) + __inc_zone_page_state(page, NR_SHMEM); spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; @@ -499,7 +444,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, /* * Splice_read and readahead add shmem/tmpfs pages into the page cache * before shmem_readpage has a chance to mark them as SwapBacked: they - * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * need to go on the anon lru below, and mem_cgroup_cache_charge * (called in add_to_page_cache) needs to know where they're going too. */ if (mapping_cap_swap_backed(mapping)) @@ -510,7 +455,7 @@ int add_to_page_cache_lru(struct page *page, struct address_space *mapping, if (page_is_file_cache(page)) lru_cache_add_file(page); else - lru_cache_add_active_anon(page); + lru_cache_add_anon(page); } return ret; } @@ -519,9 +464,15 @@ EXPORT_SYMBOL_GPL(add_to_page_cache_lru); #ifdef CONFIG_NUMA struct page *__page_cache_alloc(gfp_t gfp) { + int n; + struct page *page; + if (cpuset_do_page_mem_spread()) { - int n = cpuset_mem_spread_node(); - return alloc_pages_exact_node(n, gfp, 0); + get_mems_allowed(); + n = cpuset_mem_spread_node(); + page = alloc_pages_exact_node(n, gfp, 0); + put_mems_allowed(); + return page; } return alloc_pages(gfp, 0); } @@ -663,6 +614,19 @@ void __lock_page_nosync(struct page *page) TASK_UNINTERRUPTIBLE); } +int __lock_page_or_retry(struct page *page, struct mm_struct *mm, + unsigned int flags) +{ + if (!(flags & FAULT_FLAG_ALLOW_RETRY)) { + __lock_page(page); + return 1; + } else { + up_read(&mm->mmap_sem); + wait_on_page_locked(page); + return 0; + } +} + /** * find_get_page - find and get a page reference * @mapping: the address_space to search @@ -682,7 +646,9 @@ repeat: pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); if (pagep) { page = radix_tree_deref_slot(pagep); - if (unlikely(!page || page == RADIX_TREE_RETRY)) + if (unlikely(!page)) + goto out; + if (radix_tree_deref_retry(page)) goto repeat; if (!page_cache_get_speculative(page)) @@ -698,6 +664,7 @@ repeat: goto repeat; } } +out: rcu_read_unlock(); return page; @@ -815,12 +782,11 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) { + if (ret) + start = pages[ret-1]->index; goto restart; + } if (!page_cache_get_speculative(page)) goto repeat; @@ -868,11 +834,7 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) goto restart; if (page->mapping == NULL || page->index != index) @@ -925,11 +887,7 @@ repeat: page = radix_tree_deref_slot((void **)pages[i]); if (unlikely(!page)) continue; - /* - * this can only trigger if nr_found == 1, making livelock - * a non issue. - */ - if (unlikely(page == RADIX_TREE_RETRY)) + if (radix_tree_deref_retry(page)) goto restart; if (!page_cache_get_speculative(page)) @@ -1067,6 +1025,9 @@ find_page: goto page_not_up_to_date; if (!trylock_page(page)) goto page_not_up_to_date; + /* Did it get truncated before we got the lock? */ + if (!page->mapping) + goto page_not_up_to_date_locked; if (!mapping->a_ops->is_partially_uptodate(page, desc, offset)) goto page_not_up_to_date_locked; @@ -1157,6 +1118,12 @@ page_not_up_to_date_locked: } readpage: + /* + * A previous I/O error may have been due to temporary + * failures, eg. multipath errors. + * PG_error will be set again if readpage fails. + */ + ClearPageError(page); /* Start the actual read. The read will unlock the page. */ error = mapping->a_ops->readpage(filp, page); @@ -1175,7 +1142,7 @@ readpage: if (!PageUptodate(page)) { if (page->mapping == NULL) { /* - * invalidate_inode_pages got it + * invalidate_mapping_pages got it */ unlock_page(page); page_cache_release(page); @@ -1321,7 +1288,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, { struct file *filp = iocb->ki_filp; ssize_t retval; - unsigned long seg; + unsigned long seg = 0; size_t count; loff_t *ppos = &iocb->ki_pos; @@ -1348,21 +1315,47 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, retval = mapping->a_ops->direct_IO(READ, iocb, iov, pos, nr_segs); } - if (retval > 0) + if (retval > 0) { *ppos = pos + retval; - if (retval) { + count -= retval; + } + + /* + * Btrfs can have a short DIO read if we encounter + * compressed extents, so if there was an error, or if + * we've already read everything we wanted to, or if + * there was a short read because we hit EOF, go ahead + * and return. Otherwise fallthrough to buffered io for + * the rest of the read. + */ + if (retval < 0 || !count || *ppos >= size) { file_accessed(filp); goto out; } } } + count = retval; for (seg = 0; seg < nr_segs; seg++) { read_descriptor_t desc; + loff_t offset = 0; + + /* + * If we did a short DIO read we need to skip the section of the + * iov that we've already read data into. + */ + if (count) { + if (count > iov[seg].iov_len) { + count -= iov[seg].iov_len; + continue; + } + offset = count; + count = 0; + } desc.written = 0; - desc.arg.buf = iov[seg].iov_base; - desc.count = iov[seg].iov_len; + desc.arg.buf = iov[seg].iov_base + offset; + desc.count = iov[seg].iov_len - offset; if (desc.count == 0) continue; desc.error = 0; @@ -1558,25 +1551,30 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * waiting for the lock. */ do_async_mmap_readahead(vma, ra, file, page, offset); - lock_page(page); - - /* Did it get truncated? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto no_cached_page; - } } else { /* No page in the page cache at all */ do_sync_mmap_readahead(vma, ra, file, offset); count_vm_event(PGMAJFAULT); ret = VM_FAULT_MAJOR; retry_find: - page = find_lock_page(mapping, offset); + page = find_get_page(mapping, offset); if (!page) goto no_cached_page; } + if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) { + page_cache_release(page); + return ret | VM_FAULT_RETRY; + } + + /* Did it get truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + put_page(page); + goto retry_find; + } + VM_BUG_ON(page->index != offset); + /* * We have a locked page in the page cache, now we need to check * that it's up-to-date. If not, it is going to be due to an error. @@ -1648,7 +1646,7 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, }; @@ -1692,14 +1690,15 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) { struct page *page; int err; repeat: page = find_get_page(mapping, index); if (!page) { - page = page_cache_alloc_cold(mapping); + page = __page_cache_alloc(gfp | __GFP_COLD); if (!page) return ERR_PTR(-ENOMEM); err = add_to_page_cache_lru(page, mapping, index, GFP_KERNEL); @@ -1719,31 +1718,18 @@ repeat: return page; } -/** - * read_cache_page_async - read into page cache, fill it if needed - * @mapping: the page's address_space - * @index: the page index - * @filler: function to perform the read - * @data: destination for read data - * - * Same as read_cache_page, but don't wait for page to become unlocked - * after submitting it to the filler. - * - * Read into the page cache. If a page already exists, and PageUptodate() is - * not set, try to fill the page but don't wait for it to become unlocked. - * - * If the page does not get brought uptodate, return -EIO. - */ -struct page *read_cache_page_async(struct address_space *mapping, +static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, int (*filler)(void *,struct page*), - void *data) + void *data, + gfp_t gfp) + { struct page *page; int err; retry: - page = __read_cache_page(mapping, index, filler, data); + page = __read_cache_page(mapping, index, filler, data, gfp); if (IS_ERR(page)) return page; if (PageUptodate(page)) @@ -1768,8 +1754,67 @@ out: mark_page_accessed(page); return page; } + +/** + * read_cache_page_async - read into page cache, fill it if needed + * @mapping: the page's address_space + * @index: the page index + * @filler: function to perform the read + * @data: destination for read data + * + * Same as read_cache_page, but don't wait for page to become unlocked + * after submitting it to the filler. + * + * Read into the page cache. If a page already exists, and PageUptodate() is + * not set, try to fill the page but don't wait for it to become unlocked. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_async(struct address_space *mapping, + pgoff_t index, + int (*filler)(void *,struct page*), + void *data) +{ + return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); +} EXPORT_SYMBOL(read_cache_page_async); +static struct page *wait_on_page_read(struct page *page) +{ + if (!IS_ERR(page)) { + wait_on_page_locked(page); + if (!PageUptodate(page)) { + page_cache_release(page); + page = ERR_PTR(-EIO); + } + } + return page; +} + +/** + * read_cache_page_gfp - read into page cache, using specified page allocation flags. + * @mapping: the page's address_space + * @index: the page index + * @gfp: the page allocator flags to use if allocating + * + * This is the same as "read_mapping_page(mapping, index, NULL)", but with + * any new page allocations done using the specified allocation flags. Note + * that the Radix tree operations will still use GFP_KERNEL, so you can't + * expect to do this atomically or anything like that - but you can pass in + * other page requirements. + * + * If the page does not get brought uptodate, return -EIO. + */ +struct page *read_cache_page_gfp(struct address_space *mapping, + pgoff_t index, + gfp_t gfp) +{ + filler_t *filler = (filler_t *)mapping->a_ops->readpage; + + return wait_on_page_read(do_read_cache_page(mapping, index, filler, NULL, gfp)); +} +EXPORT_SYMBOL(read_cache_page_gfp); + /** * read_cache_page - read into page cache, fill it if needed * @mapping: the page's address_space @@ -1787,18 +1832,7 @@ struct page *read_cache_page(struct address_space *mapping, int (*filler)(void *,struct page*), void *data) { - struct page *page; - - page = read_cache_page_async(mapping, index, filler, data); - if (IS_ERR(page)) - goto out; - wait_on_page_locked(page); - if (!PageUptodate(page)) { - page_cache_release(page); - page = ERR_PTR(-EIO); - } - out: - return page; + return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); } EXPORT_SYMBOL(read_cache_page); @@ -1881,7 +1915,7 @@ static size_t __iovec_copy_from_user_inatomic(char *vaddr, /* * Copy as much as we can into the page and return the number of bytes which - * were sucessfully copied. If a fault is encountered then return the number of + * were successfully copied. If a fault is encountered then return the number of * bytes which were copied. */ size_t iov_iter_copy_from_user_atomic(struct page *page, @@ -2008,7 +2042,7 @@ EXPORT_SYMBOL(iov_iter_single_seg_count); inline int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk) { struct inode *inode = file->f_mapping->host; - unsigned long limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; + unsigned long limit = rlimit(RLIMIT_FSIZE); if (unlikely(*pos < 0)) return -EINVAL; @@ -2160,27 +2194,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov, } if (written > 0) { - loff_t end = pos + written; - if (end > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { - i_size_write(inode, end); + pos += written; + if (pos > i_size_read(inode) && !S_ISBLK(inode->i_mode)) { + i_size_write(inode, pos); mark_inode_dirty(inode); } - *ppos = end; + *ppos = pos; } - - /* - * Sync the fs metadata but not the minor inode changes and - * of course not the data as we did direct DMA for the IO. - * i_mutex is held, which protects generic_osync_inode() from - * livelocking. AIO O_DIRECT ops attempt to sync metadata here. - */ out: - if ((written >= 0 || written == -EIOCBQUEUED) && - ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - int err = generic_osync_inode(inode, mapping, OSYNC_METADATA); - if (err < 0) - written = err; - } return written; } EXPORT_SYMBOL(generic_file_direct_write); @@ -2234,14 +2255,12 @@ static ssize_t generic_perform_write(struct file *file, do { struct page *page; - pgoff_t index; /* Pagecache index for current page */ unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ void *fsdata; offset = (pos & (PAGE_CACHE_SIZE - 1)); - index = pos >> PAGE_CACHE_SHIFT; bytes = min_t(unsigned long, PAGE_CACHE_SIZE - offset, iov_iter_count(i)); @@ -2267,11 +2286,15 @@ again: if (unlikely(status)) break; + if (mapping_writably_mapped(mapping)) + flush_dcache_page(page); + pagefault_disable(); copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); pagefault_enable(); flush_dcache_page(page); + mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) @@ -2310,9 +2333,6 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, size_t count, ssize_t written) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - const struct address_space_operations *a_ops = mapping->a_ops; - struct inode *inode = mapping->host; ssize_t status; struct iov_iter i; @@ -2322,34 +2342,33 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov, if (likely(status >= 0)) { written += status; *ppos = pos + status; - - /* - * For now, when the user asks for O_SYNC, we'll actually give - * O_DSYNC - */ - if (unlikely((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - if (!a_ops->writepage || !is_sync_kiocb(iocb)) - status = generic_osync_inode(inode, mapping, - OSYNC_METADATA|OSYNC_DATA); - } } - /* - * If we get here for O_DIRECT writes then we must have fallen through - * to buffered writes (block instantiation inside i_size). So we sync - * the file data here, to try to honour O_DIRECT expectations. - */ - if (unlikely(file->f_flags & O_DIRECT) && written) - status = filemap_write_and_wait_range(mapping, - pos, pos + written - 1); - return written ? written : status; } EXPORT_SYMBOL(generic_file_buffered_write); -static ssize_t -__generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t *ppos) +/** + * __generic_file_aio_write - write data to a file + * @iocb: IO state structure (file, offset, etc.) + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @ppos: position where to write + * + * This function does all the work needed for actually writing data to a + * file. It does all basic checks, removes SUID from the file, updates + * modification times and calls proper subroutines depending on whether we + * do direct IO or a standard buffered write. + * + * It expects i_mutex to be grabbed unless we work on a block device or similar + * object which does not need locking at all. + * + * This function does *not* take care of syncing data in case of O_SYNC write. + * A caller has to handle it. This is mainly due to the fact that we want to + * avoid syncing under i_mutex. + */ +ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct file *file = iocb->ki_filp; struct address_space * mapping = file->f_mapping; @@ -2423,10 +2442,7 @@ __generic_file_aio_write_nolock(struct kiocb *iocb, const struct iovec *iov, * semantics. */ endbyte = pos + written_buffered - written - 1; - err = do_sync_mapping_range(file->f_mapping, pos, endbyte, - SYNC_FILE_RANGE_WAIT_BEFORE| - SYNC_FILE_RANGE_WRITE| - SYNC_FILE_RANGE_WAIT_AFTER); + err = filemap_write_and_wait_range(file->f_mapping, pos, endbyte); if (err == 0) { written = written_buffered; invalidate_mapping_pages(mapping, @@ -2446,51 +2462,37 @@ out: current->backing_dev_info = NULL; return written ? written : err; } +EXPORT_SYMBOL(__generic_file_aio_write); -ssize_t generic_file_aio_write_nolock(struct kiocb *iocb, - const struct iovec *iov, unsigned long nr_segs, loff_t pos) -{ - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - ssize_t ret; - - BUG_ON(iocb->ki_pos != pos); - - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); - - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { - ssize_t err; - - err = sync_page_range_nolock(inode, mapping, pos, ret); - if (err < 0) - ret = err; - } - return ret; -} -EXPORT_SYMBOL(generic_file_aio_write_nolock); - +/** + * generic_file_aio_write - write data to a file + * @iocb: IO state structure + * @iov: vector with data to write + * @nr_segs: number of segments in the vector + * @pos: position in file where to write + * + * This is a wrapper around __generic_file_aio_write() to be used by most + * filesystems. It takes care of syncing the file in case of O_SYNC file + * and acquires i_mutex as needed. + */ ssize_t generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; + struct inode *inode = file->f_mapping->host; ssize_t ret; BUG_ON(iocb->ki_pos != pos); mutex_lock(&inode->i_mutex); - ret = __generic_file_aio_write_nolock(iocb, iov, nr_segs, - &iocb->ki_pos); + ret = __generic_file_aio_write(iocb, iov, nr_segs, &iocb->ki_pos); mutex_unlock(&inode->i_mutex); - if (ret > 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) { + if (ret > 0 || ret == -EIOCBQUEUED) { ssize_t err; - err = sync_page_range(inode, mapping, pos, ret); - if (err < 0) + err = generic_write_sync(file, pos, ret); + if (err < 0 && ret > 0) ret = err; } return ret; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 427dfe3..83364df 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -17,6 +17,7 @@ #include <linux/sched.h> #include <linux/seqlock.h> #include <linux/mutex.h> +#include <linux/gfp.h> #include <asm/tlbflush.h> #include <asm/io.h> @@ -194,7 +195,7 @@ retry: flush_cache_page(vma, address, pte_pfn(*pte)); pteval = ptep_clear_flush_notify(vma, address, pte); page_remove_rmap(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); BUG_ON(pte_dirty(pteval)); pte_unmap_unlock(pte, ptl); page_cache_release(page); @@ -296,7 +297,7 @@ out: } } -static struct vm_operations_struct xip_file_vm_ops = { +static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, }; diff --git a/mm/fremap.c b/mm/fremap.c index b6ec85a..ec520c7 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -40,7 +40,7 @@ static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, page_remove_rmap(page); page_cache_release(page); update_hiwater_rss(mm); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); } } else { if (!pte_file(pte)) @@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, { struct mm_struct *mm = current->mm; struct address_space *mapping; - unsigned long end = start + size; struct vm_area_struct *vma; int err = -EINVAL; int has_write_lock = 0; @@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (start + size <= start) return err; + /* Does pgoff wrap? */ + if (pgoff + (size >> PAGE_SHIFT) < pgoff) + return err; + /* Can we represent this offset inside this architecture's pte's? */ #if PTE_FILE_MAX_BITS < BITS_PER_LONG if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS)) @@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (!(vma->vm_flags & VM_CAN_NONLINEAR)) goto out; - if (end <= start || start < vma->vm_start || end > vma->vm_end) + if (start < vma->vm_start || start + size > vma->vm_end) goto out; /* Must set VM_NONLINEAR before any pages are populated. */ diff --git a/mm/highmem.c b/mm/highmem.c index 25878cc..693394d 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -26,8 +26,14 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> +#include <linux/kgdb.h> #include <asm/tlbflush.h> + +#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) +DEFINE_PER_CPU(int, __kmap_atomic_idx); +#endif + /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -41,6 +47,9 @@ unsigned long totalhigh_pages __read_mostly; EXPORT_SYMBOL(totalhigh_pages); + +EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); + unsigned int nr_free_highpages (void) { pg_data_t *pgdat; @@ -220,7 +229,7 @@ EXPORT_SYMBOL(kmap_high); * @page: &struct page to pin * * Returns the page's current virtual memory address, or NULL if no mapping - * exists. When and only when a non null address is returned then a + * exists. If and only if a non null address is returned then a * matching call to kunmap_high() is necessary. * * This can be called from any context. @@ -421,48 +430,3 @@ void __init page_address_init(void) } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ - -#if defined(CONFIG_DEBUG_HIGHMEM) && defined(CONFIG_TRACE_IRQFLAGS_SUPPORT) - -void debug_kmap_atomic(enum km_type type) -{ - static unsigned warn_count = 10; - - if (unlikely(warn_count == 0)) - return; - - if (unlikely(in_interrupt())) { - if (in_irq()) { - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_BIO_SRC_IRQ && type != KM_BIO_DST_IRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } else if (!irqs_disabled()) { /* softirq */ - if (type != KM_IRQ0 && type != KM_IRQ1 && - type != KM_SOFTIRQ0 && type != KM_SOFTIRQ1 && - type != KM_SKB_SUNRPC_DATA && - type != KM_SKB_DATA_SOFTIRQ && - type != KM_BOUNCE_READ) { - WARN_ON(1); - warn_count--; - } - } - } - - if (type == KM_IRQ0 || type == KM_IRQ1 || type == KM_BOUNCE_READ || - type == KM_BIO_SRC_IRQ || type == KM_BIO_DST_IRQ) { - if (!irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } else if (type == KM_SOFTIRQ0 || type == KM_SOFTIRQ1) { - if (irq_count() == 0 && !irqs_disabled()) { - WARN_ON(1); - warn_count--; - } - } -} - -#endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a56e6f3..8585524 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2,7 +2,6 @@ * Generic hugetlb support. * (C) William Irwin, April 2004 */ -#include <linux/gfp.h> #include <linux/list.h> #include <linux/init.h> #include <linux/module.h> @@ -18,12 +17,17 @@ #include <linux/mutex.h> #include <linux/bootmem.h> #include <linux/sysfs.h> +#include <linux/slab.h> +#include <linux/rmap.h> +#include <linux/swap.h> +#include <linux/swapops.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/io.h> #include <linux/hugetlb.h> +#include <linux/node.h> #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; @@ -219,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, (vma->vm_pgoff >> huge_page_order(h)); } +pgoff_t linear_hugepage_index(struct vm_area_struct *vma, + unsigned long address) +{ + return vma_hugecache_offset(hstate_vma(vma), vma, address); +} + /* * Return the size of the pages allocated when backing a VMA. In the majority * cases this will be same size as used by the page table entries. @@ -234,6 +244,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) return 1UL << (hstate->order + PAGE_SHIFT); } +EXPORT_SYMBOL_GPL(vma_kernel_pagesize); /* * Return the page size being used by the MMU to back a VMA. In the majority @@ -400,7 +411,7 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { clear_gigantic_page(page, addr, sz); return; } @@ -412,14 +423,14 @@ static void clear_huge_page(struct page *page, } } -static void copy_gigantic_page(struct page *dst, struct page *src, +static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); struct page *dst_base = dst; struct page *src_base = src; - might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); @@ -429,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, src = mem_map_next(src, src_base, i); } } -static void copy_huge_page(struct page *dst, struct page *src, + +static void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src, addr, vma); + copy_user_gigantic_page(dst, src, addr, vma); return; } @@ -447,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, } } +static void copy_gigantic_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); @@ -455,21 +501,17 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(struct hstate *h) +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) { - int nid; - struct page *page = NULL; + struct page *page; - for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - break; - } - } + if (list_empty(&h->hugepage_freelists[nid])) + return NULL; + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); + set_page_refcounted(page); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; return page; } @@ -477,15 +519,16 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - int nid; struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; - struct zonelist *zonelist = huge_zonelist(vma, address, - htlb_alloc_mask, &mpol, &nodemask); + struct zonelist *zonelist; struct zone *zone; struct zoneref *z; + get_mems_allowed(); + zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol, &nodemask); /* * A child process with MAP_PRIVATE mappings created by their parent * have no page reserves. This check ensures that reservations are @@ -493,30 +536,26 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, */ if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) - return NULL; + goto err;; for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - nid = zone_to_nid(zone); - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); - - break; + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + page = dequeue_huge_page_node(h, zone_to_nid(zone)); + if (page) { + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + break; + } } } +err: mpol_cond_put(mpol); + put_mems_allowed(); return page; } @@ -562,7 +601,9 @@ static void free_huge_page(struct page *page) mapping = (struct address_space *) page_private(page); set_page_private(page, 0); + page->mapping = NULL; BUG_ON(page_count(page)); + BUG_ON(page_mapcount(page)); INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); @@ -616,6 +657,8 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } +EXPORT_SYMBOL_GPL(PageHuge); + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -639,41 +682,66 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) } /* - * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. */ -static int hstate_next_node(struct hstate *h) +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - int next_nid; - next_nid = next_node(h->hugetlb_next_nid, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; - return next_nid; + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; } -static int alloc_fresh_huge_page(struct hstate *h) +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = h->hugetlb_next_nid; + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); + next_nid = start_nid; do { - page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); - if (page) + page = alloc_fresh_huge_page_node(h, next_nid); + if (page) { ret = 1; - next_nid = hstate_next_node(h); - } while (!page && h->hugetlb_next_nid != start_nid); + break; + } + next_nid = hstate_next_node_to_alloc(h, nodes_allowed); + } while (next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -683,11 +751,71 @@ static int alloc_fresh_huge_page(struct hstate *h) return ret; } -static struct page *alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +/* + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. + */ +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * Free huge page from pool from next node to free. + * Attempt to keep persistent huge pages more or less + * balanced over allowed nodes. + * Called with hugetlb_lock locked. + */ +static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + bool acct_surplus) +{ + int start_nid; + int next_nid; + int ret = 0; + + start_nid = hstate_next_node_to_free(h, nodes_allowed); + next_nid = start_nid; + + do { + /* + * If we're returning unused surplus pages, only examine + * nodes with surplus pages. + */ + if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && + !list_empty(&h->hugepage_freelists[next_nid])) { + struct page *page = + list_entry(h->hugepage_freelists[next_nid].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[next_nid]--; + if (acct_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[next_nid]--; + } + update_and_free_page(h, page); + ret = 1; + break; + } + next_nid = hstate_next_node_to_free(h, nodes_allowed); + } while (next_nid != start_nid); + + return ret; +} + +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; - unsigned int nid; + unsigned int r_nid; if (h->order >= MAX_ORDER) return NULL; @@ -725,9 +853,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); + if (nid == NUMA_NO_NODE) + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + else + page = alloc_pages_exact_node(nid, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); @@ -736,19 +869,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, spin_lock(&hugetlb_lock); if (page) { - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - put_page_testzero(page); - VM_BUG_ON(page_count(page)); - nid = page_to_nid(page); + r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* * We incremented the global counters already */ - h->nr_huge_pages_node[nid]++; - h->surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[r_nid]++; + h->surplus_huge_pages_node[r_nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; @@ -761,6 +888,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } /* + * This allocation function is useful in the context where vma is irrelevant. + * E.g. soft-offlining uses this function because it only cares physical + * address of error page. + */ +struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page_node(h, nid); + spin_unlock(&hugetlb_lock); + + if (!page) + page = alloc_buddy_huge_page(h, nid); + + return page; +} + +/* * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ @@ -784,17 +930,14 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NULL, 0); - if (!page) { + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); + if (!page) /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ - spin_lock(&hugetlb_lock); - needed = 0; goto free; - } list_add(&page->lru, &surplus_list); } @@ -821,31 +964,31 @@ retry: needed += allocated; h->resv_huge_pages += delta; ret = 0; -free: + + spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; list_del(&page->lru); + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the buddy allocator's reference. + */ + put_page_testzero(page); + VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ +free: if (!list_empty(&surplus_list)) { - spin_unlock(&hugetlb_lock); list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_del(&page->lru); - /* - * The page has a reference count of zero already, so - * call free_huge_page directly instead of using - * put_page. This must be done with hugetlb_lock - * unlocked which is safe because free_huge_page takes - * hugetlb_lock before deciding how to free the page. - */ - free_huge_page(page); + put_page(page); } - spin_lock(&hugetlb_lock); } + spin_lock(&hugetlb_lock); return ret; } @@ -854,22 +997,13 @@ free: * When releasing a hugetlb pool reservation, any surplus pages that were * allocated to satisfy the reservation must be explicitly freed if they were * never used. + * Called with hugetlb_lock held. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { - static int nid = -1; - struct page *page; unsigned long nr_pages; - /* - * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - */ - unsigned long remaining_iterations = nr_online_nodes; - /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -879,26 +1013,17 @@ static void return_unused_surplus_pages(struct hstate *h, nr_pages = min(unused_resv_pages, h->surplus_huge_pages); - while (remaining_iterations-- && nr_pages) { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - if (!h->surplus_huge_pages_node[nid]) - continue; - - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; - nr_pages--; - remaining_iterations = nr_online_nodes; - } + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes with memory. Iterate across these nodes + * until we can no longer free unreserved surplus pages. This occurs + * when the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the freed pages across the + * on-line nodes with memory and will handle the hstate accounting. + */ + while (nr_pages--) { + if (!free_pool_huge_page(h, &node_states[N_HIGH_MEMORY], 1)) + break; } } @@ -983,14 +1108,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(h, vma, addr); + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); - return ERR_PTR(-VM_FAULT_OOM); + return ERR_PTR(-VM_FAULT_SIGBUS); } } - set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); vma_commit_reservation(h, vma, addr); @@ -1001,13 +1125,14 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; - int nr_nodes = nodes_weight(node_online_map); + int nr_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); while (nr_nodes) { void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(h->hugetlb_next_nid), + NODE_DATA(hstate_next_node_to_alloc(h, + &node_states[N_HIGH_MEMORY])), huge_page_size(h), huge_page_size(h), 0); if (addr) { @@ -1019,7 +1144,6 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) m = addr; goto found; } - hstate_next_node(h); nr_nodes--; } return 0; @@ -1063,7 +1187,8 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (h->order >= MAX_ORDER) { if (!alloc_bootmem_huge_page(h)) break; - } else if (!alloc_fresh_huge_page(h)) + } else if (!alloc_fresh_huge_page(h, + &node_states[N_HIGH_MEMORY])) break; } h->max_huge_pages = i; @@ -1105,14 +1230,15 @@ static void __init report_hugepages(void) } #ifdef CONFIG_HIGHMEM -static void try_to_free_low(struct hstate *h, unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { int i; if (h->order >= MAX_ORDER) return; - for (i = 0; i < MAX_NUMNODES; ++i) { + for_each_node_mask(i, *nodes_allowed) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { @@ -1128,7 +1254,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) } } #else -static inline void try_to_free_low(struct hstate *h, unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { } #endif @@ -1138,38 +1265,56 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. */ -static int adjust_pool_surplus(struct hstate *h, int delta) +static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, + int delta) { - static int prev_nid; - int nid = prev_nid; + int start_nid, next_nid; int ret = 0; VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; + if (delta < 0) + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); + else + start_nid = hstate_next_node_to_free(h, nodes_allowed); + next_nid = start_nid; + + do { + int nid = next_nid; + if (delta < 0) { + /* + * To shrink on this node, there must be a surplus page + */ + if (!h->surplus_huge_pages_node[nid]) { + next_nid = hstate_next_node_to_alloc(h, + nodes_allowed); + continue; + } + } + if (delta > 0) { + /* + * Surplus cannot exceed the total number of pages + */ + if (h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) { + next_nid = hstate_next_node_to_free(h, + nodes_allowed); + continue; + } + } h->surplus_huge_pages += delta; h->surplus_huge_pages_node[nid] += delta; ret = 1; break; - } while (nid != prev_nid); + } while (next_nid != start_nid); - prev_nid = nid; return ret; } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; @@ -1189,7 +1334,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) */ spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { - if (!adjust_pool_surplus(h, -1)) + if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } @@ -1200,11 +1345,14 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(h); + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; + /* Bail for signals. Probably ctrl-c from user */ + if (signal_pending(current)) + goto out; } /* @@ -1224,15 +1372,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) */ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(h, min_count); + try_to_free_low(h, min_count, nodes_allowed); while (min_count < persistent_huge_pages(h)) { - struct page *page = dequeue_huge_page(h); - if (!page) + if (!free_pool_huge_page(h, nodes_allowed, 0)) break; - update_and_free_page(h, page); } while (count < persistent_huge_pages(h)) { - if (!adjust_pool_surplus(h, 1)) + if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: @@ -1251,43 +1397,117 @@ out: static struct kobject *hugepages_kobj; static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; -static struct hstate *kobj_to_hstate(struct kobject *kobj) +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp); + +static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp) { int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) - if (hstate_kobjs[i] == kobj) + if (hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = NUMA_NO_NODE; return &hstates[i]; - BUG(); - return NULL; + } + + return kobj_to_node_hstate(kobj, nidp); } -static ssize_t nr_hugepages_show(struct kobject *kobj, +static ssize_t nr_hugepages_show_common(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->nr_huge_pages); + struct hstate *h; + unsigned long nr_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + nr_huge_pages = h->nr_huge_pages; + else + nr_huge_pages = h->nr_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", nr_huge_pages); } -static ssize_t nr_hugepages_store(struct kobject *kobj, - struct kobj_attribute *attr, const char *buf, size_t count) +static ssize_t nr_hugepages_store_common(bool obey_mempolicy, + struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t len) { int err; - unsigned long input; - struct hstate *h = kobj_to_hstate(kobj); + int nid; + unsigned long count; + struct hstate *h; + NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); - err = strict_strtoul(buf, 10, &input); + err = strict_strtoul(buf, 10, &count); if (err) return 0; - h->max_huge_pages = set_max_huge_pages(h, input); + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) { + /* + * global hstate attribute + */ + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_states[N_HIGH_MEMORY]; + } + } else if (nodes_allowed) { + /* + * per node hstate attribute: adjust count to global, + * but restrict alloc/free to the specified node. + */ + count += h->nr_huge_pages - h->nr_huge_pages_node[nid]; + init_nodemask_of_node(nodes_allowed, nid); + } else + nodes_allowed = &node_states[N_HIGH_MEMORY]; + + h->max_huge_pages = set_max_huge_pages(h, count, nodes_allowed); - return count; + if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + NODEMASK_FREE(nodes_allowed); + + return len; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(false, kobj, attr, buf, len); } HSTATE_ATTR(nr_hugepages); +#ifdef CONFIG_NUMA + +/* + * hstate attribute for optionally mempolicy-based constraint on persistent + * huge page alloc/free. + */ +static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return nr_hugepages_show_common(kobj, attr, buf); +} + +static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t len) +{ + return nr_hugepages_store_common(true, kobj, attr, buf, len); +} +HSTATE_ATTR(nr_hugepages_mempolicy); +#endif + + static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, @@ -1295,7 +1515,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, { int err; unsigned long input; - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); err = strict_strtoul(buf, 10, &input); if (err) @@ -1312,15 +1532,24 @@ HSTATE_ATTR(nr_overcommit_hugepages); static ssize_t free_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->free_huge_pages); + struct hstate *h; + unsigned long free_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + free_huge_pages = h->free_huge_pages; + else + free_huge_pages = h->free_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", free_huge_pages); } HSTATE_ATTR_RO(free_hugepages); static ssize_t resv_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); + struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->resv_huge_pages); } HSTATE_ATTR_RO(resv_hugepages); @@ -1328,8 +1557,17 @@ HSTATE_ATTR_RO(resv_hugepages); static ssize_t surplus_hugepages_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - struct hstate *h = kobj_to_hstate(kobj); - return sprintf(buf, "%lu\n", h->surplus_huge_pages); + struct hstate *h; + unsigned long surplus_huge_pages; + int nid; + + h = kobj_to_hstate(kobj, &nid); + if (nid == NUMA_NO_NODE) + surplus_huge_pages = h->surplus_huge_pages; + else + surplus_huge_pages = h->surplus_huge_pages_node[nid]; + + return sprintf(buf, "%lu\n", surplus_huge_pages); } HSTATE_ATTR_RO(surplus_hugepages); @@ -1339,6 +1577,9 @@ static struct attribute *hstate_attrs[] = { &free_hugepages_attr.attr, &resv_hugepages_attr.attr, &surplus_hugepages_attr.attr, +#ifdef CONFIG_NUMA + &nr_hugepages_mempolicy_attr.attr, +#endif NULL, }; @@ -1346,19 +1587,20 @@ static struct attribute_group hstate_attr_group = { .attrs = hstate_attrs, }; -static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, + struct kobject **hstate_kobjs, + struct attribute_group *hstate_attr_group) { int retval; + int hi = h - hstates; - hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, - hugepages_kobj); - if (!hstate_kobjs[h - hstates]) + hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); + if (!hstate_kobjs[hi]) return -ENOMEM; - retval = sysfs_create_group(hstate_kobjs[h - hstates], - &hstate_attr_group); + retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group); if (retval) - kobject_put(hstate_kobjs[h - hstates]); + kobject_put(hstate_kobjs[hi]); return retval; } @@ -1373,17 +1615,184 @@ static void __init hugetlb_sysfs_init(void) return; for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h); + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); if (err) printk(KERN_ERR "Hugetlb: Unable to add hstate %s", h->name); } } +#ifdef CONFIG_NUMA + +/* + * node_hstate/s - associate per node hstate attributes, via their kobjects, + * with node sysdevs in node_devices[] using a parallel array. The array + * index of a node sysdev or _hstate == node id. + * This is here to avoid any static dependency of the node sysdev driver, in + * the base kernel, on the hugetlb module. + */ +struct node_hstate { + struct kobject *hugepages_kobj; + struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; +}; +struct node_hstate node_hstates[MAX_NUMNODES]; + +/* + * A subset of global hstate attributes for node sysdevs + */ +static struct attribute *per_node_hstate_attrs[] = { + &nr_hugepages_attr.attr, + &free_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group per_node_hstate_attr_group = { + .attrs = per_node_hstate_attrs, +}; + +/* + * kobj_to_node_hstate - lookup global hstate for node sysdev hstate attr kobj. + * Returns node id via non-NULL nidp. + */ +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + int nid; + + for (nid = 0; nid < nr_node_ids; nid++) { + struct node_hstate *nhs = &node_hstates[nid]; + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (nhs->hstate_kobjs[i] == kobj) { + if (nidp) + *nidp = nid; + return &hstates[i]; + } + } + + BUG(); + return NULL; +} + +/* + * Unregister hstate attributes from a single node sysdev. + * No-op if no hstate attributes attached. + */ +void hugetlb_unregister_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + + if (!nhs->hugepages_kobj) + return; /* no hstate attributes */ + + for_each_hstate(h) + if (nhs->hstate_kobjs[h - hstates]) { + kobject_put(nhs->hstate_kobjs[h - hstates]); + nhs->hstate_kobjs[h - hstates] = NULL; + } + + kobject_put(nhs->hugepages_kobj); + nhs->hugepages_kobj = NULL; +} + +/* + * hugetlb module exit: unregister hstate attributes from node sysdevs + * that have them. + */ +static void hugetlb_unregister_all_nodes(void) +{ + int nid; + + /* + * disable node sysdev registrations. + */ + register_hugetlbfs_with_node(NULL, NULL); + + /* + * remove hstate attributes from any nodes that have them. + */ + for (nid = 0; nid < nr_node_ids; nid++) + hugetlb_unregister_node(&node_devices[nid]); +} + +/* + * Register hstate attributes for a single node sysdev. + * No-op if attributes already registered. + */ +void hugetlb_register_node(struct node *node) +{ + struct hstate *h; + struct node_hstate *nhs = &node_hstates[node->sysdev.id]; + int err; + + if (nhs->hugepages_kobj) + return; /* already allocated */ + + nhs->hugepages_kobj = kobject_create_and_add("hugepages", + &node->sysdev.kobj); + if (!nhs->hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj, + nhs->hstate_kobjs, + &per_node_hstate_attr_group); + if (err) { + printk(KERN_ERR "Hugetlb: Unable to add hstate %s" + " for node %d\n", + h->name, node->sysdev.id); + hugetlb_unregister_node(node); + break; + } + } +} + +/* + * hugetlb init time: register hstate attributes for all registered node + * sysdevs of nodes that have memory. All on-line nodes should have + * registered their associated sysdev by this time. + */ +static void hugetlb_register_all_nodes(void) +{ + int nid; + + for_each_node_state(nid, N_HIGH_MEMORY) { + struct node *node = &node_devices[nid]; + if (node->sysdev.id == nid) + hugetlb_register_node(node); + } + + /* + * Let the node sysdev driver know we're here so it can + * [un]register hstate attributes on node hotplug. + */ + register_hugetlbfs_with_node(hugetlb_register_node, + hugetlb_unregister_node); +} +#else /* !CONFIG_NUMA */ + +static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) +{ + BUG(); + if (nidp) + *nidp = -1; + return NULL; +} + +static void hugetlb_unregister_all_nodes(void) { } + +static void hugetlb_register_all_nodes(void) { } + +#endif + static void __exit hugetlb_exit(void) { struct hstate *h; + hugetlb_unregister_all_nodes(); + for_each_hstate(h) { kobject_put(hstate_kobjs[h - hstates]); } @@ -1418,6 +1827,8 @@ static int __init hugetlb_init(void) hugetlb_sysfs_init(); + hugetlb_register_all_nodes(); + return 0; } module_init(hugetlb_init); @@ -1441,7 +1852,8 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->hugetlb_next_nid = first_node(node_online_map); + h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); + h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1504,9 +1916,9 @@ static unsigned int cpuset_mems_nr(unsigned int *array) } #ifdef CONFIG_SYSCTL -int hugetlb_sysctl_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, - size_t *length, loff_t *ppos) +static int hugetlb_sysctl_handler_common(bool obey_mempolicy, + struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; unsigned long tmp; @@ -1516,19 +1928,47 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); + + if (write) { + NODEMASK_ALLOC(nodemask_t, nodes_allowed, + GFP_KERNEL | __GFP_NORETRY); + if (!(obey_mempolicy && + init_nodemask_of_mempolicy(nodes_allowed))) { + NODEMASK_FREE(nodes_allowed); + nodes_allowed = &node_states[N_HIGH_MEMORY]; + } + h->max_huge_pages = set_max_huge_pages(h, tmp, nodes_allowed); - if (write) - h->max_huge_pages = set_max_huge_pages(h, tmp); + if (nodes_allowed != &node_states[N_HIGH_MEMORY]) + NODEMASK_FREE(nodes_allowed); + } return 0; } +int hugetlb_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + + return hugetlb_sysctl_handler_common(false, table, write, + buffer, length, ppos); +} + +#ifdef CONFIG_NUMA +int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) +{ + return hugetlb_sysctl_handler_common(true, table, write, + buffer, length, ppos); +} +#endif /* CONFIG_NUMA */ + int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (hugepages_treat_as_movable) htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; else @@ -1537,7 +1977,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, } int hugetlb_overcommit_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1548,7 +1988,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) { spin_lock(&hugetlb_lock); @@ -1689,7 +2129,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -struct vm_operations_struct hugetlb_vm_ops = { +const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, @@ -1719,7 +2159,7 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, ptep); } } @@ -1756,6 +2196,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, entry = huge_ptep_get(src_pte); ptepage = pte_page(entry); get_page(ptepage); + page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } spin_unlock(&src->page_table_lock); @@ -1767,6 +2208,32 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) { + return 1; + } else + return 0; +} + +static int is_hugetlb_entry_hwpoisoned(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + return 1; + } else + return 0; +} + void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -1825,6 +2292,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, if (huge_pte_none(pte)) continue; + /* + * HWPoisoned hugepage is already unmapped and dropped reference + */ + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) + continue; + page = pte_page(pte); if (pte_dirty(pte)) set_page_dirty(page); @@ -1834,6 +2307,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, flush_tlb_range(vma, start, end); mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { + page_remove_rmap(page); list_del(&page->lru); put_page(page); } @@ -1871,6 +2345,12 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + (vma->vm_pgoff >> PAGE_SHIFT); mapping = (struct address_space *)page_private(page); + /* + * Take the mapping lock for the duration of the table walk. As + * this mapping should be shared between all the VMAs, + * __unmap_hugepage_range() is called as the lock is already held + */ + spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(iter_vma, &iter, &mapping->i_mmap, pgoff, pgoff) { /* Do not unmap the current VMA */ if (iter_vma == vma) @@ -1884,14 +2364,18 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) - unmap_hugepage_range(iter_vma, + __unmap_hugepage_range(iter_vma, address, address + huge_page_size(h), page); } + spin_unlock(&mapping->i_mmap_lock); return 1; } +/* + * Hugetlb_cow() should be called with page lock of the original hugepage held. + */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, struct page *pagecache_page) @@ -1906,8 +2390,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, retry_avoidcopy: /* If no-one else is actually using this page, avoid the copy * and just make the page writable */ - avoidcopy = (page_count(old_page) == 1); + avoidcopy = (page_mapcount(old_page) == 1); if (avoidcopy) { + if (PageAnon(old_page)) + page_move_anon_rmap(old_page, vma, address); set_huge_ptep_writable(vma, address, ptep); return 0; } @@ -1927,6 +2413,9 @@ retry_avoidcopy: outside_reserve = 1; page_cache_get(old_page); + + /* Drop page_table_lock as buddy allocator may be called */ + spin_unlock(&mm->page_table_lock); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -1944,27 +2433,51 @@ retry_avoidcopy: if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(page_count(old_page) != 1); BUG_ON(huge_pte_none(pte)); + spin_lock(&mm->page_table_lock); goto retry_avoidcopy; } WARN_ON_ONCE(1); } + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); return -PTR_ERR(new_page); } - spin_unlock(&mm->page_table_lock); - copy_huge_page(new_page, old_page, address, vma); + /* + * When the original hugepage is shared one, it does not have + * anon_vma prepared. + */ + if (unlikely(anon_vma_prepare(vma))) { + /* Caller expects lock to be held */ + spin_lock(&mm->page_table_lock); + return VM_FAULT_OOM; + } + + copy_user_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); - spin_lock(&mm->page_table_lock); + /* + * Retake the page_table_lock to check for racing updates + * before the page tables are altered + */ + spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { /* Break COW */ + mmu_notifier_invalidate_range_start(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); huge_ptep_clear_flush(vma, address, ptep); set_huge_pte_at(mm, address, ptep, make_huge_pte(vma, new_page, 1)); + page_remove_rmap(old_page); + hugepage_add_new_anon_rmap(new_page, vma, address); /* Make the old page be freed below */ new_page = old_page; + mmu_notifier_invalidate_range_end(mm, + address & huge_page_mask(h), + (address & huge_page_mask(h)) + huge_page_size(h)); } page_cache_release(new_page); page_cache_release(old_page); @@ -1984,8 +2497,28 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, return find_lock_page(mapping, idx); } +/* + * Return whether there is a pagecache page to back given address within VMA. + * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. + */ +static bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + struct page *page; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + page = find_get_page(mapping, idx); + if (page) + put_page(page); + return page != NULL; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, int write_access) + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; @@ -2043,8 +2576,27 @@ retry: spin_lock(&inode->i_lock); inode->i_blocks += blocks_per_huge_page(h); spin_unlock(&inode->i_lock); - } else + page_dup_rmap(page); + } else { lock_page(page); + if (unlikely(anon_vma_prepare(vma))) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + hugepage_add_new_anon_rmap(page, vma, address); + } + } else { + /* + * If memory error occurs between mmap() and fault, some process + * don't have hwpoisoned swap entry for errored virtual address. + * So we need to block hugepage fault by PG_hwpoison bit check. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON | + VM_FAULT_SET_HINDEX(h - hstates); + goto backout_unlocked; + } + page_dup_rmap(page); } /* @@ -2053,7 +2605,7 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. */ - if (write_access && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -2072,7 +2624,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); - if (write_access && !(vma->vm_flags & VM_SHARED)) { + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } @@ -2091,15 +2643,27 @@ backout_unlocked: } int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pte_t *ptep; pte_t entry; int ret; + struct page *page = NULL; struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); + ptep = huge_pte_offset(mm, address); + if (ptep) { + entry = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_migration(entry))) { + migration_entry_wait(mm, (pmd_t *)ptep, address); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + return VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(h - hstates); + } + ptep = huge_pte_alloc(mm, address, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; @@ -2112,7 +2676,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_lock(&hugetlb_instantiation_mutex); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + ret = hugetlb_no_page(mm, vma, address, ptep, flags); goto out_mutex; } @@ -2126,7 +2690,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. */ - if (write_access && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -2137,13 +2701,24 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, vma, address); } + /* + * hugetlb_cow() requires page locks of pte_page(entry) and + * pagecache_page, so here we need take the former one + * when page != pagecache_page or !pagecache_page. + * Note that locking order is always pagecache_page -> page, + * so no worry about deadlock. + */ + page = pte_page(entry); + if (page != pagecache_page) + lock_page(page); + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) goto out_page_table_lock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); @@ -2152,8 +2727,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) - update_mmu_cache(vma, address, entry); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, + flags & FAULT_FLAG_WRITE)) + update_mmu_cache(vma, address, ptep); out_page_table_lock: spin_unlock(&mm->page_table_lock); @@ -2162,6 +2738,8 @@ out_page_table_lock: unlock_page(pagecache_page); put_page(pagecache_page); } + if (page != pagecache_page) + unlock_page(page); out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); @@ -2178,54 +2756,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } -static int huge_zeropage_ok(pte_t *ptep, int write, int shared) -{ - if (!ptep || write || shared) - return 0; - else - return huge_pte_none(huge_ptep_get(ptep)); -} - int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, - int write) + unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); - int zeropage_ok = 0; - int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + int absent; struct page *page; /* * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make * sure we get the + * each hugepage. We have to make sure we get the * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); - if (huge_zeropage_ok(pte, write, shared)) - zeropage_ok = 1; + absent = !pte || huge_pte_none(huge_ptep_get(pte)); + + /* + * When coredumping, it suits get_dump_page if we just return + * an error where there's an empty slot with no huge pagecache + * to back it. This way, we avoid allocating a hugepage, and + * the sparse dumpfile avoids allocating disk blocks, but its + * huge holes still show up with zeroes where they need to be. + */ + if (absent && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, vaddr)) { + remainder = 0; + break; + } - if (!pte || - (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || - (write && !pte_write(huge_ptep_get(pte)))) { + if (absent || + ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); - ret = hugetlb_fault(mm, vma, vaddr, write); + ret = hugetlb_fault(mm, vma, vaddr, + (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; - if (!i) - i = -EFAULT; break; } @@ -2233,10 +2812,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - if (zeropage_ok) - pages[i] = ZERO_PAGE(0); - else - pages[i] = mem_map_offset(page, pfn_offset); + pages[i] = mem_map_offset(page, pfn_offset); get_page(pages[i]); } @@ -2260,7 +2836,7 @@ same_page: *length = remainder; *position = vaddr; - return i; + return i ? i : -EFAULT; } void hugetlb_change_protection(struct vm_area_struct *vma, @@ -2369,9 +2945,48 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= blocks_per_huge_page(h); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed)); hugetlb_acct_memory(h, -(chg - freed)); } + +#ifdef CONFIG_MEMORY_FAILURE + +/* Should be called in hugetlb_lock */ +static int is_hugepage_on_freelist(struct page *hpage) +{ + struct page *page; + struct page *tmp; + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) + if (page == hpage) + return 1; + return 0; +} + +/* + * This function is called from memory failure code. + * Assume the caller holds page lock of the head page. + */ +int dequeue_hwpoisoned_huge_page(struct page *hpage) +{ + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + int ret = -EBUSY; + + spin_lock(&hugetlb_lock); + if (is_hugepage_on_freelist(hpage)) { + list_del(&hpage->lru); + set_page_refcounted(hpage); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + ret = 0; + } + spin_unlock(&hugetlb_lock); + return ret; +} +#endif diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c new file mode 100644 index 0000000..0948f10 --- /dev/null +++ b/mm/hwpoison-inject.c @@ -0,0 +1,141 @@ +/* Inject a hwpoison memory failure on a arbitary pfn */ +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/swap.h> +#include <linux/pagemap.h> +#include <linux/hugetlb.h> +#include "internal.h" + +static struct dentry *hwpoison_dir; + +static int hwpoison_inject(void *data, u64 val) +{ + unsigned long pfn = val; + struct page *p; + struct page *hpage; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!hwpoison_filter_enable) + goto inject; + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + hpage = compound_head(p); + /* + * This implies unable to support free buddy pages. + */ + if (!get_page_unless_zero(hpage)) + return 0; + + if (!PageLRU(p) && !PageHuge(p)) + shake_page(p, 0); + /* + * This implies unable to support non-LRU pages. + */ + if (!PageLRU(p) && !PageHuge(p)) + return 0; + + /* + * do a racy check with elevated page count, to make sure PG_hwpoison + * will only be set for the targeted owner (or on a free page). + * We temporarily take page lock for try_get_mem_cgroup_from_page(). + * __memory_failure() will redo the check reliably inside page lock. + */ + lock_page(hpage); + err = hwpoison_filter(hpage); + unlock_page(hpage); + if (err) + return 0; + +inject: + printk(KERN_INFO "Injecting memory failure at pfn %lx\n", pfn); + return __memory_failure(pfn, 18, MF_COUNT_INCREASED); +} + +static int hwpoison_unpoison(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return unpoison_memory(val); +} + +DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); +DEFINE_SIMPLE_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); + +static void pfn_inject_exit(void) +{ + if (hwpoison_dir) + debugfs_remove_recursive(hwpoison_dir); +} + +static int pfn_inject_init(void) +{ + struct dentry *dentry; + + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); + if (hwpoison_dir == NULL) + return -ENOMEM; + + /* + * Note that the below poison/unpoison interfaces do not involve + * hardware status change, hence do not require hardware support. + * They are mainly for testing hwpoison in software level. + */ + dentry = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + NULL, &hwpoison_fops); + if (!dentry) + goto fail; + + dentry = debugfs_create_file("unpoison-pfn", 0600, hwpoison_dir, + NULL, &unpoison_fops); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-enable", 0600, + hwpoison_dir, &hwpoison_filter_enable); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-dev-major", 0600, + hwpoison_dir, &hwpoison_filter_dev_major); + if (!dentry) + goto fail; + + dentry = debugfs_create_u32("corrupt-filter-dev-minor", 0600, + hwpoison_dir, &hwpoison_filter_dev_minor); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-mask", 0600, + hwpoison_dir, &hwpoison_filter_flags_mask); + if (!dentry) + goto fail; + + dentry = debugfs_create_u64("corrupt-filter-flags-value", 0600, + hwpoison_dir, &hwpoison_filter_flags_value); + if (!dentry) + goto fail; + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP + dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, + hwpoison_dir, &hwpoison_filter_memcg); + if (!dentry) + goto fail; +#endif + + return 0; +fail: + pfn_inject_exit(); + return -ENOMEM; +} + +module_init(pfn_inject_init); +module_exit(pfn_inject_exit); +MODULE_LICENSE("GPL"); diff --git a/mm/init-mm.c b/mm/init-mm.c index 57aba0d..1d29cdf 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -7,6 +7,11 @@ #include <asm/atomic.h> #include <asm/pgtable.h> +#include <asm/mmu.h> + +#ifndef INIT_MM_CONTEXT +#define INIT_MM_CONTEXT(name) +#endif struct mm_struct init_mm = { .mm_rb = RB_ROOT, @@ -17,4 +22,5 @@ struct mm_struct init_mm = { .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .cpu_vm_mask = CPU_MASK_ALL, + INIT_MM_CONTEXT(init_mm) }; diff --git a/mm/internal.h b/mm/internal.h index f290c4d..dedb0af 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -37,6 +37,8 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +extern unsigned long highest_memmap_pfn; + /* * in mm/vmscan.c: */ @@ -46,9 +48,11 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern unsigned long highest_memmap_pfn; extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); +#ifdef CONFIG_MEMORY_FAILURE +extern bool is_free_buddy_page(struct page *page); +#endif /* @@ -58,11 +62,11 @@ extern void prep_compound_page(struct page *page, unsigned long order); */ static inline unsigned long page_order(struct page *page) { - VM_BUG_ON(!PageBuddy(page)); + /* PageBuddy() must be checked by the caller */ return page_private(page); } -#ifdef CONFIG_HAVE_MLOCK +#ifdef CONFIG_MMU extern long mlock_vma_pages_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void munlock_vma_pages_range(struct vm_area_struct *vma, @@ -71,22 +75,8 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) { munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); } -#endif /* - * unevictable_migrate_page() called only from migrate_page_copy() to - * migrate unevictable flag to new page. - * Note that the old page has been isolated from the LRU lists at this - * point so we don't need to worry about LRU statistics. - */ -static inline void unevictable_migrate_page(struct page *new, struct page *old) -{ - if (TestClearPageUnevictable(old)) - SetPageUnevictable(new); -} - -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT -/* * Called only in fault path via page_evictable() for a new page * to determine if it's being mapped into a LOCKED vma. * If so, mark page as mlocked. @@ -106,9 +96,10 @@ static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) } /* - * must be called with vma's mmap_sem held for read, and page locked. + * must be called with vma's mmap_sem held for read or write, and page locked. */ extern void mlock_vma_page(struct page *page); +extern void munlock_vma_page(struct page *page); /* * Clear the page's PageMlocked(). This can be useful in a situation where @@ -143,7 +134,7 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } -#else /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ +#else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { return 0; @@ -152,7 +143,7 @@ static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } -#endif /* CONFIG_HAVE_MLOCKED_PAGE_BIT */ +#endif /* !CONFIG_MMU */ /* * Return the mem_map entry representing the 'offset' subpage within @@ -250,13 +241,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ -#define GUP_FLAGS_WRITE 0x1 -#define GUP_FLAGS_FORCE 0x2 -#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 -#define GUP_FLAGS_IGNORE_SIGKILL 0x8 - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, + unsigned long start, int len, unsigned int foll_flags, struct page **pages, struct vm_area_struct **vmas); #define ZONE_RECLAIM_NOSCAN -2 @@ -264,3 +250,12 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, #define ZONE_RECLAIM_SOME 0 #define ZONE_RECLAIM_SUCCESS 1 #endif + +extern int hwpoison_filter(struct page *p); + +extern u32 hwpoison_filter_dev_major; +extern u32 hwpoison_filter_dev_minor; +extern u64 hwpoison_filter_flags_mask; +extern u64 hwpoison_filter_flags_value; +extern u64 hwpoison_filter_memcg; +extern u32 hwpoison_filter_enable; diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index d5292fc..177a516 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -36,7 +36,7 @@ struct test_node { }; static LIST_HEAD(test_list); -static DEFINE_PER_CPU(void *, test_pointer); +static DEFINE_PER_CPU(void *, kmemleak_test_pointer); /* * Some very simple testing. This function needs to be extended for @@ -86,9 +86,9 @@ static int __init kmemleak_test_init(void) } for_each_possible_cpu(i) { - per_cpu(test_pointer, i) = kmalloc(129, GFP_KERNEL); + per_cpu(kmemleak_test_pointer, i) = kmalloc(129, GFP_KERNEL); pr_info("kmemleak: kmalloc(129) = %p\n", - per_cpu(test_pointer, i)); + per_cpu(kmemleak_test_pointer, i)); } return 0; diff --git a/mm/kmemleak.c b/mm/kmemleak.c index ec759b6..bd9bc21 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -48,10 +48,10 @@ * scanned. This list is only modified during a scanning episode when the * scan_mutex is held. At the end of a scan, the gray_list is always empty. * Note that the kmemleak_object.use_count is incremented when an object is - * added to the gray_list and therefore cannot be freed - * - kmemleak_mutex (mutex): prevents multiple users of the "kmemleak" debugfs - * file together with modifications to the memory scanning parameters - * including the scan_thread pointer + * added to the gray_list and therefore cannot be freed. This mutex also + * prevents multiple users of the "kmemleak" debugfs file together with + * modifications to the memory scanning parameters including the scan_thread + * pointer * * The kmemleak_object structures have a use_count incremented or decremented * using the get_object()/put_object() functions. When the use_count becomes @@ -61,6 +61,8 @@ * structure. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/init.h> #include <linux/kernel.h> #include <linux/list.h> @@ -70,7 +72,6 @@ #include <linux/module.h> #include <linux/kthread.h> #include <linux/prio_tree.h> -#include <linux/gfp.h> #include <linux/fs.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -90,22 +91,24 @@ #include <linux/string.h> #include <linux/nodemask.h> #include <linux/mm.h> +#include <linux/workqueue.h> +#include <linux/crc32.h> #include <asm/sections.h> #include <asm/processor.h> #include <asm/atomic.h> +#include <linux/kmemcheck.h> #include <linux/kmemleak.h> /* * Kmemleak configuration and common defines. */ #define MAX_TRACE 16 /* stack trace length */ -#define REPORTS_NR 50 /* maximum number of reported leaks */ #define MSECS_MIN_AGE 5000 /* minimum object age for reporting */ -#define MSECS_SCAN_YIELD 10 /* CPU yielding period */ #define SECS_FIRST_SCAN 60 /* delay before the first scan */ #define SECS_SCAN_WAIT 600 /* subsequent auto scanning delay */ +#define MAX_SCAN_SIZE 4096 /* maximum size of a scanned block */ #define BYTES_PER_POINTER sizeof(void *) @@ -115,10 +118,13 @@ /* scanning area inside a memory block */ struct kmemleak_scan_area { struct hlist_node node; - unsigned long offset; - size_t length; + unsigned long start; + size_t size; }; +#define KMEMLEAK_GREY 0 +#define KMEMLEAK_BLACK -1 + /* * Structure holding the metadata for each allocated memory block. * Modifications to such objects should be made while holding the @@ -142,6 +148,8 @@ struct kmemleak_object { int min_count; /* the total number of pointers found pointing to this object */ int count; + /* checksum for detecting modified objects */ + u32 checksum; /* memory ranges to be scanned inside an object (empty for all) */ struct hlist_head area_list; unsigned long trace[MAX_TRACE]; @@ -158,6 +166,15 @@ struct kmemleak_object { /* flag set to not scan the object */ #define OBJECT_NO_SCAN (1 << 2) +/* number of bytes to print per line; must be 16 or 32 */ +#define HEX_ROW_SIZE 16 +/* number of bytes to print at a time (1, 2, 4, 8) */ +#define HEX_GROUP_SIZE 1 +/* include ASCII after the hex output */ +#define HEX_ASCII 1 +/* max number of lines to be printed */ +#define HEX_MAX_LINES 2 + /* the list of all allocated objects */ static LIST_HEAD(object_list); /* the list of gray-colored objects (see color_gray comment below) */ @@ -184,22 +201,19 @@ static atomic_t kmemleak_error = ATOMIC_INIT(0); static unsigned long min_addr = ULONG_MAX; static unsigned long max_addr; -/* used for yielding the CPU to other tasks during scanning */ -static unsigned long next_scan_yield; static struct task_struct *scan_thread; -static unsigned long jiffies_scan_yield; +/* used to avoid reporting of recently allocated objects */ static unsigned long jiffies_min_age; +static unsigned long jiffies_last_scan; /* delay between automatic memory scannings */ static signed long jiffies_scan_wait; /* enables or disables the task stacks scanning */ -static int kmemleak_stack_scan; -/* mutex protecting the memory scanning */ +static int kmemleak_stack_scan = 1; +/* protects the memory scanning, parameters and debug/kmemleak file access */ static DEFINE_MUTEX(scan_mutex); -/* mutex protecting the access to the /sys/kernel/debug/kmemleak file */ -static DEFINE_MUTEX(kmemleak_mutex); +/* setting kmemleak=on, will set this var, skipping the disable */ +static int kmemleak_skip_disable; -/* number of leaks reported (for limitation purposes) */ -static int reported_leaks; /* * Early object allocation/freeing logging. Kmemleak is initialized after the @@ -213,6 +227,7 @@ static int reported_leaks; enum { KMEMLEAK_ALLOC, KMEMLEAK_FREE, + KMEMLEAK_FREE_PART, KMEMLEAK_NOT_LEAK, KMEMLEAK_IGNORE, KMEMLEAK_SCAN_AREA, @@ -228,13 +243,14 @@ struct early_log { const void *ptr; /* allocated/freed memory block */ size_t size; /* memory block size */ int min_count; /* minimum reference count */ - unsigned long offset; /* scan area offset */ - size_t length; /* scan area length */ + unsigned long trace[MAX_TRACE]; /* stack trace */ + unsigned int trace_len; /* stack trace length */ }; /* early logging buffer and current position */ -static struct early_log early_log[200]; -static int crt_early_log; +static struct early_log + early_log[CONFIG_DEBUG_KMEMLEAK_EARLY_LOG_SIZE] __initdata; +static int crt_early_log __initdata; static void kmemleak_disable(void); @@ -257,6 +273,35 @@ static void kmemleak_disable(void); } while (0) /* + * Printing of the objects hex dump to the seq file. The number of lines to be + * printed is limited to HEX_MAX_LINES to prevent seq file spamming. The + * actual number of printed bytes depends on HEX_ROW_SIZE. It must be called + * with the object->lock held. + */ +static void hex_dump_object(struct seq_file *seq, + struct kmemleak_object *object) +{ + const u8 *ptr = (const u8 *)object->pointer; + int i, len, remaining; + unsigned char linebuf[HEX_ROW_SIZE * 5]; + + /* limit the number of lines to HEX_MAX_LINES */ + remaining = len = + min(object->size, (size_t)(HEX_MAX_LINES * HEX_ROW_SIZE)); + + seq_printf(seq, " hex dump (first %d bytes):\n", len); + for (i = 0; i < len; i += HEX_ROW_SIZE) { + int linelen = min(remaining, HEX_ROW_SIZE); + + remaining -= HEX_ROW_SIZE; + hex_dump_to_buffer(ptr + i, linelen, HEX_ROW_SIZE, + HEX_GROUP_SIZE, linebuf, sizeof(linebuf), + HEX_ASCII); + seq_printf(seq, " %s\n", linebuf); + } +} + +/* * Object colors, encoded with count and min_count: * - white - orphan object, not enough references to it (count < min_count) * - gray - not orphan, not marked as false positive (min_count == 0) or @@ -266,23 +311,16 @@ static void kmemleak_disable(void); * Newly created objects don't have any color assigned (object->count == -1) * before the next memory scan when they become white. */ -static int color_white(const struct kmemleak_object *object) +static bool color_white(const struct kmemleak_object *object) { - return object->count != -1 && object->count < object->min_count; + return object->count != KMEMLEAK_BLACK && + object->count < object->min_count; } -static int color_gray(const struct kmemleak_object *object) +static bool color_gray(const struct kmemleak_object *object) { - return object->min_count != -1 && object->count >= object->min_count; -} - -/* - * Objects are considered referenced if their color is gray and they have not - * been deleted. - */ -static int referenced_object(struct kmemleak_object *object) -{ - return (object->flags & OBJECT_ALLOCATED) && color_gray(object); + return object->min_count != KMEMLEAK_BLACK && + object->count >= object->min_count; } /* @@ -290,45 +328,34 @@ static int referenced_object(struct kmemleak_object *object) * not be deleted and have a minimum age to avoid false positives caused by * pointers temporarily stored in CPU registers. */ -static int unreferenced_object(struct kmemleak_object *object) +static bool unreferenced_object(struct kmemleak_object *object) { - return (object->flags & OBJECT_ALLOCATED) && color_white(object) && - time_is_before_eq_jiffies(object->jiffies + jiffies_min_age); + return (color_white(object) && object->flags & OBJECT_ALLOCATED) && + time_before_eq(object->jiffies + jiffies_min_age, + jiffies_last_scan); } /* - * Printing of the (un)referenced objects information, either to the seq file - * or to the kernel log. The print_referenced/print_unreferenced functions - * must be called with the object->lock held. + * Printing of the unreferenced objects information to the seq file. The + * print_unreferenced function must be called with the object->lock held. */ -#define print_helper(seq, x...) do { \ - struct seq_file *s = (seq); \ - if (s) \ - seq_printf(s, x); \ - else \ - pr_info(x); \ -} while (0) - -static void print_referenced(struct kmemleak_object *object) -{ - pr_info("kmemleak: referenced object 0x%08lx (size %zu)\n", - object->pointer, object->size); -} - static void print_unreferenced(struct seq_file *seq, struct kmemleak_object *object) { int i; + unsigned int msecs_age = jiffies_to_msecs(jiffies - object->jiffies); - print_helper(seq, "kmemleak: unreferenced object 0x%08lx (size %zu):\n", - object->pointer, object->size); - print_helper(seq, " comm \"%s\", pid %d, jiffies %lu\n", - object->comm, object->pid, object->jiffies); - print_helper(seq, " backtrace:\n"); + seq_printf(seq, "unreferenced object 0x%08lx (size %zu):\n", + object->pointer, object->size); + seq_printf(seq, " comm \"%s\", pid %d, jiffies %lu (age %d.%03ds)\n", + object->comm, object->pid, object->jiffies, + msecs_age / 1000, msecs_age % 1000); + hex_dump_object(seq, object); + seq_printf(seq, " backtrace:\n"); for (i = 0; i < object->trace_len; i++) { void *ptr = (void *)object->trace[i]; - print_helper(seq, " [<%p>] %pS\n", ptr, ptr); + seq_printf(seq, " [<%p>] %pS\n", ptr, ptr); } } @@ -344,12 +371,14 @@ static void dump_object_info(struct kmemleak_object *object) trace.nr_entries = object->trace_len; trace.entries = object->trace; - pr_notice("kmemleak: Object 0x%08lx (size %zu):\n", + pr_notice("Object 0x%08lx (size %zu):\n", object->tree_node.start, object->size); pr_notice(" comm \"%s\", pid %d, jiffies %lu\n", object->comm, object->pid, object->jiffies); pr_notice(" min_count = %d\n", object->min_count); pr_notice(" count = %d\n", object->count); + pr_notice(" flags = 0x%lx\n", object->flags); + pr_notice(" checksum = %d\n", object->checksum); pr_notice(" backtrace:\n"); print_stack_trace(&trace, 4); } @@ -372,7 +401,9 @@ static struct kmemleak_object *lookup_object(unsigned long ptr, int alias) object = prio_tree_entry(node, struct kmemleak_object, tree_node); if (!alias && object->pointer != ptr) { - kmemleak_warn("kmemleak: Found object by alias"); + pr_warning("Found object by alias at 0x%08lx\n", ptr); + dump_stack(); + dump_object_info(object); object = NULL; } } else @@ -454,22 +485,36 @@ static struct kmemleak_object *find_and_get_object(unsigned long ptr, int alias) } /* + * Save stack trace to the given array of MAX_TRACE size. + */ +static int __save_stack_trace(unsigned long *trace) +{ + struct stack_trace stack_trace; + + stack_trace.max_entries = MAX_TRACE; + stack_trace.nr_entries = 0; + stack_trace.entries = trace; + stack_trace.skip = 2; + save_stack_trace(&stack_trace); + + return stack_trace.nr_entries; +} + +/* * Create the metadata (struct kmemleak_object) corresponding to an allocated * memory block and add it to the object_list and object_tree_root. */ -static void create_object(unsigned long ptr, size_t size, int min_count, - gfp_t gfp) +static struct kmemleak_object *create_object(unsigned long ptr, size_t size, + int min_count, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; struct prio_tree_node *node; - struct stack_trace trace; object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); if (!object) { - kmemleak_stop("kmemleak: Cannot allocate a kmemleak_object " - "structure\n"); - return; + kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + return NULL; } INIT_LIST_HEAD(&object->object_list); @@ -481,8 +526,9 @@ static void create_object(unsigned long ptr, size_t size, int min_count, object->pointer = ptr; object->size = size; object->min_count = min_count; - object->count = -1; /* no color initially */ + object->count = 0; /* white color initially */ object->jiffies = jiffies; + object->checksum = 0; /* task information */ if (in_irq()) { @@ -503,18 +549,14 @@ static void create_object(unsigned long ptr, size_t size, int min_count, } /* kernel backtrace */ - trace.max_entries = MAX_TRACE; - trace.nr_entries = 0; - trace.entries = object->trace; - trace.skip = 1; - save_stack_trace(&trace); - object->trace_len = trace.nr_entries; + object->trace_len = __save_stack_trace(object->trace); INIT_PRIO_TREE_NODE(&object->tree_node); object->tree_node.start = ptr; object->tree_node.last = ptr + size - 1; write_lock_irqsave(&kmemleak_lock, flags); + min_addr = min(min_addr, ptr); max_addr = max(max_addr, ptr + size); node = prio_tree_insert(&object_tree_root, &object->tree_node); @@ -525,138 +567,190 @@ static void create_object(unsigned long ptr, size_t size, int min_count, * random memory blocks. */ if (node != &object->tree_node) { - unsigned long flags; - - kmemleak_stop("kmemleak: Cannot insert 0x%lx into the object " - "search tree (already existing)\n", ptr); + kmemleak_stop("Cannot insert 0x%lx into the object search tree " + "(already existing)\n", ptr); object = lookup_object(ptr, 1); - spin_lock_irqsave(&object->lock, flags); + spin_lock(&object->lock); dump_object_info(object); - spin_unlock_irqrestore(&object->lock, flags); + spin_unlock(&object->lock); goto out; } list_add_tail_rcu(&object->object_list, &object_list); out: write_unlock_irqrestore(&kmemleak_lock, flags); + return object; } /* * Remove the metadata (struct kmemleak_object) for a memory block from the * object_list and object_tree_root and decrement its use_count. */ -static void delete_object(unsigned long ptr) +static void __delete_object(struct kmemleak_object *object) { unsigned long flags; - struct kmemleak_object *object; write_lock_irqsave(&kmemleak_lock, flags); - object = lookup_object(ptr, 0); - if (!object) { - kmemleak_warn("kmemleak: Freeing unknown object at 0x%08lx\n", - ptr); - write_unlock_irqrestore(&kmemleak_lock, flags); - return; - } prio_tree_remove(&object_tree_root, &object->tree_node); list_del_rcu(&object->object_list); write_unlock_irqrestore(&kmemleak_lock, flags); WARN_ON(!(object->flags & OBJECT_ALLOCATED)); - WARN_ON(atomic_read(&object->use_count) < 1); + WARN_ON(atomic_read(&object->use_count) < 2); /* * Locking here also ensures that the corresponding memory block * cannot be freed when it is being scanned. */ spin_lock_irqsave(&object->lock, flags); - if (object->flags & OBJECT_REPORTED) - print_referenced(object); object->flags &= ~OBJECT_ALLOCATED; spin_unlock_irqrestore(&object->lock, flags); put_object(object); } /* - * Make a object permanently as gray-colored so that it can no longer be - * reported as a leak. This is used in general to mark a false positive. + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. */ -static void make_gray_object(unsigned long ptr) +static void delete_object_full(unsigned long ptr) { - unsigned long flags; struct kmemleak_object *object; object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Graying unknown object at 0x%08lx\n", +#ifdef DEBUG + kmemleak_warn("Freeing unknown object at 0x%08lx\n", ptr); +#endif return; } - - spin_lock_irqsave(&object->lock, flags); - object->min_count = 0; - spin_unlock_irqrestore(&object->lock, flags); + __delete_object(object); put_object(object); } /* - * Mark the object as black-colored so that it is ignored from scans and - * reporting. + * Look up the metadata (struct kmemleak_object) corresponding to ptr and + * delete it. If the memory block is partially freed, the function may create + * additional metadata for the remaining parts of the block. */ -static void make_black_object(unsigned long ptr) +static void delete_object_part(unsigned long ptr, size_t size) { - unsigned long flags; struct kmemleak_object *object; + unsigned long start, end; - object = find_and_get_object(ptr, 0); + object = find_and_get_object(ptr, 1); if (!object) { - kmemleak_warn("kmemleak: Blacking unknown object at 0x%08lx\n", - ptr); +#ifdef DEBUG + kmemleak_warn("Partially freeing unknown object at 0x%08lx " + "(size %zu)\n", ptr, size); +#endif return; } + __delete_object(object); + + /* + * Create one or two objects that may result from the memory block + * split. Note that partial freeing is only done by free_bootmem() and + * this happens before kmemleak_init() is called. The path below is + * only executed during early log recording in kmemleak_init(), so + * GFP_KERNEL is enough. + */ + start = object->pointer; + end = object->pointer + object->size; + if (ptr > start) + create_object(start, ptr - start, object->min_count, + GFP_KERNEL); + if (ptr + size < end) + create_object(ptr + size, end - ptr - size, object->min_count, + GFP_KERNEL); + + put_object(object); +} + +static void __paint_it(struct kmemleak_object *object, int color) +{ + object->min_count = color; + if (color == KMEMLEAK_BLACK) + object->flags |= OBJECT_NO_SCAN; +} + +static void paint_it(struct kmemleak_object *object, int color) +{ + unsigned long flags; spin_lock_irqsave(&object->lock, flags); - object->min_count = -1; + __paint_it(object, color); spin_unlock_irqrestore(&object->lock, flags); +} + +static void paint_ptr(unsigned long ptr, int color) +{ + struct kmemleak_object *object; + + object = find_and_get_object(ptr, 0); + if (!object) { + kmemleak_warn("Trying to color unknown object " + "at 0x%08lx as %s\n", ptr, + (color == KMEMLEAK_GREY) ? "Grey" : + (color == KMEMLEAK_BLACK) ? "Black" : "Unknown"); + return; + } + paint_it(object, color); put_object(object); } /* + * Mark an object permanently as gray-colored so that it can no longer be + * reported as a leak. This is used in general to mark a false positive. + */ +static void make_gray_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_GREY); +} + +/* + * Mark the object as black-colored so that it is ignored from scans and + * reporting. + */ +static void make_black_object(unsigned long ptr) +{ + paint_ptr(ptr, KMEMLEAK_BLACK); +} + +/* * Add a scanning area to the object. If at least one such area is added, * kmemleak will only scan these ranges rather than the whole memory block. */ -static void add_scan_area(unsigned long ptr, unsigned long offset, - size_t length, gfp_t gfp) +static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) { unsigned long flags; struct kmemleak_object *object; struct kmemleak_scan_area *area; - object = find_and_get_object(ptr, 0); + object = find_and_get_object(ptr, 1); if (!object) { - kmemleak_warn("kmemleak: Adding scan area to unknown " - "object at 0x%08lx\n", ptr); + kmemleak_warn("Adding scan area to unknown object at 0x%08lx\n", + ptr); return; } area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); if (!area) { - kmemleak_warn("kmemleak: Cannot allocate a scan area\n"); + kmemleak_warn("Cannot allocate a scan area\n"); goto out; } spin_lock_irqsave(&object->lock, flags); - if (offset + length > object->size) { - kmemleak_warn("kmemleak: Scan area larger than object " - "0x%08lx\n", ptr); + if (ptr + size > object->pointer + object->size) { + kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); goto out_unlock; } INIT_HLIST_NODE(&area->node); - area->offset = offset; - area->length = length; + area->start = ptr; + area->size = size; hlist_add_head(&area->node, &object->area_list); out_unlock: @@ -677,8 +771,7 @@ static void object_no_scan(unsigned long ptr) object = find_and_get_object(ptr, 0); if (!object) { - kmemleak_warn("kmemleak: Not scanning unknown object at " - "0x%08lx\n", ptr); + kmemleak_warn("Not scanning unknown object at 0x%08lx\n", ptr); return; } @@ -692,14 +785,16 @@ static void object_no_scan(unsigned long ptr) * Log an early kmemleak_* call to the early_log buffer. These calls will be * processed later once kmemleak is fully initialized. */ -static void log_early(int op_type, const void *ptr, size_t size, - int min_count, unsigned long offset, size_t length) +static void __init log_early(int op_type, const void *ptr, size_t size, + int min_count) { unsigned long flags; struct early_log *log; if (crt_early_log >= ARRAY_SIZE(early_log)) { - kmemleak_stop("kmemleak: Early log buffer exceeded\n"); + pr_warning("Early log buffer exceeded, " + "please increase DEBUG_KMEMLEAK_EARLY_LOG_SIZE\n"); + kmemleak_disable(); return; } @@ -713,116 +808,197 @@ static void log_early(int op_type, const void *ptr, size_t size, log->ptr = ptr; log->size = size; log->min_count = min_count; - log->offset = offset; - log->length = length; + if (op_type == KMEMLEAK_ALLOC) + log->trace_len = __save_stack_trace(log->trace); crt_early_log++; local_irq_restore(flags); } /* - * Memory allocation function callback. This function is called from the - * kernel allocators when a new block is allocated (kmem_cache_alloc, kmalloc, - * vmalloc etc.). + * Log an early allocated block and populate the stack trace. + */ +static void early_alloc(struct early_log *log) +{ + struct kmemleak_object *object; + unsigned long flags; + int i; + + if (!atomic_read(&kmemleak_enabled) || !log->ptr || IS_ERR(log->ptr)) + return; + + /* + * RCU locking needed to ensure object is not freed via put_object(). + */ + rcu_read_lock(); + object = create_object((unsigned long)log->ptr, log->size, + log->min_count, GFP_ATOMIC); + if (!object) + goto out; + spin_lock_irqsave(&object->lock, flags); + for (i = 0; i < log->trace_len; i++) + object->trace[i] = log->trace[i]; + object->trace_len = log->trace_len; + spin_unlock_irqrestore(&object->lock, flags); +out: + rcu_read_unlock(); +} + +/** + * kmemleak_alloc - register a newly allocated object + * @ptr: pointer to beginning of the object + * @size: size of the object + * @min_count: minimum number of references to this object. If during memory + * scanning a number of references less than @min_count is found, + * the object is reported as a memory leak. If @min_count is 0, + * the object is never reported as a leak. If @min_count is -1, + * the object is ignored (not scanned and not reported as a leak) + * @gfp: kmalloc() flags used for kmemleak internal memory allocations + * + * This function is called from the kernel allocators when a new object + * (memory block) is allocated (kmem_cache_alloc, kmalloc, vmalloc etc.). */ -void kmemleak_alloc(const void *ptr, size_t size, int min_count, gfp_t gfp) +void __ref kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) { pr_debug("%s(0x%p, %zu, %d)\n", __func__, ptr, size, min_count); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) create_object((unsigned long)ptr, size, min_count, gfp); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_ALLOC, ptr, size, min_count, 0, 0); + log_early(KMEMLEAK_ALLOC, ptr, size, min_count); } EXPORT_SYMBOL_GPL(kmemleak_alloc); -/* - * Memory freeing function callback. This function is called from the kernel - * allocators when a block is freed (kmem_cache_free, kfree, vfree etc.). +/** + * kmemleak_free - unregister a previously registered object + * @ptr: pointer to beginning of the object + * + * This function is called from the kernel allocators when an object (memory + * block) is freed (kmem_cache_free, kfree, vfree etc.). */ -void kmemleak_free(const void *ptr) +void __ref kmemleak_free(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - delete_object((unsigned long)ptr); + delete_object_full((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_FREE, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_FREE, ptr, 0, 0); } EXPORT_SYMBOL_GPL(kmemleak_free); -/* - * Mark an already allocated memory block as a false positive. This will cause - * the block to no longer be reported as leak and always be scanned. +/** + * kmemleak_free_part - partially unregister a previously registered object + * @ptr: pointer to the beginning or inside the object. This also + * represents the start of the range to be freed + * @size: size to be unregistered + * + * This function is called when only a part of a memory block is freed + * (usually from the bootmem allocator). + */ +void __ref kmemleak_free_part(const void *ptr, size_t size) +{ + pr_debug("%s(0x%p)\n", __func__, ptr); + + if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) + delete_object_part((unsigned long)ptr, size); + else if (atomic_read(&kmemleak_early_log)) + log_early(KMEMLEAK_FREE_PART, ptr, size, 0); +} +EXPORT_SYMBOL_GPL(kmemleak_free_part); + +/** + * kmemleak_not_leak - mark an allocated object as false positive + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to no longer + * be reported as leak and always be scanned. */ -void kmemleak_not_leak(const void *ptr) +void __ref kmemleak_not_leak(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) make_gray_object((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_NOT_LEAK, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_not_leak); -/* - * Ignore a memory block. This is usually done when it is known that the - * corresponding block is not a leak and does not contain any references to - * other allocated memory blocks. +/** + * kmemleak_ignore - ignore an allocated object + * @ptr: pointer to beginning of the object + * + * Calling this function on an object will cause the memory block to be + * ignored (not scanned and not reported as a leak). This is usually done when + * it is known that the corresponding block is not a leak and does not contain + * any references to other allocated memory blocks. */ -void kmemleak_ignore(const void *ptr) +void __ref kmemleak_ignore(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) make_black_object((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_IGNORE, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_IGNORE, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_ignore); -/* - * Limit the range to be scanned in an allocated memory block. +/** + * kmemleak_scan_area - limit the range to be scanned in an allocated object + * @ptr: pointer to beginning or inside the object. This also + * represents the start of the scan area + * @size: size of the scan area + * @gfp: kmalloc() flags used for kmemleak internal memory allocations + * + * This function is used when it is known that only certain parts of an object + * contain references to other objects. Kmemleak will only scan these areas + * reducing the number false negatives. */ -void kmemleak_scan_area(const void *ptr, unsigned long offset, size_t length, - gfp_t gfp) +void __ref kmemleak_scan_area(const void *ptr, size_t size, gfp_t gfp) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) - add_scan_area((unsigned long)ptr, offset, length, gfp); + add_scan_area((unsigned long)ptr, size, gfp); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_SCAN_AREA, ptr, 0, 0, offset, length); + log_early(KMEMLEAK_SCAN_AREA, ptr, size, 0); } EXPORT_SYMBOL(kmemleak_scan_area); -/* - * Inform kmemleak not to scan the given memory block. +/** + * kmemleak_no_scan - do not scan an allocated object + * @ptr: pointer to beginning of the object + * + * This function notifies kmemleak not to scan the given memory block. Useful + * in situations where it is known that the given object does not contain any + * references to other objects. Kmemleak will not scan such objects reducing + * the number of false negatives. */ -void kmemleak_no_scan(const void *ptr) +void __ref kmemleak_no_scan(const void *ptr) { pr_debug("%s(0x%p)\n", __func__, ptr); if (atomic_read(&kmemleak_enabled) && ptr && !IS_ERR(ptr)) object_no_scan((unsigned long)ptr); else if (atomic_read(&kmemleak_early_log)) - log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0, 0, 0); + log_early(KMEMLEAK_NO_SCAN, ptr, 0, 0); } EXPORT_SYMBOL(kmemleak_no_scan); /* - * Yield the CPU so that other tasks get a chance to run. The yielding is - * rate-limited to avoid excessive number of calls to the schedule() function - * during memory scanning. + * Update an object's checksum and return true if it was modified. */ -static void scan_yield(void) +static bool update_checksum(struct kmemleak_object *object) { - might_sleep(); + u32 old_csum = object->checksum; - if (time_is_before_eq_jiffies(next_scan_yield)) { - schedule(); - next_scan_yield = jiffies + jiffies_scan_yield; - } + if (!kmemcheck_is_obj_initialized(object->pointer, object->size)) + return false; + + object->checksum = crc32(0, (void *)object->pointer, object->size); + return object->checksum != old_csum; } /* @@ -851,28 +1027,28 @@ static int scan_should_stop(void) * found to the gray list. */ static void scan_block(void *_start, void *_end, - struct kmemleak_object *scanned) + struct kmemleak_object *scanned, int allow_resched) { unsigned long *ptr; unsigned long *start = PTR_ALIGN(_start, BYTES_PER_POINTER); unsigned long *end = _end - (BYTES_PER_POINTER - 1); for (ptr = start; ptr < end; ptr++) { - unsigned long flags; - unsigned long pointer = *ptr; struct kmemleak_object *object; + unsigned long flags; + unsigned long pointer; + if (allow_resched) + cond_resched(); if (scan_should_stop()) break; - /* - * When scanning a memory block with a corresponding - * kmemleak_object, the CPU yielding is handled in the calling - * code since it holds the object->lock to avoid the block - * freeing. - */ - if (!scanned) - scan_yield(); + /* don't scan uninitialized memory */ + if (!kmemcheck_is_obj_initialized((unsigned long)ptr, + BYTES_PER_POINTER)) + continue; + + pointer = *ptr; object = find_and_get_object(pointer, 1); if (!object) @@ -904,11 +1080,14 @@ static void scan_block(void *_start, void *_end, * added to the gray_list. */ object->count++; - if (color_gray(object)) + if (color_gray(object)) { list_add_tail(&object->gray_list, &gray_list); - else - put_object(object); + spin_unlock_irqrestore(&object->lock, flags); + continue; + } + spin_unlock_irqrestore(&object->lock, flags); + put_object(object); } } @@ -923,8 +1102,8 @@ static void scan_object(struct kmemleak_object *object) unsigned long flags; /* - * Once the object->lock is aquired, the corresponding memory block - * cannot be freed (the same lock is aquired in delete_object). + * Once the object->lock is acquired, the corresponding memory block + * cannot be freed (the same lock is acquired in delete_object). */ spin_lock_irqsave(&object->lock, flags); if (object->flags & OBJECT_NO_SCAN) @@ -932,19 +1111,63 @@ static void scan_object(struct kmemleak_object *object) if (!(object->flags & OBJECT_ALLOCATED)) /* already freed object */ goto out; - if (hlist_empty(&object->area_list)) - scan_block((void *)object->pointer, - (void *)(object->pointer + object->size), object); - else + if (hlist_empty(&object->area_list)) { + void *start = (void *)object->pointer; + void *end = (void *)(object->pointer + object->size); + + while (start < end && (object->flags & OBJECT_ALLOCATED) && + !(object->flags & OBJECT_NO_SCAN)) { + scan_block(start, min(start + MAX_SCAN_SIZE, end), + object, 0); + start += MAX_SCAN_SIZE; + + spin_unlock_irqrestore(&object->lock, flags); + cond_resched(); + spin_lock_irqsave(&object->lock, flags); + } + } else hlist_for_each_entry(area, elem, &object->area_list, node) - scan_block((void *)(object->pointer + area->offset), - (void *)(object->pointer + area->offset - + area->length), object); + scan_block((void *)area->start, + (void *)(area->start + area->size), + object, 0); out: spin_unlock_irqrestore(&object->lock, flags); } /* + * Scan the objects already referenced (gray objects). More objects will be + * referenced and, if there are no memory leaks, all the objects are scanned. + */ +static void scan_gray_list(void) +{ + struct kmemleak_object *object, *tmp; + + /* + * The list traversal is safe for both tail additions and removals + * from inside the loop. The kmemleak objects cannot be freed from + * outside the loop because their use_count was incremented. + */ + object = list_entry(gray_list.next, typeof(*object), gray_list); + while (&object->gray_list != &gray_list) { + cond_resched(); + + /* may add new objects to the list */ + if (!scan_should_stop()) + scan_object(object); + + tmp = list_entry(object->gray_list.next, typeof(*object), + gray_list); + + /* remove the object from the list and release it */ + list_del(&object->gray_list); + put_object(object); + + object = tmp; + } + WARN_ON(!list_empty(&gray_list)); +} + +/* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the * scan_mutex held. @@ -952,9 +1175,11 @@ out: static void kmemleak_scan(void) { unsigned long flags; - struct kmemleak_object *object, *tmp; - struct task_struct *task; + struct kmemleak_object *object; int i; + int new_leaks = 0; + + jiffies_last_scan = jiffies; /* prepare the kmemleak_object's */ rcu_read_lock(); @@ -966,7 +1191,7 @@ static void kmemleak_scan(void) * 1 reference to any object at this point. */ if (atomic_read(&object->use_count) > 1) { - pr_debug("kmemleak: object->use_count = %d\n", + pr_debug("object->use_count = %d\n", atomic_read(&object->use_count)); dump_object_info(object); } @@ -981,14 +1206,14 @@ static void kmemleak_scan(void) rcu_read_unlock(); /* data/bss scanning */ - scan_block(_sdata, _edata, NULL); - scan_block(__bss_start, __bss_stop, NULL); + scan_block(_sdata, _edata, NULL, 1); + scan_block(__bss_start, __bss_stop, NULL, 1); #ifdef CONFIG_SMP /* per-cpu sections scanning */ for_each_possible_cpu(i) scan_block(__per_cpu_start + per_cpu_offset(i), - __per_cpu_end + per_cpu_offset(i), NULL); + __per_cpu_end + per_cpu_offset(i), NULL, 1); #endif /* @@ -1010,48 +1235,77 @@ static void kmemleak_scan(void) /* only scan if page is in use */ if (page_count(page) == 0) continue; - scan_block(page, page + 1, NULL); + scan_block(page, page + 1, NULL, 1); } } /* - * Scanning the task stacks may introduce false negatives and it is - * not enabled by default. + * Scanning the task stacks (may introduce false negatives). */ if (kmemleak_stack_scan) { + struct task_struct *p, *g; + read_lock(&tasklist_lock); - for_each_process(task) - scan_block(task_stack_page(task), - task_stack_page(task) + THREAD_SIZE, NULL); + do_each_thread(g, p) { + scan_block(task_stack_page(p), task_stack_page(p) + + THREAD_SIZE, NULL, 0); + } while_each_thread(g, p); read_unlock(&tasklist_lock); } /* * Scan the objects already referenced from the sections scanned - * above. More objects will be referenced and, if there are no memory - * leaks, all the objects will be scanned. The list traversal is safe - * for both tail additions and removals from inside the loop. The - * kmemleak objects cannot be freed from outside the loop because their - * use_count was increased. + * above. */ - object = list_entry(gray_list.next, typeof(*object), gray_list); - while (&object->gray_list != &gray_list) { - scan_yield(); + scan_gray_list(); - /* may add new objects to the list */ - if (!scan_should_stop()) - scan_object(object); + /* + * Check for new or unreferenced objects modified since the previous + * scan and color them gray until the next scan. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if (color_white(object) && (object->flags & OBJECT_ALLOCATED) + && update_checksum(object) && get_object(object)) { + /* color it gray temporarily */ + object->count = object->min_count; + list_add_tail(&object->gray_list, &gray_list); + } + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); - tmp = list_entry(object->gray_list.next, typeof(*object), - gray_list); + /* + * Re-scan the gray list for modified unreferenced objects. + */ + scan_gray_list(); - /* remove the object from the list and release it */ - list_del(&object->gray_list); - put_object(object); + /* + * If scanning was stopped do not report any new unreferenced objects. + */ + if (scan_should_stop()) + return; - object = tmp; + /* + * Scanning result reporting. + */ + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if (unreferenced_object(object) && + !(object->flags & OBJECT_REPORTED)) { + object->flags |= OBJECT_REPORTED; + new_leaks++; + } + spin_unlock_irqrestore(&object->lock, flags); } - WARN_ON(!list_empty(&gray_list)); + rcu_read_unlock(); + + if (new_leaks) + pr_info("%d new suspected memory leaks (see " + "/sys/kernel/debug/kmemleak)\n", new_leaks); + } /* @@ -1062,7 +1316,8 @@ static int kmemleak_scan_thread(void *arg) { static int first_run = 1; - pr_info("kmemleak: Automatic memory scanning thread started\n"); + pr_info("Automatic memory scanning thread started\n"); + set_user_nice(current, 10); /* * Wait before the first scan to allow the system to fully initialize. @@ -1073,66 +1328,42 @@ static int kmemleak_scan_thread(void *arg) } while (!kthread_should_stop()) { - struct kmemleak_object *object; signed long timeout = jiffies_scan_wait; mutex_lock(&scan_mutex); - kmemleak_scan(); - reported_leaks = 0; - - rcu_read_lock(); - list_for_each_entry_rcu(object, &object_list, object_list) { - unsigned long flags; - - if (reported_leaks >= REPORTS_NR) - break; - spin_lock_irqsave(&object->lock, flags); - if (!(object->flags & OBJECT_REPORTED) && - unreferenced_object(object)) { - print_unreferenced(NULL, object); - object->flags |= OBJECT_REPORTED; - reported_leaks++; - } else if ((object->flags & OBJECT_REPORTED) && - referenced_object(object)) { - print_referenced(object); - object->flags &= ~OBJECT_REPORTED; - } - spin_unlock_irqrestore(&object->lock, flags); - } - rcu_read_unlock(); - mutex_unlock(&scan_mutex); + /* wait before the next scan */ while (timeout && !kthread_should_stop()) timeout = schedule_timeout_interruptible(timeout); } - pr_info("kmemleak: Automatic memory scanning thread ended\n"); + pr_info("Automatic memory scanning thread ended\n"); return 0; } /* * Start the automatic memory scanning thread. This function must be called - * with the kmemleak_mutex held. + * with the scan_mutex held. */ -void start_scan_thread(void) +static void start_scan_thread(void) { if (scan_thread) return; scan_thread = kthread_run(kmemleak_scan_thread, NULL, "kmemleak"); if (IS_ERR(scan_thread)) { - pr_warning("kmemleak: Failed to create the scan thread\n"); + pr_warning("Failed to create the scan thread\n"); scan_thread = NULL; } } /* * Stop the automatic memory scanning thread. This function must be called - * with the kmemleak_mutex held. + * with the scan_mutex held. */ -void stop_scan_thread(void) +static void stop_scan_thread(void) { if (scan_thread) { kthread_stop(scan_thread); @@ -1149,13 +1380,11 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) { struct kmemleak_object *object; loff_t n = *pos; + int err; - if (!n) { - kmemleak_scan(); - reported_leaks = 0; - } - if (reported_leaks >= REPORTS_NR) - return NULL; + err = mutex_lock_interruptible(&scan_mutex); + if (err < 0) + return ERR_PTR(err); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) { @@ -1166,7 +1395,6 @@ static void *kmemleak_seq_start(struct seq_file *seq, loff_t *pos) } object = NULL; out: - rcu_read_unlock(); return object; } @@ -1181,17 +1409,13 @@ static void *kmemleak_seq_next(struct seq_file *seq, void *v, loff_t *pos) struct list_head *n = &prev_obj->object_list; ++(*pos); - if (reported_leaks >= REPORTS_NR) - goto out; - rcu_read_lock(); list_for_each_continue_rcu(n, &object_list) { next_obj = list_entry(n, struct kmemleak_object, object_list); if (get_object(next_obj)) break; } - rcu_read_unlock(); -out: + put_object(prev_obj); return next_obj; } @@ -1201,8 +1425,16 @@ out: */ static void kmemleak_seq_stop(struct seq_file *seq, void *v) { - if (v) - put_object(v); + if (!IS_ERR(v)) { + /* + * kmemleak_seq_start may return ERR_PTR if the scan_mutex + * waiting was interrupted, so only release it if !IS_ERR. + */ + rcu_read_unlock(); + mutex_unlock(&scan_mutex); + if (v) + put_object(v); + } } /* @@ -1214,11 +1446,8 @@ static int kmemleak_seq_show(struct seq_file *seq, void *v) unsigned long flags; spin_lock_irqsave(&object->lock, flags); - if (!unreferenced_object(object)) - goto out; - print_unreferenced(seq, object); - reported_leaks++; -out: + if ((object->flags & OBJECT_REPORTED) && unreferenced_object(object)) + print_unreferenced(seq, object); spin_unlock_irqrestore(&object->lock, flags); return 0; } @@ -1232,43 +1461,58 @@ static const struct seq_operations kmemleak_seq_ops = { static int kmemleak_open(struct inode *inode, struct file *file) { - int ret = 0; - if (!atomic_read(&kmemleak_enabled)) return -EBUSY; - ret = mutex_lock_interruptible(&kmemleak_mutex); - if (ret < 0) - goto out; - if (file->f_mode & FMODE_READ) { - ret = mutex_lock_interruptible(&scan_mutex); - if (ret < 0) - goto kmemleak_unlock; - ret = seq_open(file, &kmemleak_seq_ops); - if (ret < 0) - goto scan_unlock; - } - return ret; - -scan_unlock: - mutex_unlock(&scan_mutex); -kmemleak_unlock: - mutex_unlock(&kmemleak_mutex); -out: - return ret; + return seq_open(file, &kmemleak_seq_ops); } static int kmemleak_release(struct inode *inode, struct file *file) { - int ret = 0; + return seq_release(inode, file); +} - if (file->f_mode & FMODE_READ) { - seq_release(inode, file); - mutex_unlock(&scan_mutex); +static int dump_str_object_info(const char *str) +{ + unsigned long flags; + struct kmemleak_object *object; + unsigned long addr; + + addr= simple_strtoul(str, NULL, 0); + object = find_and_get_object(addr, 0); + if (!object) { + pr_info("Unknown object at 0x%08lx\n", addr); + return -EINVAL; } - mutex_unlock(&kmemleak_mutex); - return ret; + spin_lock_irqsave(&object->lock, flags); + dump_object_info(object); + spin_unlock_irqrestore(&object->lock, flags); + + put_object(object); + return 0; +} + +/* + * We use grey instead of black to ensure we can do future scans on the same + * objects. If we did not do future scans these black objects could + * potentially contain references to newly allocated objects in the future and + * we'd end up with false positives. + */ +static void kmemleak_clear(void) +{ + struct kmemleak_object *object; + unsigned long flags; + + rcu_read_lock(); + list_for_each_entry_rcu(object, &object_list, object_list) { + spin_lock_irqsave(&object->lock, flags); + if ((object->flags & OBJECT_REPORTED) && + unreferenced_object(object)) + __paint_it(object, KMEMLEAK_GREY); + spin_unlock_irqrestore(&object->lock, flags); + } + rcu_read_unlock(); } /* @@ -1281,21 +1525,27 @@ static int kmemleak_release(struct inode *inode, struct file *file) * scan=off - stop the automatic memory scanning thread * scan=... - set the automatic memory scanning period in seconds (0 to * disable it) + * scan - trigger a memory scan + * clear - mark all current reported unreferenced kmemleak objects as + * grey to ignore printing them + * dump=... - dump information about the object found at the given address */ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, size_t size, loff_t *ppos) { char buf[64]; int buf_size; - - if (!atomic_read(&kmemleak_enabled)) - return -EBUSY; + int ret; buf_size = min(size, (sizeof(buf) - 1)); if (strncpy_from_user(buf, user_buf, buf_size) < 0) return -EFAULT; buf[buf_size] = 0; + ret = mutex_lock_interruptible(&scan_mutex); + if (ret < 0) + return ret; + if (strncmp(buf, "off", 3) == 0) kmemleak_disable(); else if (strncmp(buf, "stack=on", 8) == 0) @@ -1308,18 +1558,28 @@ static ssize_t kmemleak_write(struct file *file, const char __user *user_buf, stop_scan_thread(); else if (strncmp(buf, "scan=", 5) == 0) { unsigned long secs; - int err; - err = strict_strtoul(buf + 5, 0, &secs); - if (err < 0) - return err; + ret = strict_strtoul(buf + 5, 0, &secs); + if (ret < 0) + goto out; stop_scan_thread(); if (secs) { jiffies_scan_wait = msecs_to_jiffies(secs * 1000); start_scan_thread(); } - } else - return -EINVAL; + } else if (strncmp(buf, "scan", 4) == 0) + kmemleak_scan(); + else if (strncmp(buf, "clear", 5) == 0) + kmemleak_clear(); + else if (strncmp(buf, "dump=", 5) == 0) + ret = dump_str_object_info(buf + 5); + else + ret = -EINVAL; + +out: + mutex_unlock(&scan_mutex); + if (ret < 0) + return ret; /* ignore the rest of the buffer, only one command at a time */ *ppos += size; @@ -1339,36 +1599,21 @@ static const struct file_operations kmemleak_fops = { * Perform the freeing of the kmemleak internal objects after waiting for any * current memory scan to complete. */ -static int kmemleak_cleanup_thread(void *arg) +static void kmemleak_do_cleanup(struct work_struct *work) { struct kmemleak_object *object; - mutex_lock(&kmemleak_mutex); + mutex_lock(&scan_mutex); stop_scan_thread(); - mutex_unlock(&kmemleak_mutex); - mutex_lock(&scan_mutex); rcu_read_lock(); list_for_each_entry_rcu(object, &object_list, object_list) - delete_object(object->pointer); + delete_object_full(object->pointer); rcu_read_unlock(); mutex_unlock(&scan_mutex); - - return 0; } -/* - * Start the clean-up thread. - */ -static void kmemleak_cleanup(void) -{ - struct task_struct *cleanup_thread; - - cleanup_thread = kthread_run(kmemleak_cleanup_thread, NULL, - "kmemleak-clean"); - if (IS_ERR(cleanup_thread)) - pr_warning("kmemleak: Failed to create the clean-up thread\n"); -} +static DECLARE_WORK(cleanup_work, kmemleak_do_cleanup); /* * Disable kmemleak. No memory allocation/freeing will be traced once this @@ -1386,7 +1631,7 @@ static void kmemleak_disable(void) /* check whether it is too early for a kernel thread */ if (atomic_read(&kmemleak_initialized)) - kmemleak_cleanup(); + schedule_work(&cleanup_work); pr_info("Kernel memory leak detector disabled\n"); } @@ -1400,7 +1645,9 @@ static int kmemleak_boot_config(char *str) return -EINVAL; if (strcmp(str, "off") == 0) kmemleak_disable(); - else if (strcmp(str, "on") != 0) + else if (strcmp(str, "on") == 0) + kmemleak_skip_disable = 1; + else return -EINVAL; return 0; } @@ -1414,7 +1661,13 @@ void __init kmemleak_init(void) int i; unsigned long flags; - jiffies_scan_yield = msecs_to_jiffies(MSECS_SCAN_YIELD); +#ifdef CONFIG_DEBUG_KMEMLEAK_DEFAULT_OFF + if (!kmemleak_skip_disable) { + kmemleak_disable(); + return; + } +#endif + jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); @@ -1440,12 +1693,14 @@ void __init kmemleak_init(void) switch (log->op_type) { case KMEMLEAK_ALLOC: - kmemleak_alloc(log->ptr, log->size, log->min_count, - GFP_KERNEL); + early_alloc(log); break; case KMEMLEAK_FREE: kmemleak_free(log->ptr); break; + case KMEMLEAK_FREE_PART: + kmemleak_free_part(log->ptr, log->size); + break; case KMEMLEAK_NOT_LEAK: kmemleak_not_leak(log->ptr); break; @@ -1453,8 +1708,7 @@ void __init kmemleak_init(void) kmemleak_ignore(log->ptr); break; case KMEMLEAK_SCAN_AREA: - kmemleak_scan_area(log->ptr, log->offset, log->length, - GFP_KERNEL); + kmemleak_scan_area(log->ptr, log->size, GFP_KERNEL); break; case KMEMLEAK_NO_SCAN: kmemleak_no_scan(log->ptr); @@ -1481,18 +1735,17 @@ static int __init kmemleak_late_init(void) * after setting kmemleak_initialized and we may end up with * two clean-up threads but serialized by scan_mutex. */ - kmemleak_cleanup(); + schedule_work(&cleanup_work); return -ENOMEM; } dentry = debugfs_create_file("kmemleak", S_IRUGO, NULL, NULL, &kmemleak_fops); if (!dentry) - pr_warning("kmemleak: Failed to create the debugfs kmemleak " - "file\n"); - mutex_lock(&kmemleak_mutex); + pr_warning("Failed to create the debugfs kmemleak file\n"); + mutex_lock(&scan_mutex); start_scan_thread(); - mutex_unlock(&kmemleak_mutex); + mutex_unlock(&scan_mutex); pr_info("Kernel memory leak detector initialized\n"); diff --git a/mm/ksm.c b/mm/ksm.c new file mode 100644 index 0000000..43bc893 --- /dev/null +++ b/mm/ksm.c @@ -0,0 +1,1964 @@ +/* + * Memory merging support. + * + * This code enables dynamic sharing of identical pages found in different + * memory areas, even if they are not shared by fork() + * + * Copyright (C) 2008-2009 Red Hat, Inc. + * Authors: + * Izik Eidus + * Andrea Arcangeli + * Chris Wright + * Hugh Dickins + * + * This work is licensed under the terms of the GNU GPL, version 2. + */ + +#include <linux/errno.h> +#include <linux/mm.h> +#include <linux/fs.h> +#include <linux/mman.h> +#include <linux/sched.h> +#include <linux/rwsem.h> +#include <linux/pagemap.h> +#include <linux/rmap.h> +#include <linux/spinlock.h> +#include <linux/jhash.h> +#include <linux/delay.h> +#include <linux/kthread.h> +#include <linux/wait.h> +#include <linux/slab.h> +#include <linux/rbtree.h> +#include <linux/memory.h> +#include <linux/mmu_notifier.h> +#include <linux/swap.h> +#include <linux/ksm.h> +#include <linux/hash.h> + +#include <asm/tlbflush.h> +#include "internal.h" + +/* + * A few notes about the KSM scanning process, + * to make it easier to understand the data structures below: + * + * In order to reduce excessive scanning, KSM sorts the memory pages by their + * contents into a data structure that holds pointers to the pages' locations. + * + * Since the contents of the pages may change at any moment, KSM cannot just + * insert the pages into a normal sorted tree and expect it to find anything. + * Therefore KSM uses two data structures - the stable and the unstable tree. + * + * The stable tree holds pointers to all the merged pages (ksm pages), sorted + * by their contents. Because each such page is write-protected, searching on + * this tree is fully assured to be working (except when pages are unmapped), + * and therefore this tree is called the stable tree. + * + * In addition to the stable tree, KSM uses a second data structure called the + * unstable tree: this tree holds pointers to pages which have been found to + * be "unchanged for a period of time". The unstable tree sorts these pages + * by their contents, but since they are not write-protected, KSM cannot rely + * upon the unstable tree to work correctly - the unstable tree is liable to + * be corrupted as its contents are modified, and so it is called unstable. + * + * KSM solves this problem by several techniques: + * + * 1) The unstable tree is flushed every time KSM completes scanning all + * memory areas, and then the tree is rebuilt again from the beginning. + * 2) KSM will only insert into the unstable tree, pages whose hash value + * has not changed since the previous scan of all memory areas. + * 3) The unstable tree is a RedBlack Tree - so its balancing is based on the + * colors of the nodes and not on their contents, assuring that even when + * the tree gets "corrupted" it won't get out of balance, so scanning time + * remains the same (also, searching and inserting nodes in an rbtree uses + * the same algorithm, so we have no overhead when we flush and rebuild). + * 4) KSM never flushes the stable tree, which means that even if it were to + * take 10 attempts to find a page in the unstable tree, once it is found, + * it is secured in the stable tree. (When we scan a new page, we first + * compare it against the stable tree, and then against the unstable tree.) + */ + +/** + * struct mm_slot - ksm information per mm that is being scanned + * @link: link to the mm_slots hash list + * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * @rmap_list: head for this mm_slot's singly-linked list of rmap_items + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node link; + struct list_head mm_list; + struct rmap_item *rmap_list; + struct mm_struct *mm; +}; + +/** + * struct ksm_scan - cursor for scanning + * @mm_slot: the current mm_slot we are scanning + * @address: the next address inside that to be scanned + * @rmap_list: link to the next rmap to be scanned in the rmap_list + * @seqnr: count of completed full scans (needed when removing unstable node) + * + * There is only the one ksm_scan instance of this cursor structure. + */ +struct ksm_scan { + struct mm_slot *mm_slot; + unsigned long address; + struct rmap_item **rmap_list; + unsigned long seqnr; +}; + +/** + * struct stable_node - node of the stable rbtree + * @node: rb node of this ksm page in the stable tree + * @hlist: hlist head of rmap_items using this ksm page + * @kpfn: page frame number of this ksm page + */ +struct stable_node { + struct rb_node node; + struct hlist_head hlist; + unsigned long kpfn; +}; + +/** + * struct rmap_item - reverse mapping item for virtual addresses + * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list + * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree + * @mm: the memory structure this rmap_item is pointing into + * @address: the virtual address this rmap_item tracks (+ flags in low bits) + * @oldchecksum: previous checksum of the page at that virtual address + * @node: rb node of this rmap_item in the unstable tree + * @head: pointer to stable_node heading this list in the stable tree + * @hlist: link into hlist of rmap_items hanging off that stable_node + */ +struct rmap_item { + struct rmap_item *rmap_list; + struct anon_vma *anon_vma; /* when stable */ + struct mm_struct *mm; + unsigned long address; /* + low bits used for flags below */ + unsigned int oldchecksum; /* when unstable */ + union { + struct rb_node node; /* when node of unstable tree */ + struct { /* when listed from stable tree */ + struct stable_node *head; + struct hlist_node hlist; + }; + }; +}; + +#define SEQNR_MASK 0x0ff /* low bits of unstable tree seqnr */ +#define UNSTABLE_FLAG 0x100 /* is a node of the unstable tree */ +#define STABLE_FLAG 0x200 /* is listed from the stable tree */ + +/* The stable and unstable tree heads */ +static struct rb_root root_stable_tree = RB_ROOT; +static struct rb_root root_unstable_tree = RB_ROOT; + +#define MM_SLOTS_HASH_SHIFT 10 +#define MM_SLOTS_HASH_HEADS (1 << MM_SLOTS_HASH_SHIFT) +static struct hlist_head mm_slots_hash[MM_SLOTS_HASH_HEADS]; + +static struct mm_slot ksm_mm_head = { + .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), +}; +static struct ksm_scan ksm_scan = { + .mm_slot = &ksm_mm_head, +}; + +static struct kmem_cache *rmap_item_cache; +static struct kmem_cache *stable_node_cache; +static struct kmem_cache *mm_slot_cache; + +/* The number of nodes in the stable tree */ +static unsigned long ksm_pages_shared; + +/* The number of page slots additionally sharing those nodes */ +static unsigned long ksm_pages_sharing; + +/* The number of nodes in the unstable tree */ +static unsigned long ksm_pages_unshared; + +/* The number of rmap_items in use: to calculate pages_volatile */ +static unsigned long ksm_rmap_items; + +/* Number of pages ksmd should scan in one batch */ +static unsigned int ksm_thread_pages_to_scan = 100; + +/* Milliseconds ksmd should sleep between batches */ +static unsigned int ksm_thread_sleep_millisecs = 20; + +#define KSM_RUN_STOP 0 +#define KSM_RUN_MERGE 1 +#define KSM_RUN_UNMERGE 2 +static unsigned int ksm_run = KSM_RUN_STOP; + +static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); +static DEFINE_MUTEX(ksm_thread_mutex); +static DEFINE_SPINLOCK(ksm_mmlist_lock); + +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ + sizeof(struct __struct), __alignof__(struct __struct),\ + (__flags), NULL) + +static int __init ksm_slab_init(void) +{ + rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); + if (!rmap_item_cache) + goto out; + + stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); + if (!stable_node_cache) + goto out_free1; + + mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); + if (!mm_slot_cache) + goto out_free2; + + return 0; + +out_free2: + kmem_cache_destroy(stable_node_cache); +out_free1: + kmem_cache_destroy(rmap_item_cache); +out: + return -ENOMEM; +} + +static void __init ksm_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + kmem_cache_destroy(stable_node_cache); + kmem_cache_destroy(rmap_item_cache); + mm_slot_cache = NULL; +} + +static inline struct rmap_item *alloc_rmap_item(void) +{ + struct rmap_item *rmap_item; + + rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL); + if (rmap_item) + ksm_rmap_items++; + return rmap_item; +} + +static inline void free_rmap_item(struct rmap_item *rmap_item) +{ + ksm_rmap_items--; + rmap_item->mm = NULL; /* debug safety */ + kmem_cache_free(rmap_item_cache, rmap_item); +} + +static inline struct stable_node *alloc_stable_node(void) +{ + return kmem_cache_alloc(stable_node_cache, GFP_KERNEL); +} + +static inline void free_stable_node(struct stable_node *stable_node) +{ + kmem_cache_free(stable_node_cache, stable_node); +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; + hlist_for_each_entry(mm_slot, node, bucket, link) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[hash_ptr(mm, MM_SLOTS_HASH_SHIFT)]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->link, bucket); +} + +static inline int in_stable_tree(struct rmap_item *rmap_item) +{ + return rmap_item->address & STABLE_FLAG; +} + +static void hold_anon_vma(struct rmap_item *rmap_item, + struct anon_vma *anon_vma) +{ + rmap_item->anon_vma = anon_vma; + get_anon_vma(anon_vma); +} + +static void ksm_drop_anon_vma(struct rmap_item *rmap_item) +{ + struct anon_vma *anon_vma = rmap_item->anon_vma; + + drop_anon_vma(anon_vma); +} + +/* + * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's + * page tables after it has passed through ksm_exit() - which, if necessary, + * takes mmap_sem briefly to serialize against them. ksm_exit() does not set + * a special flag: they can just back out as soon as mm_users goes to zero. + * ksm_test_exit() is used throughout to make this test for exit: in some + * places for correctness, in some places just to avoid unnecessary work. + */ +static inline bool ksm_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +/* + * We use break_ksm to break COW on a ksm page: it's a stripped down + * + * if (get_user_pages(current, mm, addr, 1, 1, 1, &page, NULL) == 1) + * put_page(page); + * + * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, + * in case the application has unmapped and remapped mm,addr meanwhile. + * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP + * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. + */ +static int break_ksm(struct vm_area_struct *vma, unsigned long addr) +{ + struct page *page; + int ret = 0; + + do { + cond_resched(); + page = follow_page(vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) + break; + if (PageKsm(page)) + ret = handle_mm_fault(vma->vm_mm, vma, addr, + FAULT_FLAG_WRITE); + else + ret = VM_FAULT_WRITE; + put_page(page); + } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM))); + /* + * We must loop because handle_mm_fault() may back out if there's + * any difficulty e.g. if pte accessed bit gets updated concurrently. + * + * VM_FAULT_WRITE is what we have been hoping for: it indicates that + * COW has been broken, even if the vma does not permit VM_WRITE; + * but note that a concurrent fault might break PageKsm for us. + * + * VM_FAULT_SIGBUS could occur if we race with truncation of the + * backing file, which also invalidates anonymous pages: that's + * okay, that truncation will have unmapped the PageKsm for us. + * + * VM_FAULT_OOM: at the time of writing (late July 2009), setting + * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the + * current task has TIF_MEMDIE set, and will be OOM killed on return + * to user; and ksmd, having no mm, would never be chosen for that. + * + * But if the mm is in a limited mem_cgroup, then the fault may fail + * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and + * even ksmd can fail in this way - though it's usually breaking ksm + * just to undo a merge it made a moment before, so unlikely to oom. + * + * That's a pity: we might therefore have more kernel pages allocated + * than we're counting as nodes in the stable tree; but ksm_do_scan + * will retry to break_cow on each pass, so should recover the page + * in due course. The important thing is to not let VM_MERGEABLE + * be cleared while any such pages might remain in the area. + */ + return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; +} + +static void break_cow(struct rmap_item *rmap_item) +{ + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; + struct vm_area_struct *vma; + + /* + * It is not an accident that whenever we want to break COW + * to undo, we also need to drop a reference to the anon_vma. + */ + ksm_drop_anon_vma(rmap_item); + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + goto out; + break_ksm(vma, addr); +out: + up_read(&mm->mmap_sem); +} + +static struct page *get_mergeable_page(struct rmap_item *rmap_item) +{ + struct mm_struct *mm = rmap_item->mm; + unsigned long addr = rmap_item->address; + struct vm_area_struct *vma; + struct page *page; + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, addr); + if (!vma || vma->vm_start > addr) + goto out; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + goto out; + + page = follow_page(vma, addr, FOLL_GET); + if (IS_ERR_OR_NULL(page)) + goto out; + if (PageAnon(page)) { + flush_anon_page(vma, page, addr); + flush_dcache_page(page); + } else { + put_page(page); +out: page = NULL; + } + up_read(&mm->mmap_sem); + return page; +} + +static void remove_node_from_stable_tree(struct stable_node *stable_node) +{ + struct rmap_item *rmap_item; + struct hlist_node *hlist; + + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + if (rmap_item->hlist.next) + ksm_pages_sharing--; + else + ksm_pages_shared--; + ksm_drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; + cond_resched(); + } + + rb_erase(&stable_node->node, &root_stable_tree); + free_stable_node(stable_node); +} + +/* + * get_ksm_page: checks if the page indicated by the stable node + * is still its ksm page, despite having held no reference to it. + * In which case we can trust the content of the page, and it + * returns the gotten page; but if the page has now been zapped, + * remove the stale node from the stable tree and return NULL. + * + * You would expect the stable_node to hold a reference to the ksm page. + * But if it increments the page's count, swapping out has to wait for + * ksmd to come around again before it can free the page, which may take + * seconds or even minutes: much too unresponsive. So instead we use a + * "keyhole reference": access to the ksm page from the stable node peeps + * out through its keyhole to see if that page still holds the right key, + * pointing back to this stable node. This relies on freeing a PageAnon + * page to reset its page->mapping to NULL, and relies on no other use of + * a page to put something that might look like our key in page->mapping. + * + * include/linux/pagemap.h page_cache_get_speculative() is a good reference, + * but this is different - made simpler by ksm_thread_mutex being held, but + * interesting for assuming that no other use of the struct page could ever + * put our expected_mapping into page->mapping (or a field of the union which + * coincides with page->mapping). The RCU calls are not for KSM at all, but + * to keep the page_count protocol described with page_cache_get_speculative. + * + * Note: it is possible that get_ksm_page() will return NULL one moment, + * then page the next, if the page is in between page_freeze_refs() and + * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page + * is on its way to being freed; but it is an anomaly to bear in mind. + */ +static struct page *get_ksm_page(struct stable_node *stable_node) +{ + struct page *page; + void *expected_mapping; + + page = pfn_to_page(stable_node->kpfn); + expected_mapping = (void *)stable_node + + (PAGE_MAPPING_ANON | PAGE_MAPPING_KSM); + rcu_read_lock(); + if (page->mapping != expected_mapping) + goto stale; + if (!get_page_unless_zero(page)) + goto stale; + if (page->mapping != expected_mapping) { + put_page(page); + goto stale; + } + rcu_read_unlock(); + return page; +stale: + rcu_read_unlock(); + remove_node_from_stable_tree(stable_node); + return NULL; +} + +/* + * Removing rmap_item from stable or unstable tree. + * This function will clean the information from the stable/unstable tree. + */ +static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +{ + if (rmap_item->address & STABLE_FLAG) { + struct stable_node *stable_node; + struct page *page; + + stable_node = rmap_item->head; + page = get_ksm_page(stable_node); + if (!page) + goto out; + + lock_page(page); + hlist_del(&rmap_item->hlist); + unlock_page(page); + put_page(page); + + if (stable_node->hlist.first) + ksm_pages_sharing--; + else + ksm_pages_shared--; + + ksm_drop_anon_vma(rmap_item); + rmap_item->address &= PAGE_MASK; + + } else if (rmap_item->address & UNSTABLE_FLAG) { + unsigned char age; + /* + * Usually ksmd can and must skip the rb_erase, because + * root_unstable_tree was already reset to RB_ROOT. + * But be careful when an mm is exiting: do the rb_erase + * if this rmap_item was inserted by this scan, rather + * than left over from before. + */ + age = (unsigned char)(ksm_scan.seqnr - rmap_item->address); + BUG_ON(age > 1); + if (!age) + rb_erase(&rmap_item->node, &root_unstable_tree); + + ksm_pages_unshared--; + rmap_item->address &= PAGE_MASK; + } +out: + cond_resched(); /* we're called from many long loops */ +} + +static void remove_trailing_rmap_items(struct mm_slot *mm_slot, + struct rmap_item **rmap_list) +{ + while (*rmap_list) { + struct rmap_item *rmap_item = *rmap_list; + *rmap_list = rmap_item->rmap_list; + remove_rmap_item_from_tree(rmap_item); + free_rmap_item(rmap_item); + } +} + +/* + * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather + * than check every pte of a given vma, the locking doesn't quite work for + * that - an rmap_item is assigned to the stable tree after inserting ksm + * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing + * rmap_items from parent to child at fork time (so as not to waste time + * if exit comes before the next scan reaches it). + * + * Similarly, although we'd like to remove rmap_items (so updating counts + * and freeing memory) when unmerging an area, it's easier to leave that + * to the next pass of ksmd - consider, for example, how ksmd might be + * in cmp_and_merge_page on one of the rmap_items we would be removing. + */ +static int unmerge_ksm_pages(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + unsigned long addr; + int err = 0; + + for (addr = start; addr < end && !err; addr += PAGE_SIZE) { + if (ksm_test_exit(vma->vm_mm)) + break; + if (signal_pending(current)) + err = -ERESTARTSYS; + else + err = break_ksm(vma, addr); + } + return err; +} + +#ifdef CONFIG_SYSFS +/* + * Only called through the sysfs control interface: + */ +static int unmerge_and_remove_all_rmap_items(void) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int err = 0; + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, + struct mm_slot, mm_list); + spin_unlock(&ksm_mmlist_lock); + + for (mm_slot = ksm_scan.mm_slot; + mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (ksm_test_exit(mm)) + break; + if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) + continue; + err = unmerge_ksm_pages(vma, + vma->vm_start, vma->vm_end); + if (err) + goto error; + } + + remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_test_exit(mm)) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } + } + + ksm_scan.seqnr = 0; + return 0; + +error: + up_read(&mm->mmap_sem); + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = &ksm_mm_head; + spin_unlock(&ksm_mmlist_lock); + return err; +} +#endif /* CONFIG_SYSFS */ + +static u32 calc_checksum(struct page *page) +{ + u32 checksum; + void *addr = kmap_atomic(page, KM_USER0); + checksum = jhash2(addr, PAGE_SIZE / 4, 17); + kunmap_atomic(addr, KM_USER0); + return checksum; +} + +static int memcmp_pages(struct page *page1, struct page *page2) +{ + char *addr1, *addr2; + int ret; + + addr1 = kmap_atomic(page1, KM_USER0); + addr2 = kmap_atomic(page2, KM_USER1); + ret = memcmp(addr1, addr2, PAGE_SIZE); + kunmap_atomic(addr2, KM_USER1); + kunmap_atomic(addr1, KM_USER0); + return ret; +} + +static inline int pages_identical(struct page *page1, struct page *page2) +{ + return !memcmp_pages(page1, page2); +} + +static int write_protect_page(struct vm_area_struct *vma, struct page *page, + pte_t *orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr; + pte_t *ptep; + spinlock_t *ptl; + int swapped; + int err = -EFAULT; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + ptep = page_check_address(page, mm, addr, &ptl, 0); + if (!ptep) + goto out; + + if (pte_write(*ptep) || pte_dirty(*ptep)) { + pte_t entry; + + swapped = PageSwapCache(page); + flush_cache_page(vma, addr, page_to_pfn(page)); + /* + * Ok this is tricky, when get_user_pages_fast() run it doesnt + * take any lock, therefore the check that we are going to make + * with the pagecount against the mapcount is racey and + * O_DIRECT can happen right after the check. + * So we clear the pte and flush the tlb before the check + * this assure us that no O_DIRECT can happen after the check + * or in the middle of the check. + */ + entry = ptep_clear_flush(vma, addr, ptep); + /* + * Check that no O_DIRECT or similar I/O is in progress on the + * page + */ + if (page_mapcount(page) + 1 + swapped != page_count(page)) { + set_pte_at(mm, addr, ptep, entry); + goto out_unlock; + } + if (pte_dirty(entry)) + set_page_dirty(page); + entry = pte_mkclean(pte_wrprotect(entry)); + set_pte_at_notify(mm, addr, ptep, entry); + } + *orig_pte = *ptep; + err = 0; + +out_unlock: + pte_unmap_unlock(ptep, ptl); +out: + return err; +} + +/** + * replace_page - replace page in vma by new ksm page + * @vma: vma that holds the pte pointing to page + * @page: the page we are replacing by kpage + * @kpage: the ksm page we replace page by + * @orig_pte: the original value of the pte + * + * Returns 0 on success, -EFAULT on failure. + */ +static int replace_page(struct vm_area_struct *vma, struct page *page, + struct page *kpage, pte_t orig_pte) +{ + struct mm_struct *mm = vma->vm_mm; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep; + spinlock_t *ptl; + unsigned long addr; + int err = -EFAULT; + + addr = page_address_in_vma(page, vma); + if (addr == -EFAULT) + goto out; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); + if (!pte_same(*ptep, orig_pte)) { + pte_unmap_unlock(ptep, ptl); + goto out; + } + + get_page(kpage); + page_add_anon_rmap(kpage, vma, addr); + + flush_cache_page(vma, addr, pte_pfn(*ptep)); + ptep_clear_flush(vma, addr, ptep); + set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); + + page_remove_rmap(page); + put_page(page); + + pte_unmap_unlock(ptep, ptl); + err = 0; +out: + return err; +} + +/* + * try_to_merge_one_page - take two pages and merge them into one + * @vma: the vma that holds the pte pointing to page + * @page: the PageAnon page that we want to replace with kpage + * @kpage: the PageKsm page that we want to map instead of page, + * or NULL the first time when we want to use page as kpage. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_one_page(struct vm_area_struct *vma, + struct page *page, struct page *kpage) +{ + pte_t orig_pte = __pte(0); + int err = -EFAULT; + + if (page == kpage) /* ksm page forked */ + return 0; + + if (!(vma->vm_flags & VM_MERGEABLE)) + goto out; + if (!PageAnon(page)) + goto out; + + /* + * We need the page lock to read a stable PageSwapCache in + * write_protect_page(). We use trylock_page() instead of + * lock_page() because we don't want to wait here - we + * prefer to continue scanning and merging different pages, + * then come back to this page when it is unlocked. + */ + if (!trylock_page(page)) + goto out; + /* + * If this anonymous page is mapped only here, its pte may need + * to be write-protected. If it's mapped elsewhere, all of its + * ptes are necessarily already write-protected. But in either + * case, we need to lock and check page_count is not raised. + */ + if (write_protect_page(vma, page, &orig_pte) == 0) { + if (!kpage) { + /* + * While we hold page lock, upgrade page from + * PageAnon+anon_vma to PageKsm+NULL stable_node: + * stable_tree_insert() will update stable_node. + */ + set_page_stable_node(page, NULL); + mark_page_accessed(page); + err = 0; + } else if (pages_identical(page, kpage)) + err = replace_page(vma, page, kpage, orig_pte); + } + + if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { + munlock_vma_page(page); + if (!PageMlocked(kpage)) { + unlock_page(page); + lock_page(kpage); + mlock_vma_page(kpage); + page = kpage; /* for final unlock */ + } + } + + unlock_page(page); +out: + return err; +} + +/* + * try_to_merge_with_ksm_page - like try_to_merge_two_pages, + * but no new kernel page is allocated: kpage must already be a ksm page. + * + * This function returns 0 if the pages were merged, -EFAULT otherwise. + */ +static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, + struct page *page, struct page *kpage) +{ + struct mm_struct *mm = rmap_item->mm; + struct vm_area_struct *vma; + int err = -EFAULT; + + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + goto out; + vma = find_vma(mm, rmap_item->address); + if (!vma || vma->vm_start > rmap_item->address) + goto out; + + err = try_to_merge_one_page(vma, page, kpage); + if (err) + goto out; + + /* Must get reference to anon_vma while still holding mmap_sem */ + hold_anon_vma(rmap_item, vma->anon_vma); +out: + up_read(&mm->mmap_sem); + return err; +} + +/* + * try_to_merge_two_pages - take two identical pages and prepare them + * to be merged into one page. + * + * This function returns the kpage if we successfully merged two identical + * pages into one ksm page, NULL otherwise. + * + * Note that this function upgrades page to ksm page: if one of the pages + * is already a ksm page, try_to_merge_with_ksm_page should be used. + */ +static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, + struct page *page, + struct rmap_item *tree_rmap_item, + struct page *tree_page) +{ + int err; + + err = try_to_merge_with_ksm_page(rmap_item, page, NULL); + if (!err) { + err = try_to_merge_with_ksm_page(tree_rmap_item, + tree_page, page); + /* + * If that fails, we have a ksm page with only one pte + * pointing to it: so break it. + */ + if (err) + break_cow(rmap_item); + } + return err ? NULL : page; +} + +/* + * stable_tree_search - search for page inside the stable tree + * + * This function checks if there is a page inside the stable tree + * with identical content to the page that we are scanning right now. + * + * This function returns the stable tree node of identical content if found, + * NULL otherwise. + */ +static struct page *stable_tree_search(struct page *page) +{ + struct rb_node *node = root_stable_tree.rb_node; + struct stable_node *stable_node; + + stable_node = page_stable_node(page); + if (stable_node) { /* ksm page forked */ + get_page(page); + return page; + } + + while (node) { + struct page *tree_page; + int ret; + + cond_resched(); + stable_node = rb_entry(node, struct stable_node, node); + tree_page = get_ksm_page(stable_node); + if (!tree_page) + return NULL; + + ret = memcmp_pages(page, tree_page); + + if (ret < 0) { + put_page(tree_page); + node = node->rb_left; + } else if (ret > 0) { + put_page(tree_page); + node = node->rb_right; + } else + return tree_page; + } + + return NULL; +} + +/* + * stable_tree_insert - insert rmap_item pointing to new ksm page + * into the stable tree. + * + * This function returns the stable tree node just allocated on success, + * NULL otherwise. + */ +static struct stable_node *stable_tree_insert(struct page *kpage) +{ + struct rb_node **new = &root_stable_tree.rb_node; + struct rb_node *parent = NULL; + struct stable_node *stable_node; + + while (*new) { + struct page *tree_page; + int ret; + + cond_resched(); + stable_node = rb_entry(*new, struct stable_node, node); + tree_page = get_ksm_page(stable_node); + if (!tree_page) + return NULL; + + ret = memcmp_pages(kpage, tree_page); + put_page(tree_page); + + parent = *new; + if (ret < 0) + new = &parent->rb_left; + else if (ret > 0) + new = &parent->rb_right; + else { + /* + * It is not a bug that stable_tree_search() didn't + * find this node: because at that time our page was + * not yet write-protected, so may have changed since. + */ + return NULL; + } + } + + stable_node = alloc_stable_node(); + if (!stable_node) + return NULL; + + rb_link_node(&stable_node->node, parent, new); + rb_insert_color(&stable_node->node, &root_stable_tree); + + INIT_HLIST_HEAD(&stable_node->hlist); + + stable_node->kpfn = page_to_pfn(kpage); + set_page_stable_node(kpage, stable_node); + + return stable_node; +} + +/* + * unstable_tree_search_insert - search for identical page, + * else insert rmap_item into the unstable tree. + * + * This function searches for a page in the unstable tree identical to the + * page currently being scanned; and if no identical page is found in the + * tree, we insert rmap_item as a new object into the unstable tree. + * + * This function returns pointer to rmap_item found to be identical + * to the currently scanned page, NULL otherwise. + * + * This function does both searching and inserting, because they share + * the same walking algorithm in an rbtree. + */ +static +struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, + struct page *page, + struct page **tree_pagep) + +{ + struct rb_node **new = &root_unstable_tree.rb_node; + struct rb_node *parent = NULL; + + while (*new) { + struct rmap_item *tree_rmap_item; + struct page *tree_page; + int ret; + + cond_resched(); + tree_rmap_item = rb_entry(*new, struct rmap_item, node); + tree_page = get_mergeable_page(tree_rmap_item); + if (IS_ERR_OR_NULL(tree_page)) + return NULL; + + /* + * Don't substitute a ksm page for a forked page. + */ + if (page == tree_page) { + put_page(tree_page); + return NULL; + } + + ret = memcmp_pages(page, tree_page); + + parent = *new; + if (ret < 0) { + put_page(tree_page); + new = &parent->rb_left; + } else if (ret > 0) { + put_page(tree_page); + new = &parent->rb_right; + } else { + *tree_pagep = tree_page; + return tree_rmap_item; + } + } + + rmap_item->address |= UNSTABLE_FLAG; + rmap_item->address |= (ksm_scan.seqnr & SEQNR_MASK); + rb_link_node(&rmap_item->node, parent, new); + rb_insert_color(&rmap_item->node, &root_unstable_tree); + + ksm_pages_unshared++; + return NULL; +} + +/* + * stable_tree_append - add another rmap_item to the linked list of + * rmap_items hanging off a given node of the stable tree, all sharing + * the same ksm page. + */ +static void stable_tree_append(struct rmap_item *rmap_item, + struct stable_node *stable_node) +{ + rmap_item->head = stable_node; + rmap_item->address |= STABLE_FLAG; + hlist_add_head(&rmap_item->hlist, &stable_node->hlist); + + if (rmap_item->hlist.next) + ksm_pages_sharing++; + else + ksm_pages_shared++; +} + +/* + * cmp_and_merge_page - first see if page can be merged into the stable tree; + * if not, compare checksum to previous and if it's the same, see if page can + * be inserted into the unstable tree, or merged with a page already there and + * both transferred to the stable tree. + * + * @page: the page that we are searching identical page to. + * @rmap_item: the reverse mapping into the virtual address of this page + */ +static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) +{ + struct rmap_item *tree_rmap_item; + struct page *tree_page = NULL; + struct stable_node *stable_node; + struct page *kpage; + unsigned int checksum; + int err; + + remove_rmap_item_from_tree(rmap_item); + + /* We first start with searching the page inside the stable tree */ + kpage = stable_tree_search(page); + if (kpage) { + err = try_to_merge_with_ksm_page(rmap_item, page, kpage); + if (!err) { + /* + * The page was successfully merged: + * add its rmap_item to the stable tree. + */ + lock_page(kpage); + stable_tree_append(rmap_item, page_stable_node(kpage)); + unlock_page(kpage); + } + put_page(kpage); + return; + } + + /* + * If the hash value of the page has changed from the last time + * we calculated it, this page is changing frequently: therefore we + * don't want to insert it in the unstable tree, and we don't want + * to waste our time searching for something identical to it there. + */ + checksum = calc_checksum(page); + if (rmap_item->oldchecksum != checksum) { + rmap_item->oldchecksum = checksum; + return; + } + + tree_rmap_item = + unstable_tree_search_insert(rmap_item, page, &tree_page); + if (tree_rmap_item) { + kpage = try_to_merge_two_pages(rmap_item, page, + tree_rmap_item, tree_page); + put_page(tree_page); + /* + * As soon as we merge this page, we want to remove the + * rmap_item of the page we have merged with from the unstable + * tree, and insert it instead as new node in the stable tree. + */ + if (kpage) { + remove_rmap_item_from_tree(tree_rmap_item); + + lock_page(kpage); + stable_node = stable_tree_insert(kpage); + if (stable_node) { + stable_tree_append(tree_rmap_item, stable_node); + stable_tree_append(rmap_item, stable_node); + } + unlock_page(kpage); + + /* + * If we fail to insert the page into the stable tree, + * we will have 2 virtual addresses that are pointing + * to a ksm page left outside the stable tree, + * in which case we need to break_cow on both. + */ + if (!stable_node) { + break_cow(tree_rmap_item); + break_cow(rmap_item); + } + } + } +} + +static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, + struct rmap_item **rmap_list, + unsigned long addr) +{ + struct rmap_item *rmap_item; + + while (*rmap_list) { + rmap_item = *rmap_list; + if ((rmap_item->address & PAGE_MASK) == addr) + return rmap_item; + if (rmap_item->address > addr) + break; + *rmap_list = rmap_item->rmap_list; + remove_rmap_item_from_tree(rmap_item); + free_rmap_item(rmap_item); + } + + rmap_item = alloc_rmap_item(); + if (rmap_item) { + /* It has already been zeroed */ + rmap_item->mm = mm_slot->mm; + rmap_item->address = addr; + rmap_item->rmap_list = *rmap_list; + *rmap_list = rmap_item; + } + return rmap_item; +} + +static struct rmap_item *scan_get_next_rmap_item(struct page **page) +{ + struct mm_struct *mm; + struct mm_slot *slot; + struct vm_area_struct *vma; + struct rmap_item *rmap_item; + + if (list_empty(&ksm_mm_head.mm_list)) + return NULL; + + slot = ksm_scan.mm_slot; + if (slot == &ksm_mm_head) { + root_unstable_tree = RB_ROOT; + + spin_lock(&ksm_mmlist_lock); + slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); + ksm_scan.mm_slot = slot; + spin_unlock(&ksm_mmlist_lock); +next_mm: + ksm_scan.address = 0; + ksm_scan.rmap_list = &slot->rmap_list; + } + + mm = slot->mm; + down_read(&mm->mmap_sem); + if (ksm_test_exit(mm)) + vma = NULL; + else + vma = find_vma(mm, ksm_scan.address); + + for (; vma; vma = vma->vm_next) { + if (!(vma->vm_flags & VM_MERGEABLE)) + continue; + if (ksm_scan.address < vma->vm_start) + ksm_scan.address = vma->vm_start; + if (!vma->anon_vma) + ksm_scan.address = vma->vm_end; + + while (ksm_scan.address < vma->vm_end) { + if (ksm_test_exit(mm)) + break; + *page = follow_page(vma, ksm_scan.address, FOLL_GET); + if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { + flush_anon_page(vma, *page, ksm_scan.address); + flush_dcache_page(*page); + rmap_item = get_next_rmap_item(slot, + ksm_scan.rmap_list, ksm_scan.address); + if (rmap_item) { + ksm_scan.rmap_list = + &rmap_item->rmap_list; + ksm_scan.address += PAGE_SIZE; + } else + put_page(*page); + up_read(&mm->mmap_sem); + return rmap_item; + } + if (!IS_ERR_OR_NULL(*page)) + put_page(*page); + ksm_scan.address += PAGE_SIZE; + cond_resched(); + } + } + + if (ksm_test_exit(mm)) { + ksm_scan.address = 0; + ksm_scan.rmap_list = &slot->rmap_list; + } + /* + * Nuke all the rmap_items that are above this current rmap: + * because there were no VM_MERGEABLE vmas with such addresses. + */ + remove_trailing_rmap_items(slot, ksm_scan.rmap_list); + + spin_lock(&ksm_mmlist_lock); + ksm_scan.mm_slot = list_entry(slot->mm_list.next, + struct mm_slot, mm_list); + if (ksm_scan.address == 0) { + /* + * We've completed a full scan of all vmas, holding mmap_sem + * throughout, and found no VM_MERGEABLE: so do the same as + * __ksm_exit does to remove this mm from all our lists now. + * This applies either when cleaning up after __ksm_exit + * (but beware: we can reach here even before __ksm_exit), + * or when all VM_MERGEABLE areas have been unmapped (and + * mmap_sem then protects against race with MADV_MERGEABLE). + */ + hlist_del(&slot->link); + list_del(&slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + free_mm_slot(slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + up_read(&mm->mmap_sem); + mmdrop(mm); + } else { + spin_unlock(&ksm_mmlist_lock); + up_read(&mm->mmap_sem); + } + + /* Repeat until we've completed scanning the whole list */ + slot = ksm_scan.mm_slot; + if (slot != &ksm_mm_head) + goto next_mm; + + ksm_scan.seqnr++; + return NULL; +} + +/** + * ksm_do_scan - the ksm scanner main worker function. + * @scan_npages - number of pages we want to scan before we return. + */ +static void ksm_do_scan(unsigned int scan_npages) +{ + struct rmap_item *rmap_item; + struct page *uninitialized_var(page); + + while (scan_npages--) { + cond_resched(); + rmap_item = scan_get_next_rmap_item(&page); + if (!rmap_item) + return; + if (!PageKsm(page) || !in_stable_tree(rmap_item)) + cmp_and_merge_page(page, rmap_item); + put_page(page); + } +} + +static int ksmd_should_run(void) +{ + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); +} + +static int ksm_scan_thread(void *nothing) +{ + set_user_nice(current, 5); + + while (!kthread_should_stop()) { + mutex_lock(&ksm_thread_mutex); + if (ksmd_should_run()) + ksm_do_scan(ksm_thread_pages_to_scan); + mutex_unlock(&ksm_thread_mutex); + + if (ksmd_should_run()) { + schedule_timeout_interruptible( + msecs_to_jiffies(ksm_thread_sleep_millisecs)); + } else { + wait_event_interruptible(ksm_thread_wait, + ksmd_should_run() || kthread_should_stop()); + } + } + return 0; +} + +int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags) +{ + struct mm_struct *mm = vma->vm_mm; + int err; + + switch (advice) { + case MADV_MERGEABLE: + /* + * Be somewhat over-protective for now! + */ + if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_NONLINEAR | VM_MIXEDMAP | VM_SAO)) + return 0; /* just ignore the advice */ + + if (!test_bit(MMF_VM_MERGEABLE, &mm->flags)) { + err = __ksm_enter(mm); + if (err) + return err; + } + + *vm_flags |= VM_MERGEABLE; + break; + + case MADV_UNMERGEABLE: + if (!(*vm_flags & VM_MERGEABLE)) + return 0; /* just ignore the advice */ + + if (vma->anon_vma) { + err = unmerge_ksm_pages(vma, start, end); + if (err) + return err; + } + + *vm_flags &= ~VM_MERGEABLE; + break; + } + + return 0; +} + +int __ksm_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int needs_wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* Check ksm_run too? Would need tighter locking */ + needs_wakeup = list_empty(&ksm_mm_head.mm_list); + + spin_lock(&ksm_mmlist_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little; when fork is followed by immediate exec, we don't + * want ksmd to waste time setting up and tearing down an rmap_list. + */ + list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + spin_unlock(&ksm_mmlist_lock); + + set_bit(MMF_VM_MERGEABLE, &mm->flags); + atomic_inc(&mm->mm_count); + + if (needs_wakeup) + wake_up_interruptible(&ksm_thread_wait); + + return 0; +} + +void __ksm_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int easy_to_free = 0; + + /* + * This process is exiting: if it's straightforward (as is the + * case when ksmd was never running), free mm_slot immediately. + * But if it's at the cursor or has rmap_items linked to it, use + * mmap_sem to synchronize with any break_cows before pagetables + * are freed, and leave the mm_slot on the list for ksmd to free. + * Beware: ksm may already have noticed it exiting and freed the slot. + */ + + spin_lock(&ksm_mmlist_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && ksm_scan.mm_slot != mm_slot) { + if (!mm_slot->rmap_list) { + hlist_del(&mm_slot->link); + list_del(&mm_slot->mm_list); + easy_to_free = 1; + } else { + list_move(&mm_slot->mm_list, + &ksm_scan.mm_slot->mm_list); + } + } + spin_unlock(&ksm_mmlist_lock); + + if (easy_to_free) { + free_mm_slot(mm_slot); + clear_bit(MMF_VM_MERGEABLE, &mm->flags); + mmdrop(mm); + } else if (mm_slot) { + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } +} + +struct page *ksm_does_need_to_copy(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct page *new_page; + + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (new_page) { + copy_user_highpage(new_page, page, address, vma); + + SetPageDirty(new_page); + __SetPageUptodate(new_page); + SetPageSwapBacked(new_page); + __set_page_locked(new_page); + + if (page_evictable(new_page, vma)) + lru_cache_add_lru(new_page, LRU_ACTIVE_ANON); + else + add_page_to_unevictable_list(new_page); + } + + return new_page; +} + +int page_referenced_ksm(struct page *page, struct mem_cgroup *memcg, + unsigned long *vm_flags) +{ + struct stable_node *stable_node; + struct rmap_item *rmap_item; + struct hlist_node *hlist; + unsigned int mapcount = page_mapcount(page); + int referenced = 0; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return 0; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + anon_vma_lock(anon_vma); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + if (memcg && !mm_match_cgroup(vma->vm_mm, memcg)) + continue; + + referenced += page_referenced_one(page, vma, + rmap_item->address, &mapcount, vm_flags); + if (!search_new_forks || !mapcount) + break; + } + anon_vma_unlock(anon_vma); + if (!mapcount) + goto out; + } + if (!search_new_forks++) + goto again; +out: + return referenced; +} + +int try_to_unmap_ksm(struct page *page, enum ttu_flags flags) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return SWAP_FAIL; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + anon_vma_lock(anon_vma); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = try_to_unmap_one(page, vma, + rmap_item->address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) { + anon_vma_unlock(anon_vma); + goto out; + } + } + anon_vma_unlock(anon_vma); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +#ifdef CONFIG_MIGRATION +int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct stable_node *stable_node; + struct hlist_node *hlist; + struct rmap_item *rmap_item; + int ret = SWAP_AGAIN; + int search_new_forks = 0; + + VM_BUG_ON(!PageKsm(page)); + VM_BUG_ON(!PageLocked(page)); + + stable_node = page_stable_node(page); + if (!stable_node) + return ret; +again: + hlist_for_each_entry(rmap_item, hlist, &stable_node->hlist, hlist) { + struct anon_vma *anon_vma = rmap_item->anon_vma; + struct anon_vma_chain *vmac; + struct vm_area_struct *vma; + + anon_vma_lock(anon_vma); + list_for_each_entry(vmac, &anon_vma->head, same_anon_vma) { + vma = vmac->vma; + if (rmap_item->address < vma->vm_start || + rmap_item->address >= vma->vm_end) + continue; + /* + * Initially we examine only the vma which covers this + * rmap_item; but later, if there is still work to do, + * we examine covering vmas in other mms: in case they + * were forked from the original since ksmd passed. + */ + if ((rmap_item->mm == vma->vm_mm) == search_new_forks) + continue; + + ret = rmap_one(page, vma, rmap_item->address, arg); + if (ret != SWAP_AGAIN) { + anon_vma_unlock(anon_vma); + goto out; + } + } + anon_vma_unlock(anon_vma); + } + if (!search_new_forks++) + goto again; +out: + return ret; +} + +void ksm_migrate_page(struct page *newpage, struct page *oldpage) +{ + struct stable_node *stable_node; + + VM_BUG_ON(!PageLocked(oldpage)); + VM_BUG_ON(!PageLocked(newpage)); + VM_BUG_ON(newpage->mapping != oldpage->mapping); + + stable_node = page_stable_node(newpage); + if (stable_node) { + VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); + stable_node->kpfn = page_to_pfn(newpage); + } +} +#endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_MEMORY_HOTREMOVE +static struct stable_node *ksm_check_stable_tree(unsigned long start_pfn, + unsigned long end_pfn) +{ + struct rb_node *node; + + for (node = rb_first(&root_stable_tree); node; node = rb_next(node)) { + struct stable_node *stable_node; + + stable_node = rb_entry(node, struct stable_node, node); + if (stable_node->kpfn >= start_pfn && + stable_node->kpfn < end_pfn) + return stable_node; + } + return NULL; +} + +static int ksm_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + struct stable_node *stable_node; + + switch (action) { + case MEM_GOING_OFFLINE: + /* + * Keep it very simple for now: just lock out ksmd and + * MADV_UNMERGEABLE while any memory is going offline. + * mutex_lock_nested() is necessary because lockdep was alarmed + * that here we take ksm_thread_mutex inside notifier chain + * mutex, and later take notifier chain mutex inside + * ksm_thread_mutex to unlock it. But that's safe because both + * are inside mem_hotplug_mutex. + */ + mutex_lock_nested(&ksm_thread_mutex, SINGLE_DEPTH_NESTING); + break; + + case MEM_OFFLINE: + /* + * Most of the work is done by page migration; but there might + * be a few stable_nodes left over, still pointing to struct + * pages which have been offlined: prune those from the tree. + */ + while ((stable_node = ksm_check_stable_tree(mn->start_pfn, + mn->start_pfn + mn->nr_pages)) != NULL) + remove_node_from_stable_tree(stable_node); + /* fallthrough */ + + case MEM_CANCEL_OFFLINE: + mutex_unlock(&ksm_thread_mutex); + break; + } + return NOTIFY_OK; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + +#ifdef CONFIG_SYSFS +/* + * This all compiles without CONFIG_SYSFS, but is a waste of space. + */ + +#define KSM_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) +#define KSM_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static ssize_t sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs); +} + +static ssize_t sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + ksm_thread_sleep_millisecs = msecs; + + return count; +} +KSM_ATTR(sleep_millisecs); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%u\n", ksm_thread_pages_to_scan); +} + +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long nr_pages; + + err = strict_strtoul(buf, 10, &nr_pages); + if (err || nr_pages > UINT_MAX) + return -EINVAL; + + ksm_thread_pages_to_scan = nr_pages; + + return count; +} +KSM_ATTR(pages_to_scan); + +static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", ksm_run); +} + +static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long flags; + + err = strict_strtoul(buf, 10, &flags); + if (err || flags > UINT_MAX) + return -EINVAL; + if (flags > KSM_RUN_UNMERGE) + return -EINVAL; + + /* + * KSM_RUN_MERGE sets ksmd running, and 0 stops it running. + * KSM_RUN_UNMERGE stops it running and unmerges all rmap_items, + * breaking COW to free the pages_shared (but leaves mm_slots + * on the list for when ksmd may be set running again). + */ + + mutex_lock(&ksm_thread_mutex); + if (ksm_run != flags) { + ksm_run = flags; + if (flags & KSM_RUN_UNMERGE) { + current->flags |= PF_OOM_ORIGIN; + err = unmerge_and_remove_all_rmap_items(); + current->flags &= ~PF_OOM_ORIGIN; + if (err) { + ksm_run = KSM_RUN_STOP; + count = err; + } + } + } + mutex_unlock(&ksm_thread_mutex); + + if (flags & KSM_RUN_MERGE) + wake_up_interruptible(&ksm_thread_wait); + + return count; +} +KSM_ATTR(run); + +static ssize_t pages_shared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_shared); +} +KSM_ATTR_RO(pages_shared); + +static ssize_t pages_sharing_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_sharing); +} +KSM_ATTR_RO(pages_sharing); + +static ssize_t pages_unshared_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_pages_unshared); +} +KSM_ATTR_RO(pages_unshared); + +static ssize_t pages_volatile_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + long ksm_pages_volatile; + + ksm_pages_volatile = ksm_rmap_items - ksm_pages_shared + - ksm_pages_sharing - ksm_pages_unshared; + /* + * It was not worth any locking to calculate that statistic, + * but it might therefore sometimes be negative: conceal that. + */ + if (ksm_pages_volatile < 0) + ksm_pages_volatile = 0; + return sprintf(buf, "%ld\n", ksm_pages_volatile); +} +KSM_ATTR_RO(pages_volatile); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", ksm_scan.seqnr); +} +KSM_ATTR_RO(full_scans); + +static struct attribute *ksm_attrs[] = { + &sleep_millisecs_attr.attr, + &pages_to_scan_attr.attr, + &run_attr.attr, + &pages_shared_attr.attr, + &pages_sharing_attr.attr, + &pages_unshared_attr.attr, + &pages_volatile_attr.attr, + &full_scans_attr.attr, + NULL, +}; + +static struct attribute_group ksm_attr_group = { + .attrs = ksm_attrs, + .name = "ksm", +}; +#endif /* CONFIG_SYSFS */ + +static int __init ksm_init(void) +{ + struct task_struct *ksm_thread; + int err; + + err = ksm_slab_init(); + if (err) + goto out; + + ksm_thread = kthread_run(ksm_scan_thread, NULL, "ksmd"); + if (IS_ERR(ksm_thread)) { + printk(KERN_ERR "ksm: creating kthread failed\n"); + err = PTR_ERR(ksm_thread); + goto out_free; + } + +#ifdef CONFIG_SYSFS + err = sysfs_create_group(mm_kobj, &ksm_attr_group); + if (err) { + printk(KERN_ERR "ksm: register sysfs failed\n"); + kthread_stop(ksm_thread); + goto out_free; + } +#else + ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ + +#endif /* CONFIG_SYSFS */ + +#ifdef CONFIG_MEMORY_HOTREMOVE + /* + * Choose a high priority since the callback takes ksm_thread_mutex: + * later callbacks could only be taking locks which nest within that. + */ + hotplug_memory_notifier(ksm_memory_callback, 100); +#endif + return 0; + +out_free: + ksm_slab_free(); +out: + return err; +} +module_init(ksm_init) diff --git a/mm/maccess.c b/mm/maccess.c index 9073695..e2b6f56 100644 --- a/mm/maccess.c +++ b/mm/maccess.c @@ -1,9 +1,9 @@ /* * Access kernel memory without faulting. */ -#include <linux/uaccess.h> #include <linux/module.h> #include <linux/mm.h> +#include <linux/uaccess.h> /** * probe_kernel_read(): safely attempt to read from a location @@ -14,7 +14,11 @@ * Safely read from address @src to the buffer at @dst. If a kernel fault * happens, handle that and return -EFAULT. */ -long probe_kernel_read(void *dst, void *src, size_t size) + +long __weak probe_kernel_read(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_read"))); + +long __probe_kernel_read(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); @@ -39,7 +43,10 @@ EXPORT_SYMBOL_GPL(probe_kernel_read); * Safely write to address @dst from the buffer at @src. If a kernel fault * happens, handle that and return -EFAULT. */ -long notrace __weak probe_kernel_write(void *dst, void *src, size_t size) +long __weak probe_kernel_write(void *dst, void *src, size_t size) + __attribute__((alias("__probe_kernel_write"))); + +long __probe_kernel_write(void *dst, void *src, size_t size) { long ret; mm_segment_t old_fs = get_fs(); diff --git a/mm/madvise.c b/mm/madvise.c index 76eb419..319528b 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -9,8 +9,10 @@ #include <linux/pagemap.h> #include <linux/syscalls.h> #include <linux/mempolicy.h> +#include <linux/page-isolation.h> #include <linux/hugetlb.h> #include <linux/sched.h> +#include <linux/ksm.h> /* * Any behaviour which results in changes to the vma->vm_flags needs to @@ -41,7 +43,7 @@ static long madvise_behavior(struct vm_area_struct * vma, struct mm_struct * mm = vma->vm_mm; int error = 0; pgoff_t pgoff; - int new_flags = vma->vm_flags; + unsigned long new_flags = vma->vm_flags; switch (behavior) { case MADV_NORMAL: @@ -57,8 +59,18 @@ static long madvise_behavior(struct vm_area_struct * vma, new_flags |= VM_DONTCOPY; break; case MADV_DOFORK: + if (vma->vm_flags & VM_IO) { + error = -EINVAL; + goto out; + } new_flags &= ~VM_DONTCOPY; break; + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: + error = ksm_madvise(vma, start, end, behavior, &new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -207,41 +219,52 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(int bhv, unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages_fast(start, 1, 0, &p); + if (ret != 1) + return ret; + if (bhv == MADV_SOFT_OFFLINE) { + printk(KERN_INFO "Soft offlining page %lx at %lx\n", + page_to_pfn(p), start); + ret = soft_offline_page(p, MF_COUNT_INCREASED); + if (ret) + break; + continue; + } + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + __memory_failure(page_to_pfn(p), 0, MF_COUNT_INCREASED); + } + return ret; +} +#endif + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) { - long error; - switch (behavior) { - case MADV_DOFORK: - if (vma->vm_flags & VM_IO) { - error = -EINVAL; - break; - } - case MADV_DONTFORK: - case MADV_NORMAL: - case MADV_SEQUENTIAL: - case MADV_RANDOM: - error = madvise_behavior(vma, prev, start, end, behavior); - break; case MADV_REMOVE: - error = madvise_remove(vma, prev, start, end); - break; - + return madvise_remove(vma, prev, start, end); case MADV_WILLNEED: - error = madvise_willneed(vma, prev, start, end); - break; - + return madvise_willneed(vma, prev, start, end); case MADV_DONTNEED: - error = madvise_dontneed(vma, prev, start, end); - break; - + return madvise_dontneed(vma, prev, start, end); default: - BUG(); - break; + return madvise_behavior(vma, prev, start, end, behavior); } - return error; } static int @@ -256,12 +279,17 @@ madvise_behavior_valid(int behavior) case MADV_REMOVE: case MADV_WILLNEED: case MADV_DONTNEED: +#ifdef CONFIG_KSM + case MADV_MERGEABLE: + case MADV_UNMERGEABLE: +#endif return 1; default: return 0; } } + /* * The madvise(2) system call. * @@ -286,6 +314,12 @@ madvise_behavior_valid(int behavior) * so the kernel can free resources associated with it. * MADV_REMOVE - the application wants to free up the given range of * pages and associated backing store. + * MADV_DONTFORK - omit this area from child's address space when forking: + * typically, to avoid COWing pages pinned by get_user_pages(). + * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. + * MADV_MERGEABLE - the application recommends that KSM try to merge pages in + * this area with pages of identical content from other such areas. + * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. * * return values: * zero - success @@ -307,6 +341,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) + return madvise_hwpoison(behavior, start, start+len_in); +#endif if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/memblock.c b/mm/memblock.c new file mode 100644 index 0000000..400dc62 --- /dev/null +++ b/mm/memblock.c @@ -0,0 +1,842 @@ +/* + * Procedures for maintaining information about logical memory blocks. + * + * Peter Bergner, IBM Corp. June 2001. + * Copyright (C) 2001 Peter Bergner. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/init.h> +#include <linux/bitops.h> +#include <linux/poison.h> +#include <linux/pfn.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/memblock.h> + +struct memblock memblock __initdata_memblock; + +int memblock_debug __initdata_memblock; +int memblock_can_resize __initdata_memblock; +static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; +static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock; + +/* inline so we don't get a warning when pr_debug is compiled out */ +static inline const char *memblock_type_name(struct memblock_type *type) +{ + if (type == &memblock.memory) + return "memory"; + else if (type == &memblock.reserved) + return "reserved"; + else + return "unknown"; +} + +/* + * Address comparison utilities + */ + +static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size) +{ + return addr & ~(size - 1); +} + +static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size) +{ + return (addr + (size - 1)) & ~(size - 1); +} + +static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + return ((base1 < (base2 + size2)) && (base2 < (base1 + size1))); +} + +static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1, + phys_addr_t base2, phys_addr_t size2) +{ + if (base2 == base1 + size1) + return 1; + else if (base1 == base2 + size2) + return -1; + + return 0; +} + +static long __init_memblock memblock_regions_adjacent(struct memblock_type *type, + unsigned long r1, unsigned long r2) +{ + phys_addr_t base1 = type->regions[r1].base; + phys_addr_t size1 = type->regions[r1].size; + phys_addr_t base2 = type->regions[r2].base; + phys_addr_t size2 = type->regions[r2].size; + + return memblock_addrs_adjacent(base1, size1, base2, size2); +} + +long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +{ + unsigned long i; + + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; + if (memblock_addrs_overlap(base, size, rgnbase, rgnsize)) + break; + } + + return (i < type->cnt) ? i : -1; +} + +/* + * Find, allocate, deallocate or reserve unreserved regions. All allocations + * are top-down. + */ + +static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end, + phys_addr_t size, phys_addr_t align) +{ + phys_addr_t base, res_base; + long j; + + /* In case, huge size is requested */ + if (end < size) + return MEMBLOCK_ERROR; + + base = memblock_align_down((end - size), align); + + /* Prevent allocations returning 0 as it's also used to + * indicate an allocation failure + */ + if (start == 0) + start = PAGE_SIZE; + + while (start <= base) { + j = memblock_overlaps_region(&memblock.reserved, base, size); + if (j < 0) + return base; + res_base = memblock.reserved.regions[j].base; + if (res_base < size) + break; + base = memblock_align_down(res_base - size, align); + } + + return MEMBLOCK_ERROR; +} + +static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, + phys_addr_t align, phys_addr_t start, phys_addr_t end) +{ + long i; + + BUG_ON(0 == size); + + size = memblock_align_up(size, align); + + /* Pump up max_addr */ + if (end == MEMBLOCK_ALLOC_ACCESSIBLE) + end = memblock.current_limit; + + /* We do a top-down search, this tends to limit memory + * fragmentation by keeping early boot allocs near the + * top of memory + */ + for (i = memblock.memory.cnt - 1; i >= 0; i--) { + phys_addr_t memblockbase = memblock.memory.regions[i].base; + phys_addr_t memblocksize = memblock.memory.regions[i].size; + phys_addr_t bottom, top, found; + + if (memblocksize < size) + continue; + if ((memblockbase + memblocksize) <= start) + break; + bottom = max(memblockbase, start); + top = min(memblockbase + memblocksize, end); + if (bottom >= top) + continue; + found = memblock_find_region(bottom, top, size, align); + if (found != MEMBLOCK_ERROR) + return found; + } + return MEMBLOCK_ERROR; +} + +/* + * Find a free area with specified alignment in a specific range. + */ +u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align) +{ + return memblock_find_base(size, align, start, end); +} + +/* + * Free memblock.reserved.regions + */ +int __init_memblock memblock_free_reserved_regions(void) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + return memblock_free(__pa(memblock.reserved.regions), + sizeof(struct memblock_region) * memblock.reserved.max); +} + +/* + * Reserve memblock.reserved.regions + */ +int __init_memblock memblock_reserve_reserved_regions(void) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + return memblock_reserve(__pa(memblock.reserved.regions), + sizeof(struct memblock_region) * memblock.reserved.max); +} + +static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) +{ + unsigned long i; + + for (i = r; i < type->cnt - 1; i++) { + type->regions[i].base = type->regions[i + 1].base; + type->regions[i].size = type->regions[i + 1].size; + } + type->cnt--; +} + +/* Assumption: base addr of region 1 < base addr of region 2 */ +static void __init_memblock memblock_coalesce_regions(struct memblock_type *type, + unsigned long r1, unsigned long r2) +{ + type->regions[r1].size += type->regions[r2].size; + memblock_remove_region(type, r2); +} + +/* Defined below but needed now */ +static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size); + +static int __init_memblock memblock_double_array(struct memblock_type *type) +{ + struct memblock_region *new_array, *old_array; + phys_addr_t old_size, new_size, addr; + int use_slab = slab_is_available(); + + /* We don't allow resizing until we know about the reserved regions + * of memory that aren't suitable for allocation + */ + if (!memblock_can_resize) + return -1; + + /* Calculate new doubled size */ + old_size = type->max * sizeof(struct memblock_region); + new_size = old_size << 1; + + /* Try to find some space for it. + * + * WARNING: We assume that either slab_is_available() and we use it or + * we use MEMBLOCK for allocations. That means that this is unsafe to use + * when bootmem is currently active (unless bootmem itself is implemented + * on top of MEMBLOCK which isn't the case yet) + * + * This should however not be an issue for now, as we currently only + * call into MEMBLOCK while it's still active, or much later when slab is + * active for memory hotplug operations + */ + if (use_slab) { + new_array = kmalloc(new_size, GFP_KERNEL); + addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array); + } else + addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr == MEMBLOCK_ERROR) { + pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n", + memblock_type_name(type), type->max, type->max * 2); + return -1; + } + new_array = __va(addr); + + memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", + memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); + + /* Found space, we now need to move the array over before + * we add the reserved region since it may be our reserved + * array itself that is full. + */ + memcpy(new_array, type->regions, old_size); + memset(new_array + type->max, 0, old_size); + old_array = type->regions; + type->regions = new_array; + type->max <<= 1; + + /* If we use SLAB that's it, we are done */ + if (use_slab) + return 0; + + /* Add the new reserved region now. Should not fail ! */ + BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0); + + /* If the array wasn't our static init one, then free it. We only do + * that before SLAB is available as later on, we don't know whether + * to use kfree or free_bootmem_pages(). Shouldn't be a big deal + * anyways + */ + if (old_array != memblock_memory_init_regions && + old_array != memblock_reserved_init_regions) + memblock_free(__pa(old_array), old_size); + + return 0; +} + +extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1, + phys_addr_t addr2, phys_addr_t size2) +{ + return 1; +} + +static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +{ + unsigned long coalesced = 0; + long adjacent, i; + + if ((type->cnt == 1) && (type->regions[0].size == 0)) { + type->regions[0].base = base; + type->regions[0].size = size; + return 0; + } + + /* First try and coalesce this MEMBLOCK with another. */ + for (i = 0; i < type->cnt; i++) { + phys_addr_t rgnbase = type->regions[i].base; + phys_addr_t rgnsize = type->regions[i].size; + + if ((rgnbase == base) && (rgnsize == size)) + /* Already have this region, so we're done */ + return 0; + + adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize); + /* Check if arch allows coalescing */ + if (adjacent != 0 && type == &memblock.memory && + !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize)) + break; + if (adjacent > 0) { + type->regions[i].base -= size; + type->regions[i].size += size; + coalesced++; + break; + } else if (adjacent < 0) { + type->regions[i].size += size; + coalesced++; + break; + } + } + + /* If we plugged a hole, we may want to also coalesce with the + * next region + */ + if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) && + ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base, + type->regions[i].size, + type->regions[i+1].base, + type->regions[i+1].size)))) { + memblock_coalesce_regions(type, i, i+1); + coalesced++; + } + + if (coalesced) + return coalesced; + + /* If we are out of space, we fail. It's too late to resize the array + * but then this shouldn't have happened in the first place. + */ + if (WARN_ON(type->cnt >= type->max)) + return -1; + + /* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */ + for (i = type->cnt - 1; i >= 0; i--) { + if (base < type->regions[i].base) { + type->regions[i+1].base = type->regions[i].base; + type->regions[i+1].size = type->regions[i].size; + } else { + type->regions[i+1].base = base; + type->regions[i+1].size = size; + break; + } + } + + if (base < type->regions[0].base) { + type->regions[0].base = base; + type->regions[0].size = size; + } + type->cnt++; + + /* The array is full ? Try to resize it. If that fails, we undo + * our allocation and return an error + */ + if (type->cnt == type->max && memblock_double_array(type)) { + type->cnt--; + return -1; + } + + return 0; +} + +long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size) +{ + return memblock_add_region(&memblock.memory, base, size); + +} + +static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size) +{ + phys_addr_t rgnbegin, rgnend; + phys_addr_t end = base + size; + int i; + + rgnbegin = rgnend = 0; /* supress gcc warnings */ + + /* Find the region where (base, size) belongs to */ + for (i=0; i < type->cnt; i++) { + rgnbegin = type->regions[i].base; + rgnend = rgnbegin + type->regions[i].size; + + if ((rgnbegin <= base) && (end <= rgnend)) + break; + } + + /* Didn't find the region */ + if (i == type->cnt) + return -1; + + /* Check to see if we are removing entire region */ + if ((rgnbegin == base) && (rgnend == end)) { + memblock_remove_region(type, i); + return 0; + } + + /* Check to see if region is matching at the front */ + if (rgnbegin == base) { + type->regions[i].base = end; + type->regions[i].size -= size; + return 0; + } + + /* Check to see if the region is matching at the end */ + if (rgnend == end) { + type->regions[i].size -= size; + return 0; + } + + /* + * We need to split the entry - adjust the current one to the + * beginging of the hole and add the region after hole. + */ + type->regions[i].size = base - type->regions[i].base; + return memblock_add_region(type, end, rgnend - end); +} + +long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) +{ + return __memblock_remove(&memblock.memory, base, size); +} + +long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size) +{ + return __memblock_remove(&memblock.reserved, base, size); +} + +long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) +{ + struct memblock_type *_rgn = &memblock.reserved; + + BUG_ON(0 == size); + + return memblock_add_region(_rgn, base, size); +} + +phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +{ + phys_addr_t found; + + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ + size = memblock_align_up(size, align); + + found = memblock_find_base(size, align, 0, max_addr); + if (found != MEMBLOCK_ERROR && + memblock_add_region(&memblock.reserved, found, size) >= 0) + return found; + + return 0; +} + +phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr) +{ + phys_addr_t alloc; + + alloc = __memblock_alloc_base(size, align, max_addr); + + if (alloc == 0) + panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n", + (unsigned long long) size, (unsigned long long) max_addr); + + return alloc; +} + +phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align) +{ + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); +} + + +/* + * Additional node-local allocators. Search for node memory is bottom up + * and walks memblock regions within that node bottom-up as well, but allocation + * within an memblock region is top-down. XXX I plan to fix that at some stage + * + * WARNING: Only available after early_node_map[] has been populated, + * on some architectures, that is after all the calls to add_active_range() + * have been done to populate it. + */ + +phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid) +{ +#ifdef CONFIG_ARCH_POPULATES_NODE_MAP + /* + * This code originates from sparc which really wants use to walk by addresses + * and returns the nid. This is not very convenient for early_pfn_map[] users + * as the map isn't sorted yet, and it really wants to be walked by nid. + * + * For now, I implement the inefficient method below which walks the early + * map multiple times. Eventually we may want to use an ARCH config option + * to implement a completely different method for both case. + */ + unsigned long start_pfn, end_pfn; + int i; + + for (i = 0; i < MAX_NUMNODES; i++) { + get_pfn_range_for_nid(i, &start_pfn, &end_pfn); + if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn)) + continue; + *nid = i; + return min(end, PFN_PHYS(end_pfn)); + } +#endif + *nid = 0; + + return end; +} + +static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp, + phys_addr_t size, + phys_addr_t align, int nid) +{ + phys_addr_t start, end; + + start = mp->base; + end = start + mp->size; + + start = memblock_align_up(start, align); + while (start < end) { + phys_addr_t this_end; + int this_nid; + + this_end = memblock_nid_range(start, end, &this_nid); + if (this_nid == nid) { + phys_addr_t ret = memblock_find_region(start, this_end, size, align); + if (ret != MEMBLOCK_ERROR && + memblock_add_region(&memblock.reserved, ret, size) >= 0) + return ret; + } + start = this_end; + } + + return MEMBLOCK_ERROR; +} + +phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) +{ + struct memblock_type *mem = &memblock.memory; + int i; + + BUG_ON(0 == size); + + /* We align the size to limit fragmentation. Without this, a lot of + * small allocs quickly eat up the whole reserve array on sparc + */ + size = memblock_align_up(size, align); + + /* We do a bottom-up search for a region with the right + * nid since that's easier considering how memblock_nid_range() + * works + */ + for (i = 0; i < mem->cnt; i++) { + phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i], + size, align, nid); + if (ret != MEMBLOCK_ERROR) + return ret; + } + + return 0; +} + +phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid) +{ + phys_addr_t res = memblock_alloc_nid(size, align, nid); + + if (res) + return res; + return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE); +} + + +/* + * Remaining API functions + */ + +/* You must call memblock_analyze() before this. */ +phys_addr_t __init memblock_phys_mem_size(void) +{ + return memblock.memory_size; +} + +phys_addr_t __init_memblock memblock_end_of_DRAM(void) +{ + int idx = memblock.memory.cnt - 1; + + return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size); +} + +/* You must call memblock_analyze() after this. */ +void __init memblock_enforce_memory_limit(phys_addr_t memory_limit) +{ + unsigned long i; + phys_addr_t limit; + struct memblock_region *p; + + if (!memory_limit) + return; + + /* Truncate the memblock regions to satisfy the memory limit. */ + limit = memory_limit; + for (i = 0; i < memblock.memory.cnt; i++) { + if (limit > memblock.memory.regions[i].size) { + limit -= memblock.memory.regions[i].size; + continue; + } + + memblock.memory.regions[i].size = limit; + memblock.memory.cnt = i + 1; + break; + } + + memory_limit = memblock_end_of_DRAM(); + + /* And truncate any reserves above the limit also. */ + for (i = 0; i < memblock.reserved.cnt; i++) { + p = &memblock.reserved.regions[i]; + + if (p->base > memory_limit) + p->size = 0; + else if ((p->base + p->size) > memory_limit) + p->size = memory_limit - p->base; + + if (p->size == 0) { + memblock_remove_region(&memblock.reserved, i); + i--; + } + } +} + +static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr) +{ + unsigned int left = 0, right = type->cnt; + + do { + unsigned int mid = (right + left) / 2; + + if (addr < type->regions[mid].base) + right = mid; + else if (addr >= (type->regions[mid].base + + type->regions[mid].size)) + left = mid + 1; + else + return mid; + } while (left < right); + return -1; +} + +int __init memblock_is_reserved(phys_addr_t addr) +{ + return memblock_search(&memblock.reserved, addr) != -1; +} + +int __init_memblock memblock_is_memory(phys_addr_t addr) +{ + return memblock_search(&memblock.memory, addr) != -1; +} + +int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) +{ + int idx = memblock_search(&memblock.reserved, base); + + if (idx == -1) + return 0; + return memblock.reserved.regions[idx].base <= base && + (memblock.reserved.regions[idx].base + + memblock.reserved.regions[idx].size) >= (base + size); +} + +int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) +{ + return memblock_overlaps_region(&memblock.reserved, base, size) >= 0; +} + + +void __init_memblock memblock_set_current_limit(phys_addr_t limit) +{ + memblock.current_limit = limit; +} + +static void __init_memblock memblock_dump(struct memblock_type *region, char *name) +{ + unsigned long long base, size; + int i; + + pr_info(" %s.cnt = 0x%lx\n", name, region->cnt); + + for (i = 0; i < region->cnt; i++) { + base = region->regions[i].base; + size = region->regions[i].size; + + pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n", + name, i, base, base + size - 1, size); + } +} + +void __init_memblock memblock_dump_all(void) +{ + if (!memblock_debug) + return; + + pr_info("MEMBLOCK configuration:\n"); + pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size); + + memblock_dump(&memblock.memory, "memory"); + memblock_dump(&memblock.reserved, "reserved"); +} + +void __init memblock_analyze(void) +{ + int i; + + /* Check marker in the unused last array entry */ + WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base + != (phys_addr_t)RED_INACTIVE); + WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base + != (phys_addr_t)RED_INACTIVE); + + memblock.memory_size = 0; + + for (i = 0; i < memblock.memory.cnt; i++) + memblock.memory_size += memblock.memory.regions[i].size; + + /* We allow resizing from there */ + memblock_can_resize = 1; +} + +void __init memblock_init(void) +{ + static int init_done __initdata = 0; + + if (init_done) + return; + init_done = 1; + + /* Hookup the initial arrays */ + memblock.memory.regions = memblock_memory_init_regions; + memblock.memory.max = INIT_MEMBLOCK_REGIONS; + memblock.reserved.regions = memblock_reserved_init_regions; + memblock.reserved.max = INIT_MEMBLOCK_REGIONS; + + /* Write a marker in the unused last array entry */ + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + + /* Create a dummy zero size MEMBLOCK which will get coalesced away later. + * This simplifies the memblock_add() code below... + */ + memblock.memory.regions[0].base = 0; + memblock.memory.regions[0].size = 0; + memblock.memory.cnt = 1; + + /* Ditto. */ + memblock.reserved.regions[0].base = 0; + memblock.reserved.regions[0].size = 0; + memblock.reserved.cnt = 1; + + memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE; +} + +static int __init early_memblock(char *p) +{ + if (p && strstr(p, "debug")) + memblock_debug = 1; + return 0; +} +early_param("memblock", early_memblock); + +#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK) + +static int memblock_debug_show(struct seq_file *m, void *private) +{ + struct memblock_type *type = m->private; + struct memblock_region *reg; + int i; + + for (i = 0; i < type->cnt; i++) { + reg = &type->regions[i]; + seq_printf(m, "%4d: ", i); + if (sizeof(phys_addr_t) == 4) + seq_printf(m, "0x%08lx..0x%08lx\n", + (unsigned long)reg->base, + (unsigned long)(reg->base + reg->size - 1)); + else + seq_printf(m, "0x%016llx..0x%016llx\n", + (unsigned long long)reg->base, + (unsigned long long)(reg->base + reg->size - 1)); + + } + return 0; +} + +static int memblock_debug_open(struct inode *inode, struct file *file) +{ + return single_open(file, memblock_debug_show, inode->i_private); +} + +static const struct file_operations memblock_debug_fops = { + .open = memblock_debug_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init memblock_init_debugfs(void) +{ + struct dentry *root = debugfs_create_dir("memblock", NULL); + if (!root) + return -ENXIO; + debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops); + debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops); + + return 0; +} +__initcall(memblock_init_debugfs); + +#endif /* CONFIG_DEBUG_FS */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2fa20d..00bb8a6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6,6 +6,10 @@ * Copyright 2007 OpenVZ SWsoft Inc * Author: Pavel Emelianov <xemul@openvz.org> * + * Memory thresholds + * Copyright (C) 2009 Nokia Corporation + * Author: Kirill A. Shutemov + * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -21,6 +25,7 @@ #include <linux/memcontrol.h> #include <linux/cgroup.h> #include <linux/mm.h> +#include <linux/hugetlb.h> #include <linux/pagemap.h> #include <linux/smp.h> #include <linux/page-flags.h> @@ -29,30 +34,54 @@ #include <linux/rcupdate.h> #include <linux/limits.h> #include <linux/mutex.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> +#include <linux/swapops.h> #include <linux/spinlock.h> +#include <linux/eventfd.h> +#include <linux/sort.h> #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> #include <linux/mm_inline.h> #include <linux/page_cgroup.h> +#include <linux/cpu.h> +#include <linux/oom.h> #include "internal.h" #include <asm/uaccess.h> +#include <trace/events/vmscan.h> + struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 +struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; -static int really_do_swap_account __initdata = 1; /* for remember boot option*/ + +/* for remember boot option*/ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED +static int really_do_swap_account __initdata = 1; +#else +static int really_do_swap_account __initdata = 0; +#endif + #else #define do_swap_account (0) #endif -static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ +/* + * Per memcg event counter is incremented at every pagein/pageout. This counter + * is used for trigger some periodic events. This is straightforward and better + * than using jiffies etc. to handle periodic memcg event. + * + * These values will be used as !((event) & ((1 <<(thresh)) - 1)) + */ +#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */ +#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */ /* * Statistics for memory cgroup. @@ -63,50 +92,23 @@ enum mem_cgroup_stat_index { */ MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ - MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ + MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ + MEM_CGROUP_STAT_DATA, /* end of data requires synchronization */ + /* incremented at every pagein/pageout */ + MEM_CGROUP_EVENTS = MEM_CGROUP_STAT_DATA, + MEM_CGROUP_ON_MOVE, /* someone is moving account between groups */ MEM_CGROUP_STAT_NSTATS, }; struct mem_cgroup_stat_cpu { s64 count[MEM_CGROUP_STAT_NSTATS]; -} ____cacheline_aligned_in_smp; - -struct mem_cgroup_stat { - struct mem_cgroup_stat_cpu cpustat[0]; }; /* - * For accounting under irq disable, no need for increment preempt count. - */ -static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, - enum mem_cgroup_stat_index idx, int val) -{ - stat->count[idx] += val; -} - -static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, - enum mem_cgroup_stat_index idx) -{ - int cpu; - s64 ret = 0; - for_each_possible_cpu(cpu) - ret += stat->cpustat[cpu].count[idx]; - return ret; -} - -static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat) -{ - s64 ret; - - ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE); - ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS); - return ret; -} - -/* * per-zone information in memory controller. */ struct mem_cgroup_per_zone { @@ -117,6 +119,12 @@ struct mem_cgroup_per_zone { unsigned long count[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; + struct rb_node tree_node; /* RB tree node */ + unsigned long long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; + struct mem_cgroup *mem; /* Back pointer, we cannot */ + /* use container_of */ }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -130,6 +138,61 @@ struct mem_cgroup_lru_info { }; /* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { + struct rb_root rb_root; + spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + +struct mem_cgroup_threshold { + struct eventfd_ctx *eventfd; + u64 threshold; +}; + +/* For threshold */ +struct mem_cgroup_threshold_ary { + /* An array index points to threshold just below usage. */ + int current_threshold; + /* Size of entries[] */ + unsigned int size; + /* Array of thresholds */ + struct mem_cgroup_threshold entries[0]; +}; + +struct mem_cgroup_thresholds { + /* Primary thresholds array */ + struct mem_cgroup_threshold_ary *primary; + /* + * Spare threshold array. + * This is needed to make mem_cgroup_unregister_event() "never fail". + * It must be able to store at least primary->size - 1 entries. + */ + struct mem_cgroup_threshold_ary *spare; +}; + +/* for OOM */ +struct mem_cgroup_eventfd_list { + struct list_head list; + struct eventfd_ctx *eventfd; +}; + +static void mem_cgroup_threshold(struct mem_cgroup *mem); +static void mem_cgroup_oom_notify(struct mem_cgroup *mem); + +/* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, @@ -161,10 +224,8 @@ struct mem_cgroup { */ spinlock_t reclaim_param_lock; - int prev_priority; /* for recording reclaim priority */ - /* - * While reclaiming in a hiearchy, we cache the last child we + * While reclaiming in a hierarchy, we cache the last child we * reclaimed from. */ int last_scanned_child; @@ -172,20 +233,91 @@ struct mem_cgroup { * Should the accounting and control be hierarchical, per subtree? */ bool use_hierarchy; - unsigned long last_oom_jiffies; + atomic_t oom_lock; atomic_t refcnt; unsigned int swappiness; + /* OOM-Killer disable */ + int oom_kill_disable; /* set when res.limit == memsw.limit */ bool memsw_is_minimum; + /* protect arrays of thresholds */ + struct mutex thresholds_lock; + + /* thresholds for memory usage. RCU-protected */ + struct mem_cgroup_thresholds thresholds; + + /* thresholds for mem+swap usage. RCU-protected */ + struct mem_cgroup_thresholds memsw_thresholds; + + /* For oom notifier event fd */ + struct list_head oom_notify; + + /* + * Should we move charges of a task when a task is moved into this + * mem_cgroup ? And what type of charges should we move ? + */ + unsigned long move_charge_at_immigrate; /* - * statistics. This must be placed at the end of memcg. + * percpu counter. */ - struct mem_cgroup_stat stat; + struct mem_cgroup_stat_cpu *stat; + /* + * used when a cpu is offlined or other synchronizations + * See mem_cgroup_read_stat(). + */ + struct mem_cgroup_stat_cpu nocpu_base; + spinlock_t pcp_counter_lock; +}; + +/* Stuffs for move charges at task migration. */ +/* + * Types of charges to be moved. "move_charge_at_immitgrate" is treated as a + * left-shifted bitmap of these types. + */ +enum move_type { + MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */ + MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */ + NR_MOVE_TYPE, +}; + +/* "mc" and its members are protected by cgroup_mutex */ +static struct move_charge_struct { + spinlock_t lock; /* for from, to */ + struct mem_cgroup *from; + struct mem_cgroup *to; + unsigned long precharge; + unsigned long moved_charge; + unsigned long moved_swap; + struct task_struct *moving_task; /* a task moving charges */ + struct mm_struct *mm; + wait_queue_head_t waitq; /* a waitq for other context */ +} mc = { + .lock = __SPIN_LOCK_UNLOCKED(mc.lock), + .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq), }; +static bool move_anon(void) +{ + return test_bit(MOVE_CHARGE_TYPE_ANON, + &mc.to->move_charge_at_immigrate); +} + +static bool move_file(void) +{ + return test_bit(MOVE_CHARGE_TYPE_FILE, + &mc.to->move_charge_at_immigrate); +} + +/* + * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft + * limit reclaim to prevent infinite loops, if they ever occur. + */ +#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) + enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, @@ -200,48 +332,33 @@ enum charge_type { #define PCGF_CACHE (1UL << PCG_CACHE) #define PCGF_USED (1UL << PCG_USED) #define PCGF_LOCK (1UL << PCG_LOCK) -static const unsigned long -pcg_default_flags[NR_CHARGE_TYPE] = { - PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ - PCGF_USED | PCGF_LOCK, /* Anon */ - PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ - 0, /* FORCE */ -}; +/* Not used, but added here for completeness */ +#define PCGF_ACCT (1UL << PCG_ACCT) /* for encoding cft->private value on file */ #define _MEM (0) #define _MEMSWAP (1) +#define _OOM_TYPE (2) #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +/* Used for OOM nofiier */ +#define OOM_CONTROL (0) + +/* + * Reclaim flags for mem_cgroup_hierarchical_reclaim + */ +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 +#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) +#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 +#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) +#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 +#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); - -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, - struct page_cgroup *pc, - bool charge) -{ - int val = (charge)? 1 : -1; - struct mem_cgroup_stat *stat = &mem->stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu = get_cpu(); - - cpustat = &stat->cpustat[cpu]; - if (PageCgroupCache(pc)) - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); - else - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); - - if (charge) - __mem_cgroup_stat_add_safe(cpustat, - MEM_CGROUP_STAT_PGPGIN_COUNT, 1); - else - __mem_cgroup_stat_add_safe(cpustat, - MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); - put_cpu(); -} +static void drain_all_stock_async(void); static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) @@ -249,6 +366,11 @@ mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) return &mem->info.nodeinfo[nid]->zoneinfo[zid]; } +struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *mem) +{ + return &mem->css; +} + static struct mem_cgroup_per_zone * page_cgroup_zoneinfo(struct page_cgroup *pc) { @@ -262,6 +384,244 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) return mem_cgroup_zoneinfo(mem, nid, zid); } +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_node_zone(int nid, int zid) +{ + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_from_page(struct page *page) +{ + int nid = page_to_nid(page); + int zid = page_zonenum(page); + + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static void +__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) +{ + struct rb_node **p = &mctz->rb_root.rb_node; + struct rb_node *parent = NULL; + struct mem_cgroup_per_zone *mz_node; + + if (mz->on_tree) + return; + + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; + while (*p) { + parent = *p; + mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + tree_node); + if (mz->usage_in_excess < mz_node->usage_in_excess) + p = &(*p)->rb_left; + /* + * We can't avoid mem cgroups that are over their soft + * limit by the same amount + */ + else if (mz->usage_in_excess >= mz_node->usage_in_excess) + p = &(*p)->rb_right; + } + rb_link_node(&mz->tree_node, parent, p); + rb_insert_color(&mz->tree_node, &mctz->rb_root); + mz->on_tree = true; +} + +static void +__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + if (!mz->on_tree) + return; + rb_erase(&mz->tree_node, &mctz->rb_root); + mz->on_tree = false; +} + +static void +mem_cgroup_remove_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + spin_lock(&mctz->lock); + __mem_cgroup_remove_exceeded(mem, mz, mctz); + spin_unlock(&mctz->lock); +} + + +static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) +{ + unsigned long long excess; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + int nid = page_to_nid(page); + int zid = page_zonenum(page); + mctz = soft_limit_tree_from_page(page); + + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; mem; mem = parent_mem_cgroup(mem)) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + excess = res_counter_soft_limit_excess(&mem->res); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + spin_lock(&mctz->lock); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(mem, mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + } + } +} + +static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) +{ + int node, zone; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + + for_each_node_state(node, N_POSSIBLE) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + mz = mem_cgroup_zoneinfo(mem, node, zone); + mctz = soft_limit_tree_node_zone(node, zone); + mem_cgroup_remove_exceeded(mem, mz, mctz); + } + } +} + +static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) +{ + return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; +} + +static struct mem_cgroup_per_zone * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct rb_node *rightmost = NULL; + struct mem_cgroup_per_zone *mz; + +retry: + mz = NULL; + rightmost = rb_last(&mctz->rb_root); + if (!rightmost) + goto done; /* Nothing to reclaim from */ + + mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + /* + * Remove the node now but someone else can add it back, + * we will to add it back at the end of reclaim to its correct + * position in the tree. + */ + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + if (!res_counter_soft_limit_excess(&mz->mem->res) || + !css_tryget(&mz->mem->css)) + goto retry; +done: + return mz; +} + +static struct mem_cgroup_per_zone * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct mem_cgroup_per_zone *mz; + + spin_lock(&mctz->lock); + mz = __mem_cgroup_largest_soft_limit_node(mctz); + spin_unlock(&mctz->lock); + return mz; +} + +/* + * Implementation Note: reading percpu statistics for memcg. + * + * Both of vmstat[] and percpu_counter has threshold and do periodic + * synchronization to implement "quick" read. There are trade-off between + * reading cost and precision of value. Then, we may have a chance to implement + * a periodic synchronizion of counter in memcg's counter. + * + * But this _read() function is used for user interface now. The user accounts + * memory usage by memory cgroup and he _always_ requires exact value because + * he accounts memory. Even if we provide quick-and-fuzzy read, we always + * have to visit all online cpus and make sum. So, for now, unnecessary + * synchronization is not implemented. (just implemented for cpu hotplug) + * + * If there are kernel internal actions which can make use of some not-exact + * value, and reading all cpu value can be performance bottleneck in some + * common workload, threashold and synchonization as vmstat[] should be + * implemented. + */ +static s64 mem_cgroup_read_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) +{ + int cpu; + s64 val = 0; + + get_online_cpus(); + for_each_online_cpu(cpu) + val += per_cpu(mem->stat->count[idx], cpu); +#ifdef CONFIG_HOTPLUG_CPU + spin_lock(&mem->pcp_counter_lock); + val += mem->nocpu_base.count[idx]; + spin_unlock(&mem->pcp_counter_lock); +#endif + put_online_cpus(); + return val; +} + +static s64 mem_cgroup_local_usage(struct mem_cgroup *mem) +{ + s64 ret; + + ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); + ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); + return ret; +} + +static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, + bool charge) +{ + int val = (charge) ? 1 : -1; + this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); +} + +static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, + struct page_cgroup *pc, + bool charge) +{ + int val = (charge) ? 1 : -1; + + preempt_disable(); + + if (PageCgroupCache(pc)) + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); + else + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); + + if (charge) + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); + else + __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); + + preempt_enable(); +} + static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { @@ -277,6 +637,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, return total; } +static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift) +{ + s64 val; + + val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]); + + return !(val & ((1 << event_mask_shift) - 1)); +} + +/* + * Check events in order. + * + */ +static void memcg_check_events(struct mem_cgroup *mem, struct page *page) +{ + /* threshold event is triggered in finer grain than soft limit */ + if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) { + mem_cgroup_threshold(mem); + if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH))) + mem_cgroup_update_tree(mem, page); + } +} + static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { return container_of(cgroup_subsys_state(cont, @@ -319,39 +702,87 @@ static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) return mem; } -/* - * Call callback function against all cgroup under hierarchy tree. - */ -static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, - int (*func)(struct mem_cgroup *, void *)) +/* The caller has to guarantee "mem" exists before calling this */ +static struct mem_cgroup *mem_cgroup_start_loop(struct mem_cgroup *mem) { - int found, ret, nextid; struct cgroup_subsys_state *css; - struct mem_cgroup *mem; - - if (!root->use_hierarchy) - return (*func)(root, data); + int found; - nextid = 1; - do { - ret = 0; + if (!mem) /* ROOT cgroup has the smallest ID */ + return root_mem_cgroup; /*css_put/get against root is ignored*/ + if (!mem->use_hierarchy) { + if (css_tryget(&mem->css)) + return mem; + return NULL; + } + rcu_read_lock(); + /* + * searching a memory cgroup which has the smallest ID under given + * ROOT cgroup. (ID >= 1) + */ + css = css_get_next(&mem_cgroup_subsys, 1, &mem->css, &found); + if (css && css_tryget(css)) + mem = container_of(css, struct mem_cgroup, css); + else mem = NULL; + rcu_read_unlock(); + return mem; +} +static struct mem_cgroup *mem_cgroup_get_next(struct mem_cgroup *iter, + struct mem_cgroup *root, + bool cond) +{ + int nextid = css_id(&iter->css) + 1; + int found; + int hierarchy_used; + struct cgroup_subsys_state *css; + + hierarchy_used = iter->use_hierarchy; + + css_put(&iter->css); + /* If no ROOT, walk all, ignore hierarchy */ + if (!cond || (root && !hierarchy_used)) + return NULL; + + if (!root) + root = root_mem_cgroup; + + do { + iter = NULL; rcu_read_lock(); - css = css_get_next(&mem_cgroup_subsys, nextid, &root->css, - &found); + + css = css_get_next(&mem_cgroup_subsys, nextid, + &root->css, &found); if (css && css_tryget(css)) - mem = container_of(css, struct mem_cgroup, css); + iter = container_of(css, struct mem_cgroup, css); rcu_read_unlock(); - - if (mem) { - ret = (*func)(mem, data); - css_put(&mem->css); - } + /* If css is NULL, no more cgroups will be found */ nextid = found + 1; - } while (!ret && css); + } while (css && !iter); - return ret; + return iter; +} +/* + * for_eacn_mem_cgroup_tree() for visiting all cgroup under tree. Please + * be careful that "break" loop is not allowed. We have reference count. + * Instead of that modify "cond" to be false and "continue" to exit the loop. + */ +#define for_each_mem_cgroup_tree_cond(iter, root, cond) \ + for (iter = mem_cgroup_start_loop(root);\ + iter != NULL;\ + iter = mem_cgroup_get_next(iter, root, cond)) + +#define for_each_mem_cgroup_tree(iter, root) \ + for_each_mem_cgroup_tree_cond(iter, root, true) + +#define for_each_mem_cgroup_all(iter) \ + for_each_mem_cgroup_tree_cond(iter, NULL, true) + + +static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) +{ + return (mem == root_mem_cgroup); } /* @@ -371,22 +802,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) { struct page_cgroup *pc; - struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); /* can happen while we handle swapcache. */ - if (list_empty(&pc->lru) || !pc->mem_cgroup) + if (!TestClearPageCgroupAcctLRU(pc)) return; + VM_BUG_ON(!pc->mem_cgroup); /* * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. */ mz = page_cgroup_zoneinfo(pc); - mem = pc->mem_cgroup; MEM_CGROUP_ZSTAT(mz, lru) -= 1; + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); return; } @@ -410,8 +843,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) * For making pc->mem_cgroup visible, insert smp_rmb() here. */ smp_rmb(); - /* unused page is not rotated. */ - if (!PageCgroupUsed(pc)) + /* unused or root page is not rotated. */ + if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -425,6 +858,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); + VM_BUG_ON(PageCgroupAcctLRU(pc)); /* * Used bit is set without atomic ops but after smp_wmb(). * For making pc->mem_cgroup visible, insert smp_rmb() here. @@ -435,6 +869,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) mz = page_cgroup_zoneinfo(pc); MEM_CGROUP_ZSTAT(mz, lru) += 1; + SetPageCgroupAcctLRU(pc); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; list_add(&pc->lru, &mz->lists[lru]); } @@ -469,7 +906,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && list_empty(&pc->lru)) + if (PageLRU(page) && !PageCgroupAcctLRU(pc)) mem_cgroup_add_lru_list(page, page_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -488,15 +925,22 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) { int ret; struct mem_cgroup *curr = NULL; + struct task_struct *p; - task_lock(task); - rcu_read_lock(); - curr = try_get_mem_cgroup_from_mm(task->mm); - rcu_read_unlock(); - task_unlock(task); + p = find_lock_task_mm(task); + if (!p) + return 0; + curr = try_get_mem_cgroup_from_mm(p->mm); + task_unlock(p); if (!curr) return 0; - if (curr->use_hierarchy) + /* + * We should check use_hierarchy of "mem" not "curr". Because checking + * use_hierarchy of "curr" here make this function true if hierarchy is + * enabled in "curr" and "curr" is a child of "mem" in *cgroup* + * hierarchy(even if use_hierarchy is disabled in "mem"). + */ + if (mem->use_hierarchy) ret = css_is_ancestor(&curr->css, &mem->css); else ret = (curr == mem); @@ -504,35 +948,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) return ret; } -/* - * prev_priority control...this will be used in memory reclaim path. - */ -int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) -{ - int prev_priority; - - spin_lock(&mem->reclaim_param_lock); - prev_priority = mem->prev_priority; - spin_unlock(&mem->reclaim_param_lock); - - return prev_priority; -} - -void mem_cgroup_note_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - if (priority < mem->prev_priority) - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - -void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) -{ - spin_lock(&mem->reclaim_param_lock); - mem->prev_priority = priority; - spin_unlock(&mem->reclaim_param_lock); -} - static int calc_inactive_ratio(struct mem_cgroup *memcg, unsigned long *present_pages) { unsigned long active; @@ -590,7 +1005,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone *zone, enum lru_list lru) { - int nid = zone->zone_pgdat->node_id; + int nid = zone_to_nid(zone); int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); @@ -600,7 +1015,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg, struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg, struct zone *zone) { - int nid = zone->zone_pgdat->node_id; + int nid = zone_to_nid(zone); int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid); @@ -645,10 +1060,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, LIST_HEAD(pc_list); struct list_head *src; struct page_cgroup *pc, *tmp; - int nid = z->zone_pgdat->node_id; + int nid = zone_to_nid(z); int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; - int lru = LRU_FILE * !!file + !!active; + int lru = LRU_FILE * file + active; int ret; BUG_ON(!mem_cont); @@ -684,6 +1099,10 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, } *scanned = scan; + + trace_mm_vmscan_memcg_isolate(0, nr_to_scan, scan, nr_taken, + 0, 0, 0, mode); + return nr_taken; } @@ -718,15 +1137,94 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg) return swappiness; } -static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data) +static void mem_cgroup_start_move(struct mem_cgroup *mem) { - int *val = data; - (*val)++; - return 0; + int cpu; + + get_online_cpus(); + spin_lock(&mem->pcp_counter_lock); + for_each_online_cpu(cpu) + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) += 1; + mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] += 1; + spin_unlock(&mem->pcp_counter_lock); + put_online_cpus(); + + synchronize_rcu(); +} + +static void mem_cgroup_end_move(struct mem_cgroup *mem) +{ + int cpu; + + if (!mem) + return; + get_online_cpus(); + spin_lock(&mem->pcp_counter_lock); + for_each_online_cpu(cpu) + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) -= 1; + mem->nocpu_base.count[MEM_CGROUP_ON_MOVE] -= 1; + spin_unlock(&mem->pcp_counter_lock); + put_online_cpus(); +} +/* + * 2 routines for checking "mem" is under move_account() or not. + * + * mem_cgroup_stealed() - checking a cgroup is mc.from or not. This is used + * for avoiding race in accounting. If true, + * pc->mem_cgroup may be overwritten. + * + * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or + * under hierarchy of moving cgroups. This is for + * waiting at hith-memory prressure caused by "move". + */ + +static bool mem_cgroup_stealed(struct mem_cgroup *mem) +{ + VM_BUG_ON(!rcu_read_lock_held()); + return this_cpu_read(mem->stat->count[MEM_CGROUP_ON_MOVE]) > 0; +} + +static bool mem_cgroup_under_move(struct mem_cgroup *mem) +{ + struct mem_cgroup *from; + struct mem_cgroup *to; + bool ret = false; + /* + * Unlike task_move routines, we access mc.to, mc.from not under + * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. + */ + spin_lock(&mc.lock); + from = mc.from; + to = mc.to; + if (!from) + goto unlock; + if (from == mem || to == mem + || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css)) + || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css))) + ret = true; +unlock: + spin_unlock(&mc.lock); + return ret; +} + +static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem) +{ + if (mc.moving_task && current != mc.moving_task) { + if (mem_cgroup_under_move(mem)) { + DEFINE_WAIT(wait); + prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); + /* moving charge context might have finished. */ + if (mc.moving_task) + schedule(); + finish_wait(&mc.waitq, &wait); + return true; + } + } + return false; } /** - * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode. + * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode. * @memcg: The memory cgroup that went over limit * @p: Task that is going to be killed * @@ -745,7 +1243,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) static char memcg_name[PATH_MAX]; int ret; - if (!memcg) + if (!memcg || !p) return; @@ -799,11 +1297,32 @@ done: static int mem_cgroup_count_children(struct mem_cgroup *mem) { int num = 0; - mem_cgroup_walk_tree(mem, &num, mem_cgroup_count_children_cb); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + num++; return num; } /* + * Return the memory (and swap, if configured) limit for a memcg. + */ +u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +{ + u64 limit; + u64 memsw; + + limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + + total_swap_pages; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + /* + * If memsw is finite and limits the amount of swap space available + * to this memcg, return that limit. + */ + return min(limit, memsw); +} + +/* * Visit the first child (need not be the first child as per the ordering * of the cgroup list, since we track last_scanned_child) of @mem and use * that to reclaim free pages from. @@ -855,28 +1374,63 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * If shrink==true, for avoiding to free too much, this returns immedieately. */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, - gfp_t gfp_mask, bool noswap, bool shrink) + struct zone *zone, + gfp_t gfp_mask, + unsigned long reclaim_options) { struct mem_cgroup *victim; int ret, total = 0; int loop = 0; + bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; + bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; + bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; + unsigned long excess = mem_cgroup_get_excess(root_mem); /* If memsw_is_minimum==1, swap-out is of-no-use. */ if (root_mem->memsw_is_minimum) noswap = true; - while (loop < 2) { + while (1) { victim = mem_cgroup_select_victim(root_mem); - if (victim == root_mem) + if (victim == root_mem) { loop++; - if (!mem_cgroup_local_usage(&victim->stat)) { + if (loop >= 1) + drain_all_stock_async(); + if (loop >= 2) { + /* + * If we have not been able to reclaim + * anything, it might because there are + * no reclaimable pages under this hierarchy + */ + if (!check_soft || !total) { + css_put(&victim->css); + break; + } + /* + * We want to do more targetted reclaim. + * excess >> 2 is not to excessive so as to + * reclaim too much, nor too less that we keep + * coming back to reclaim from this cgroup + */ + if (total >= (excess >> 2) || + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { + css_put(&victim->css); + break; + } + } + } + if (!mem_cgroup_local_usage(victim)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; } /* we use swappiness of local cgroup */ - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, - get_swappiness(victim)); + if (check_soft) + ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, + noswap, get_swappiness(victim), zone); + else + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, + noswap, get_swappiness(victim)); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -886,77 +1440,446 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (shrink) return ret; total += ret; - if (mem_cgroup_check_under_limit(root_mem)) + if (check_soft) { + if (res_counter_check_under_soft_limit(&root_mem->res)) + return total; + } else if (mem_cgroup_check_under_limit(root_mem)) return 1 + total; } return total; } -bool mem_cgroup_oom_called(struct task_struct *task) +/* + * Check OOM-Killer is already running under our hierarchy. + * If someone is running, return false. + */ +static bool mem_cgroup_oom_lock(struct mem_cgroup *mem) { - bool ret = false; - struct mem_cgroup *mem; - struct mm_struct *mm; + int x, lock_count = 0; + struct mem_cgroup *iter; - rcu_read_lock(); - mm = task->mm; - if (!mm) - mm = &init_mm; - mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10)) - ret = true; - rcu_read_unlock(); - return ret; + for_each_mem_cgroup_tree(iter, mem) { + x = atomic_inc_return(&iter->oom_lock); + lock_count = max(x, lock_count); + } + + if (lock_count == 1) + return true; + return false; } -static int record_last_oom_cb(struct mem_cgroup *mem, void *data) +static int mem_cgroup_oom_unlock(struct mem_cgroup *mem) { - mem->last_oom_jiffies = jiffies; + struct mem_cgroup *iter; + + /* + * When a new child is created while the hierarchy is under oom, + * mem_cgroup_oom_lock() may not be called. We have to use + * atomic_add_unless() here. + */ + for_each_mem_cgroup_tree(iter, mem) + atomic_add_unless(&iter->oom_lock, -1, 0); return 0; } -static void record_last_oom(struct mem_cgroup *mem) + +static DEFINE_MUTEX(memcg_oom_mutex); +static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); + +struct oom_wait_info { + struct mem_cgroup *mem; + wait_queue_t wait; +}; + +static int memcg_oom_wake_function(wait_queue_t *wait, + unsigned mode, int sync, void *arg) +{ + struct mem_cgroup *wake_mem = (struct mem_cgroup *)arg; + struct oom_wait_info *oom_wait_info; + + oom_wait_info = container_of(wait, struct oom_wait_info, wait); + + if (oom_wait_info->mem == wake_mem) + goto wakeup; + /* if no hierarchy, no match */ + if (!oom_wait_info->mem->use_hierarchy || !wake_mem->use_hierarchy) + return 0; + /* + * Both of oom_wait_info->mem and wake_mem are stable under us. + * Then we can use css_is_ancestor without taking care of RCU. + */ + if (!css_is_ancestor(&oom_wait_info->mem->css, &wake_mem->css) && + !css_is_ancestor(&wake_mem->css, &oom_wait_info->mem->css)) + return 0; + +wakeup: + return autoremove_wake_function(wait, mode, sync, arg); +} + +static void memcg_wakeup_oom(struct mem_cgroup *mem) +{ + /* for filtering, pass "mem" as argument. */ + __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, mem); +} + +static void memcg_oom_recover(struct mem_cgroup *mem) +{ + if (mem && atomic_read(&mem->oom_lock)) + memcg_wakeup_oom(mem); +} + +/* + * try to call OOM killer. returns false if we should exit memory-reclaim loop. + */ +bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) { - mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb); + struct oom_wait_info owait; + bool locked, need_to_kill; + + owait.mem = mem; + owait.wait.flags = 0; + owait.wait.func = memcg_oom_wake_function; + owait.wait.private = current; + INIT_LIST_HEAD(&owait.wait.task_list); + need_to_kill = true; + /* At first, try to OOM lock hierarchy under mem.*/ + mutex_lock(&memcg_oom_mutex); + locked = mem_cgroup_oom_lock(mem); + /* + * Even if signal_pending(), we can't quit charge() loop without + * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL + * under OOM is always welcomed, use TASK_KILLABLE here. + */ + prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); + if (!locked || mem->oom_kill_disable) + need_to_kill = false; + if (locked) + mem_cgroup_oom_notify(mem); + mutex_unlock(&memcg_oom_mutex); + + if (need_to_kill) { + finish_wait(&memcg_oom_waitq, &owait.wait); + mem_cgroup_out_of_memory(mem, mask); + } else { + schedule(); + finish_wait(&memcg_oom_waitq, &owait.wait); + } + mutex_lock(&memcg_oom_mutex); + mem_cgroup_oom_unlock(mem); + memcg_wakeup_oom(mem); + mutex_unlock(&memcg_oom_mutex); + + if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current)) + return false; + /* Give chance to dying process */ + schedule_timeout(1); + return true; } /* * Currently used to update mapped file statistics, but the routine can be * generalized to update other statistics as well. + * + * Notes: Race condition + * + * We usually use page_cgroup_lock() for accessing page_cgroup member but + * it tends to be costly. But considering some conditions, we doesn't need + * to do so _always_. + * + * Considering "charge", lock_page_cgroup() is not required because all + * file-stat operations happen after a page is attached to radix-tree. There + * are no race with "charge". + * + * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup + * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even + * if there are race with "uncharge". Statistics itself is properly handled + * by flags. + * + * Considering "move", this is an only case we see a race. To make the race + * small, we check MEM_CGROUP_ON_MOVE percpu value and detect there are + * possibility of race condition. If there is, we take a lock. */ -void mem_cgroup_update_mapped_file_stat(struct page *page, int val) + +static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) { struct mem_cgroup *mem; - struct mem_cgroup_stat *stat; - struct mem_cgroup_stat_cpu *cpustat; - int cpu; - struct page_cgroup *pc; - - if (!page_is_file_cache(page)) - return; + struct page_cgroup *pc = lookup_page_cgroup(page); + bool need_unlock = false; - pc = lookup_page_cgroup(page); if (unlikely(!pc)) return; - lock_page_cgroup(pc); + rcu_read_lock(); mem = pc->mem_cgroup; - if (!mem) - goto done; + if (unlikely(!mem || !PageCgroupUsed(pc))) + goto out; + /* pc->mem_cgroup is unstable ? */ + if (unlikely(mem_cgroup_stealed(mem))) { + /* take a lock against to access pc->mem_cgroup */ + lock_page_cgroup(pc); + need_unlock = true; + mem = pc->mem_cgroup; + if (!mem || !PageCgroupUsed(pc)) + goto out; + } - if (!PageCgroupUsed(pc)) - goto done; + this_cpu_add(mem->stat->count[idx], val); + + switch (idx) { + case MEM_CGROUP_STAT_FILE_MAPPED: + if (val > 0) + SetPageCgroupFileMapped(pc); + else if (!page_mapped(page)) + ClearPageCgroupFileMapped(pc); + break; + default: + BUG(); + } + +out: + if (unlikely(need_unlock)) + unlock_page_cgroup(pc); + rcu_read_unlock(); + return; +} +void mem_cgroup_update_file_mapped(struct page *page, int val) +{ + mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); +} + +/* + * size of first charge trial. "32" comes from vmscan.c's magic value. + * TODO: maybe necessary to use big numbers in big irons. + */ +#define CHARGE_SIZE (32 * PAGE_SIZE) +struct memcg_stock_pcp { + struct mem_cgroup *cached; /* this never be root cgroup */ + int charge; + struct work_struct work; +}; +static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); +static atomic_t memcg_drain_count; + +/* + * Try to consume stocked charge on this cpu. If success, PAGE_SIZE is consumed + * from local stock and true is returned. If the stock is 0 or charges from a + * cgroup which is not current target, returns false. This stock will be + * refilled. + */ +static bool consume_stock(struct mem_cgroup *mem) +{ + struct memcg_stock_pcp *stock; + bool ret = true; + + stock = &get_cpu_var(memcg_stock); + if (mem == stock->cached && stock->charge) + stock->charge -= PAGE_SIZE; + else /* need to call res_counter_charge */ + ret = false; + put_cpu_var(memcg_stock); + return ret; +} + +/* + * Returns stocks cached in percpu to res_counter and reset cached information. + */ +static void drain_stock(struct memcg_stock_pcp *stock) +{ + struct mem_cgroup *old = stock->cached; + + if (stock->charge) { + res_counter_uncharge(&old->res, stock->charge); + if (do_swap_account) + res_counter_uncharge(&old->memsw, stock->charge); + } + stock->cached = NULL; + stock->charge = 0; +} + +/* + * This must be called under preempt disabled or must be called by + * a thread which is pinned to local cpu. + */ +static void drain_local_stock(struct work_struct *dummy) +{ + struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); + drain_stock(stock); +} + +/* + * Cache charges(val) which is from res_counter, to local per_cpu area. + * This will be consumed by consume_stock() function, later. + */ +static void refill_stock(struct mem_cgroup *mem, int val) +{ + struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); + + if (stock->cached != mem) { /* reset if necessary */ + drain_stock(stock); + stock->cached = mem; + } + stock->charge += val; + put_cpu_var(memcg_stock); +} + +/* + * Tries to drain stocked charges in other cpus. This function is asynchronous + * and just put a work per cpu for draining localy on each cpu. Caller can + * expects some charges will be back to res_counter later but cannot wait for + * it. + */ +static void drain_all_stock_async(void) +{ + int cpu; + /* This function is for scheduling "drain" in asynchronous way. + * The result of "drain" is not directly handled by callers. Then, + * if someone is calling drain, we don't have to call drain more. + * Anyway, WORK_STRUCT_PENDING check in queue_work_on() will catch if + * there is a race. We just do loose check here. + */ + if (atomic_read(&memcg_drain_count)) + return; + /* Notify other cpus that system-wide "drain" is running */ + atomic_inc(&memcg_drain_count); + get_online_cpus(); + for_each_online_cpu(cpu) { + struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); + schedule_work_on(cpu, &stock->work); + } + put_online_cpus(); + atomic_dec(&memcg_drain_count); + /* We don't wait for flush_work */ +} + +/* This is a synchronous drain interface. */ +static void drain_all_stock_sync(void) +{ + /* called when force_empty is called */ + atomic_inc(&memcg_drain_count); + schedule_on_each_cpu(drain_local_stock); + atomic_dec(&memcg_drain_count); +} + +/* + * This function drains percpu counter value from DEAD cpu and + * move it to local cpu. Note that this function can be preempted. + */ +static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *mem, int cpu) +{ + int i; + + spin_lock(&mem->pcp_counter_lock); + for (i = 0; i < MEM_CGROUP_STAT_DATA; i++) { + s64 x = per_cpu(mem->stat->count[i], cpu); + + per_cpu(mem->stat->count[i], cpu) = 0; + mem->nocpu_base.count[i] += x; + } + /* need to clear ON_MOVE value, works as a kind of lock. */ + per_cpu(mem->stat->count[MEM_CGROUP_ON_MOVE], cpu) = 0; + spin_unlock(&mem->pcp_counter_lock); +} + +static void synchronize_mem_cgroup_on_move(struct mem_cgroup *mem, int cpu) +{ + int idx = MEM_CGROUP_ON_MOVE; + + spin_lock(&mem->pcp_counter_lock); + per_cpu(mem->stat->count[idx], cpu) = mem->nocpu_base.count[idx]; + spin_unlock(&mem->pcp_counter_lock); +} + +static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, + unsigned long action, + void *hcpu) +{ + int cpu = (unsigned long)hcpu; + struct memcg_stock_pcp *stock; + struct mem_cgroup *iter; + + if ((action == CPU_ONLINE)) { + for_each_mem_cgroup_all(iter) + synchronize_mem_cgroup_on_move(iter, cpu); + return NOTIFY_OK; + } + + if ((action != CPU_DEAD) || action != CPU_DEAD_FROZEN) + return NOTIFY_OK; + + for_each_mem_cgroup_all(iter) + mem_cgroup_drain_pcp_counter(iter, cpu); + + stock = &per_cpu(memcg_stock, cpu); + drain_stock(stock); + return NOTIFY_OK; +} + + +/* See __mem_cgroup_try_charge() for details */ +enum { + CHARGE_OK, /* success */ + CHARGE_RETRY, /* need to retry but retry is not bad */ + CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ + CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ + CHARGE_OOM_DIE, /* the current is killed because of OOM */ +}; + +static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, + int csize, bool oom_check) +{ + struct mem_cgroup *mem_over_limit; + struct res_counter *fail_res; + unsigned long flags = 0; + int ret; + + ret = res_counter_charge(&mem->res, csize, &fail_res); + + if (likely(!ret)) { + if (!do_swap_account) + return CHARGE_OK; + ret = res_counter_charge(&mem->memsw, csize, &fail_res); + if (likely(!ret)) + return CHARGE_OK; + + mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); + flags |= MEM_CGROUP_RECLAIM_NOSWAP; + } else + mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); + + if (csize > PAGE_SIZE) /* change csize and retry */ + return CHARGE_RETRY; + + if (!(gfp_mask & __GFP_WAIT)) + return CHARGE_WOULDBLOCK; + + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, + gfp_mask, flags); /* - * Preemption is already disabled, we don't need get_cpu() + * try_to_free_mem_cgroup_pages() might not give us a full + * picture of reclaim. Some pages are reclaimed and might be + * moved to swap cache or just unmapped from the cgroup. + * Check the limit again to see if the reclaim reduced the + * current usage of the cgroup before giving up */ - cpu = smp_processor_id(); - stat = &mem->stat; - cpustat = &stat->cpustat[cpu]; + if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + return CHARGE_RETRY; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, val); -done: - unlock_page_cgroup(pc); + /* + * At task move, charge accounts can be doubly counted. So, it's + * better to wait until the end of task_move if something is going on. + */ + if (mem_cgroup_wait_acct_move(mem_over_limit)) + return CHARGE_RETRY; + + /* If we don't need to call oom-killer at el, return immediately */ + if (!oom_check) + return CHARGE_NOMEM; + /* check OOM */ + if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) + return CHARGE_OOM_DIE; + + return CHARGE_RETRY; } /* @@ -964,18 +1887,21 @@ done: * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, - bool oom) + gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) { - struct mem_cgroup *mem, *mem_over_limit; - int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct res_counter *fail_res; + int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + struct mem_cgroup *mem = NULL; + int ret; + int csize = CHARGE_SIZE; - if (unlikely(test_thread_flag(TIF_MEMDIE))) { - /* Don't account this! */ - *memcg = NULL; - return 0; - } + /* + * Unlike gloval-vm's OOM-kill, we're not in memory shortage + * in system level. So, allow to go ahead dying process in addition to + * MEMDIE process. + */ + if (unlikely(test_thread_flag(TIF_MEMDIE) + || fatal_signal_pending(current))) + goto bypass; /* * We always charge the cgroup the mm_struct belongs to. @@ -983,75 +1909,132 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ - mem = *memcg; - if (likely(!mem)) { - mem = try_get_mem_cgroup_from_mm(mm); - *memcg = mem; - } else { + if (!*memcg && !mm) + goto bypass; +again: + if (*memcg) { /* css should be a valid one */ + mem = *memcg; + VM_BUG_ON(css_is_removed(&mem->css)); + if (mem_cgroup_is_root(mem)) + goto done; + if (consume_stock(mem)) + goto done; css_get(&mem->css); - } - if (unlikely(!mem)) - return 0; - - VM_BUG_ON(css_is_removed(&mem->css)); + } else { + struct task_struct *p; - while (1) { - int ret; - bool noswap = false; + rcu_read_lock(); + p = rcu_dereference(mm->owner); + /* + * Because we don't have task_lock(), "p" can exit. + * In that case, "mem" can point to root or p can be NULL with + * race with swapoff. Then, we have small risk of mis-accouning. + * But such kind of mis-account by race always happens because + * we don't have cgroup_mutex(). It's overkill and we allo that + * small race, here. + * (*) swapoff at el will charge against mm-struct not against + * task-struct. So, mm->owner can be NULL. + */ + mem = mem_cgroup_from_task(p); + if (!mem || mem_cgroup_is_root(mem)) { + rcu_read_unlock(); + goto done; + } + if (consume_stock(mem)) { + /* + * It seems dagerous to access memcg without css_get(). + * But considering how consume_stok works, it's not + * necessary. If consume_stock success, some charges + * from this memcg are cached on this cpu. So, we + * don't need to call css_get()/css_tryget() before + * calling consume_stock(). + */ + rcu_read_unlock(); + goto done; + } + /* after here, we may be blocked. we need to get refcnt */ + if (!css_tryget(&mem->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + } - ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); - if (likely(!ret)) { - if (!do_swap_account) - break; - ret = res_counter_charge(&mem->memsw, PAGE_SIZE, - &fail_res); - if (likely(!ret)) - break; - /* mem+swap counter fails */ - res_counter_uncharge(&mem->res, PAGE_SIZE); - noswap = true; - mem_over_limit = mem_cgroup_from_res_counter(fail_res, - memsw); - } else - /* mem counter fails */ - mem_over_limit = mem_cgroup_from_res_counter(fail_res, - res); + do { + bool oom_check; - if (!(gfp_mask & __GFP_WAIT)) - goto nomem; + /* If killed, bypass charge */ + if (fatal_signal_pending(current)) { + css_put(&mem->css); + goto bypass; + } - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, - noswap, false); - if (ret) - continue; + oom_check = false; + if (oom && !nr_oom_retries) { + oom_check = true; + nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + } - /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up - * - */ - if (mem_cgroup_check_under_limit(mem_over_limit)) - continue; + ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check); - if (!nr_retries--) { - if (oom) { - mutex_lock(&memcg_tasklist); - mem_cgroup_out_of_memory(mem_over_limit, gfp_mask); - mutex_unlock(&memcg_tasklist); - record_last_oom(mem_over_limit); - } + switch (ret) { + case CHARGE_OK: + break; + case CHARGE_RETRY: /* not in OOM situation but retry */ + csize = PAGE_SIZE; + css_put(&mem->css); + mem = NULL; + goto again; + case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */ + css_put(&mem->css); goto nomem; + case CHARGE_NOMEM: /* OOM routine works */ + if (!oom) { + css_put(&mem->css); + goto nomem; + } + /* If oom, we never return -ENOMEM */ + nr_oom_retries--; + break; + case CHARGE_OOM_DIE: /* Killed by OOM Killer */ + css_put(&mem->css); + goto bypass; } - } + } while (ret != CHARGE_OK); + + if (csize > PAGE_SIZE) + refill_stock(mem, csize - PAGE_SIZE); + css_put(&mem->css); +done: + *memcg = mem; return 0; nomem: - css_put(&mem->css); + *memcg = NULL; return -ENOMEM; +bypass: + *memcg = NULL; + return 0; +} + +/* + * Somemtimes we have to undo a charge we got by try_charge(). + * This function is for that and do uncharge, put css's refcnt. + * gotten by try_charge(). + */ +static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, + unsigned long count) +{ + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE * count); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE * count); + } } +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +{ + __mem_cgroup_cancel_charge(mem, 1); +} /* * A helper function to get mem_cgroup from ID. must be called under @@ -1072,25 +2055,22 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) return container_of(css, struct mem_cgroup, css); } -static struct mem_cgroup *try_get_mem_cgroup_from_swapcache(struct page *page) +struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) { - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct page_cgroup *pc; unsigned short id; swp_entry_t ent; VM_BUG_ON(!PageLocked(page)); - if (!PageSwapCache(page)) - return NULL; - pc = lookup_page_cgroup(page); lock_page_cgroup(pc); if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; if (mem && !css_tryget(&mem->css)) mem = NULL; - } else { + } else if (PageSwapCache(page)) { ent.val = page_private(page); id = lookup_swap_cgroup(ent); rcu_read_lock(); @@ -1119,94 +2099,113 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); - css_put(&mem->css); + mem_cgroup_cancel_charge(mem); return; } + pc->mem_cgroup = mem; + /* + * We access a page_cgroup asynchronously without lock_page_cgroup(). + * Especially when a page_cgroup is taken from a page, pc->mem_cgroup + * is accessed after testing USED bit. To make pc->mem_cgroup visible + * before USED bit, we need memory barrier here. + * See mem_cgroup_add_lru_list(), etc. + */ smp_wmb(); - pc->flags = pcg_default_flags[ctype]; + switch (ctype) { + case MEM_CGROUP_CHARGE_TYPE_CACHE: + case MEM_CGROUP_CHARGE_TYPE_SHMEM: + SetPageCgroupCache(pc); + SetPageCgroupUsed(pc); + break; + case MEM_CGROUP_CHARGE_TYPE_MAPPED: + ClearPageCgroupCache(pc); + SetPageCgroupUsed(pc); + break; + default: + break; + } mem_cgroup_charge_statistics(mem, pc, true); unlock_page_cgroup(pc); + /* + * "charge_statistics" updated event counter. Then, check it. + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. + */ + memcg_check_events(mem, pc->page); } /** - * mem_cgroup_move_account - move account of the page + * __mem_cgroup_move_account - move account of the page * @pc: page_cgroup of the page. * @from: mem_cgroup which the page is moved from. * @to: mem_cgroup which the page is moved to. @from != @to. + * @uncharge: whether we should call uncharge and css_put against @from. * * The caller must confirm following. * - page is not on LRU (isolate_page() is useful.) + * - the pc is locked, used, and ->mem_cgroup points to @from. * - * returns 0 at success, - * returns -EBUSY when lock is busy or "pc" is unstable. - * - * This function does "uncharge" from old cgroup but doesn't do "charge" to - * new cgroup. It should be done by a caller. + * This function doesn't do "charge" nor css_get to new cgroup. It should be + * done by a caller(__mem_cgroup_try_charge would be usefull). If @uncharge is + * true, this function does "uncharge" from old cgroup, but it doesn't if + * @uncharge is false, so a caller should do "uncharge". */ -static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to) +static void __mem_cgroup_move_account(struct page_cgroup *pc, + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { - struct mem_cgroup_per_zone *from_mz, *to_mz; - int nid, zid; - int ret = -EBUSY; - struct page *page; - int cpu; - struct mem_cgroup_stat *stat; - struct mem_cgroup_stat_cpu *cpustat; - VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); - - nid = page_cgroup_nid(pc); - zid = page_cgroup_zid(pc); - from_mz = mem_cgroup_zoneinfo(from, nid, zid); - to_mz = mem_cgroup_zoneinfo(to, nid, zid); - - if (!trylock_page_cgroup(pc)) - return ret; - - if (!PageCgroupUsed(pc)) - goto out; - - if (pc->mem_cgroup != from) - goto out; - - res_counter_uncharge(&from->res, PAGE_SIZE); - mem_cgroup_charge_statistics(from, pc, false); - - page = pc->page; - if (page_is_file_cache(page) && page_mapped(page)) { - cpu = smp_processor_id(); - /* Update mapped_file data for mem_cgroup "from" */ - stat = &from->stat; - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, - -1); - - /* Update mapped_file data for mem_cgroup "to" */ - stat = &to->stat; - cpustat = &stat->cpustat[cpu]; - __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_MAPPED_FILE, - 1); + VM_BUG_ON(!page_is_cgroup_locked(pc)); + VM_BUG_ON(!PageCgroupUsed(pc)); + VM_BUG_ON(pc->mem_cgroup != from); + + if (PageCgroupFileMapped(pc)) { + /* Update mapped_file data for mem_cgroup */ + preempt_disable(); + __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); + preempt_enable(); } + mem_cgroup_charge_statistics(from, pc, false); + if (uncharge) + /* This is not "cancel", but cancel_charge does all we need. */ + mem_cgroup_cancel_charge(from); - if (do_swap_account) - res_counter_uncharge(&from->memsw, PAGE_SIZE); - css_put(&from->css); - - css_get(&to->css); + /* caller should have done css_get */ pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, pc, true); - ret = 0; -out: + /* + * We charges against "to" which may not have any tasks. Then, "to" + * can be under rmdir(). But in current implementation, caller of + * this function is just force_empty() and move charge, so it's + * garanteed that "to" is never removed. So, we don't check rmdir + * status here. + */ +} + +/* + * check whether the @pc is valid for moving account and call + * __mem_cgroup_move_account() + */ +static int mem_cgroup_move_account(struct page_cgroup *pc, + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) +{ + int ret = -EINVAL; + lock_page_cgroup(pc); + if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + __mem_cgroup_move_account(pc, from, to, uncharge); + ret = 0; + } unlock_page_cgroup(pc); + /* + * check events + */ + memcg_check_events(to, pc->page); + memcg_check_events(from, pc->page); return ret; } @@ -1228,43 +2227,25 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (!pcg) return -EINVAL; + ret = -EBUSY; + if (!get_page_unless_zero(page)) + goto out; + if (isolate_lru_page(page)) + goto put; parent = mem_cgroup_from_cont(pcg); - - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); if (ret || !parent) - return ret; - - if (!get_page_unless_zero(page)) { - ret = -EBUSY; - goto uncharge; - } - - ret = isolate_lru_page(page); + goto put_back; + ret = mem_cgroup_move_account(pc, child, parent, true); if (ret) - goto cancel; - - ret = mem_cgroup_move_account(pc, child, parent); - + mem_cgroup_cancel_charge(parent); +put_back: putback_lru_page(page); - if (!ret) { - put_page(page); - /* drop extra refcnt by try_charge() */ - css_put(&parent->css); - return 0; - } - -cancel: +put: put_page(page); -uncharge: - /* drop extra refcnt by try_charge() */ - css_put(&parent->css); - /* uncharge if move fails */ - res_counter_uncharge(&parent->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&parent->memsw, PAGE_SIZE); +out: return ret; } @@ -1275,10 +2256,9 @@ uncharge: * < 0 if the cgroup is over its limit */ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype, - struct mem_cgroup *memcg) + gfp_t gfp_mask, enum charge_type ctype) { - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; struct page_cgroup *pc; int ret; @@ -1288,7 +2268,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - mem = memcg; ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); if (ret || !mem) return ret; @@ -1316,7 +2295,7 @@ int mem_cgroup_newpage_charge(struct page *page, if (unlikely(!mm)) mm = &init_mm; return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL); + MEM_CGROUP_CHARGE_TYPE_MAPPED); } static void @@ -1326,7 +2305,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { - struct mem_cgroup *mem = NULL; int ret; if (mem_cgroup_disabled()) @@ -1347,7 +2325,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) { struct page_cgroup *pc; - pc = lookup_page_cgroup(page); if (!pc) return 0; @@ -1359,22 +2336,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, unlock_page_cgroup(pc); } - if (unlikely(!mm && !mem)) + if (unlikely(!mm)) mm = &init_mm; if (page_is_file_cache(page)) return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); + MEM_CGROUP_CHARGE_TYPE_CACHE); /* shmem */ if (PageSwapCache(page)) { + struct mem_cgroup *mem = NULL; + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem); if (!ret) __mem_cgroup_commit_charge_swapin(page, mem, MEM_CGROUP_CHARGE_TYPE_SHMEM); } else ret = mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_SHMEM, mem); + MEM_CGROUP_CHARGE_TYPE_SHMEM); return ret; } @@ -1382,7 +2361,7 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, /* * While swap-in, try_charge -> commit or cancel, the page is locked. * And when try_charge() successfully returns, one refcnt to memcg without - * struct page_cgroup is aquired. This refcnt will be cumsumed by + * struct page_cgroup is acquired. This refcnt will be consumed by * "commit()" or removed by "cancel()" */ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, @@ -1399,17 +2378,17 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, goto charge_cur_mm; /* * A racing thread's fault, or swapoff, may have already updated - * the pte, and even removed page from swap cache: return success - * to go on to do_swap_page()'s pte_same() test, which should fail. + * the pte, and even removed page from swap cache: in those cases + * do_swap_page()'s pte_same() test will fail; but there's also a + * KSM case which does need to charge the page. */ if (!PageSwapCache(page)) - return 0; - mem = try_get_mem_cgroup_from_swapcache(page); + goto charge_cur_mm; + mem = try_get_mem_cgroup_from_page(page); if (!mem) goto charge_cur_mm; *ptr = mem; ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); - /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: @@ -1428,6 +2407,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, return; if (!ptr) return; + cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); __mem_cgroup_commit_charge(ptr, pc, ctype); @@ -1452,13 +2432,19 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * This recorded memcg can be obsolete one. So, avoid * calling css_tryget */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } rcu_read_unlock(); } - /* add this page(page_cgroup) to the LRU we want. */ - + /* + * At swapin, we may charge account against cgroup which has no tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. + */ + cgroup_release_and_wakeup_rmdir(&ptr->css); } void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr) @@ -1473,13 +2459,58 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; + mem_cgroup_cancel_charge(mem); +} + +static void +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +{ + struct memcg_batch_info *batch = NULL; + bool uncharge_memsw = true; + /* If swapout, usage of swap doesn't decrease */ + if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) + uncharge_memsw = false; + + batch = ¤t->memcg_batch; + /* + * In usual, we do css_get() when we remember memcg pointer. + * But in this case, we keep res->usage until end of a series of + * uncharges. Then, it's ok to ignore memcg's refcnt. + */ + if (!batch->memcg) + batch->memcg = mem; + /* + * do_batch > 0 when unmapping pages or inode invalidate/truncate. + * In those cases, all pages freed continously can be expected to be in + * the same cgroup and we have chance to coalesce uncharges. + * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE) + * because we want to do uncharge as soon as possible. + */ + + if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) + goto direct_uncharge; + + /* + * In typical case, batch->memcg == mem. This means we can + * merge a series of uncharges to an uncharge of res_counter. + * If not, we uncharge res_counter ony by one. + */ + if (batch->memcg != mem) + goto direct_uncharge; + /* remember freed charge and uncharge it later */ + batch->bytes += PAGE_SIZE; + if (uncharge_memsw) + batch->memsw_bytes += PAGE_SIZE; + return; +direct_uncharge: res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) + if (uncharge_memsw) res_counter_uncharge(&mem->memsw, PAGE_SIZE); - css_put(&mem->css); + if (unlikely(batch->memcg != mem)) + memcg_oom_recover(mem); + return; } - /* * uncharge if !page_mapped(page) */ @@ -1488,7 +2519,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; - struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) return NULL; @@ -1513,7 +2543,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) switch (ctype) { case MEM_CGROUP_CHARGE_TYPE_MAPPED: case MEM_CGROUP_CHARGE_TYPE_DROP: - if (page_mapped(page)) + /* See mem_cgroup_prepare_migration() */ + if (page_mapped(page) || PageCgroupMigration(pc)) goto unlock_out; break; case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: @@ -1527,9 +2558,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); mem_cgroup_charge_statistics(mem, pc, false); ClearPageCgroupUsed(pc); @@ -1540,12 +2568,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) * special functions. */ - mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); - - /* at swapout, this memcg will be accessed to record to swap */ - if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) - css_put(&mem->css); + /* + * even after unlock, we have mem->res.usage here and this memcg + * will never be freed. + */ + memcg_check_events(mem, page); + if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { + mem_cgroup_swap_statistics(mem, true); + mem_cgroup_get(mem); + } + if (!mem_cgroup_is_root(mem)) + __do_uncharge(mem, ctype); return mem; @@ -1571,6 +2605,51 @@ void mem_cgroup_uncharge_cache_page(struct page *page) __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } +/* + * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. + * In that cases, pages are freed continuously and we can expect pages + * are in the same memcg. All these calls itself limits the number of + * pages freed at once, then uncharge_start/end() is called properly. + * This may be called prural(2) times in a context, + */ + +void mem_cgroup_uncharge_start(void) +{ + current->memcg_batch.do_batch++; + /* We can do nest. */ + if (current->memcg_batch.do_batch == 1) { + current->memcg_batch.memcg = NULL; + current->memcg_batch.bytes = 0; + current->memcg_batch.memsw_bytes = 0; + } +} + +void mem_cgroup_uncharge_end(void) +{ + struct memcg_batch_info *batch = ¤t->memcg_batch; + + if (!batch->do_batch) + return; + + batch->do_batch--; + if (batch->do_batch) /* If stacked, do nothing. */ + return; + + if (!batch->memcg) + return; + /* + * This "batch->memcg" is valid without any css_get/put etc... + * bacause we hide charges behind us. + */ + if (batch->bytes) + res_counter_uncharge(&batch->memcg->res, batch->bytes); + if (batch->memsw_bytes) + res_counter_uncharge(&batch->memcg->memsw, batch->memsw_bytes); + memcg_oom_recover(batch->memcg); + /* forget this pointer (for sanity check) */ + batch->memcg = NULL; +} + #ifdef CONFIG_SWAP /* * called after __delete_from_swap_cache() and drop "page" account. @@ -1587,13 +2666,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) memcg = __mem_cgroup_uncharge_common(page, ctype); - /* record memcg information */ - if (do_swap_account && swapout && memcg) { + /* + * record memcg information, if swapout && memcg != NULL, + * mem_cgroup_get() was called in uncharge(). + */ + if (do_swap_account && swapout && memcg) swap_cgroup_record(ent, css_id(&memcg->css)); - mem_cgroup_get(memcg); - } - if (swapout && memcg) - css_put(&memcg->css); } #endif @@ -1618,21 +2696,82 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. * This memcg can be obsolete one. We avoid calling css_tryget */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } rcu_read_unlock(); } + +/** + * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. + * @entry: swap entry to be moved + * @from: mem_cgroup which the entry is moved from + * @to: mem_cgroup which the entry is moved to + * @need_fixup: whether we should fixup res_counters and refcounts. + * + * It succeeds only when the swap_cgroup's record for this entry is the same + * as the mem_cgroup's id of @from. + * + * Returns 0 on success, -EINVAL on failure. + * + * The caller must have charged to @to, IOW, called res_counter_charge() about + * both res and memsw, and called css_get(). + */ +static int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) +{ + unsigned short old_id, new_id; + + old_id = css_id(&from->css); + new_id = css_id(&to->css); + + if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { + mem_cgroup_swap_statistics(from, false); + mem_cgroup_swap_statistics(to, true); + /* + * This function is only called from task migration context now. + * It postpones res_counter and refcount handling till the end + * of task migration(mem_cgroup_clear_mc()) for performance + * improvement. But we cannot postpone mem_cgroup_get(to) + * because if the process that has been moved to @to does + * swap-in, the refcount of @to might be decreased to 0. + */ + mem_cgroup_get(to); + if (need_fixup) { + if (!mem_cgroup_is_root(from)) + res_counter_uncharge(&from->memsw, PAGE_SIZE); + mem_cgroup_put(from); + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + if (!mem_cgroup_is_root(to)) + res_counter_uncharge(&to->res, PAGE_SIZE); + } + return 0; + } + return -EINVAL; +} +#else +static inline int mem_cgroup_move_swap_account(swp_entry_t entry, + struct mem_cgroup *from, struct mem_cgroup *to, bool need_fixup) +{ + return -EINVAL; +} #endif /* * Before starting migration, account PAGE_SIZE to mem_cgroup that the old * page belongs to. */ -int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) +int mem_cgroup_prepare_migration(struct page *page, + struct page *newpage, struct mem_cgroup **ptr) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + enum charge_type ctype; int ret = 0; if (mem_cgroup_disabled()) @@ -1643,67 +2782,126 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; css_get(&mem->css); + /* + * At migrating an anonymous page, its mapcount goes down + * to 0 and uncharge() will be called. But, even if it's fully + * unmapped, migration may fail and this page has to be + * charged again. We set MIGRATION flag here and delay uncharge + * until end_migration() is called + * + * Corner Case Thinking + * A) + * When the old page was mapped as Anon and it's unmap-and-freed + * while migration was ongoing. + * If unmap finds the old page, uncharge() of it will be delayed + * until end_migration(). If unmap finds a new page, it's + * uncharged when it make mapcount to be 1->0. If unmap code + * finds swap_migration_entry, the new page will not be mapped + * and end_migration() will find it(mapcount==0). + * + * B) + * When the old page was mapped but migraion fails, the kernel + * remaps it. A charge for it is kept by MIGRATION flag even + * if mapcount goes down to 0. We can do remap successfully + * without charging it again. + * + * C) + * The "old" page is under lock_page() until the end of + * migration, so, the old page itself will not be swapped-out. + * If the new page is swapped out before end_migraton, our + * hook to usual swap-out path will catch the event. + */ + if (PageAnon(page)) + SetPageCgroupMigration(pc); } unlock_page_cgroup(pc); + /* + * If the page is not charged at this point, + * we return here. + */ + if (!mem) + return 0; - if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); - css_put(&mem->css); - } *ptr = mem; + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + css_put(&mem->css);/* drop extra refcnt */ + if (ret || *ptr == NULL) { + if (PageAnon(page)) { + lock_page_cgroup(pc); + ClearPageCgroupMigration(pc); + unlock_page_cgroup(pc); + /* + * The old page may be fully unmapped while we kept it. + */ + mem_cgroup_uncharge_page(page); + } + return -ENOMEM; + } + /* + * We charge new page before it's used/mapped. So, even if unlock_page() + * is called before end_migration, we can catch all events on this new + * page. In the case new page is migrated but not remapped, new page's + * mapcount will be finally 0 and we call uncharge in end_migration(). + */ + pc = lookup_page_cgroup(newpage); + if (PageAnon(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + else if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + __mem_cgroup_commit_charge(mem, pc, ctype); return ret; } /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage) { - struct page *target, *unused; + struct page *used, *unused; struct page_cgroup *pc; - enum charge_type ctype; if (!mem) return; - + /* blocks rmdir() */ + cgroup_exclude_rmdir(&mem->css); /* at migration success, oldpage->mapping is NULL. */ if (oldpage->mapping) { - target = oldpage; - unused = NULL; + used = oldpage; + unused = newpage; } else { - target = newpage; + used = newpage; unused = oldpage; } + /* + * We disallowed uncharge of pages under migration because mapcount + * of the page goes down to zero, temporarly. + * Clear the flag and check the page should be charged. + */ + pc = lookup_page_cgroup(oldpage); + lock_page_cgroup(pc); + ClearPageCgroupMigration(pc); + unlock_page_cgroup(pc); - if (PageAnon(target)) - ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; - else if (page_is_file_cache(target)) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; - else - ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - - /* unused page is not on radix-tree now. */ - if (unused) - __mem_cgroup_uncharge_common(unused, ctype); + __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE); - pc = lookup_page_cgroup(target); /* - * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup. - * So, double-counting is effectively avoided. + * If a page is a file cache, radix-tree replacement is very atomic + * and we can skip this check. When it was an Anon page, its mapcount + * goes down to 0. But because we added MIGRATION flage, it's not + * uncharged yet. There are several case but page->mapcount check + * and USED bit check in mem_cgroup_uncharge_page() will do enough + * check. (see prepare_charge() also) */ - __mem_cgroup_commit_charge(mem, pc, ctype); - + if (PageAnon(used)) + mem_cgroup_uncharge_page(used); /* - * Both of oldpage and newpage are still under lock_page(). - * Then, we don't have to care about race in radix-tree. - * But we have to be careful that this page is unmapped or not. - * - * There is a case for !page_mapped(). At the start of - * migration, oldpage was mapped. But now, it's zapped. - * But we know *target* page is not freed/reused under us. - * mem_cgroup_uncharge_page() does all necessary checks. + * At migration, we may charge account against cgroup which has no + * tasks. + * So, rmdir()->pre_destroy() can be called while we do this charge. + * In that case, we need to call pre_destroy() again. check it here. */ - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) - mem_cgroup_uncharge_page(target); + cgroup_release_and_wakeup_rmdir(&mem->css); } /* @@ -1737,11 +2935,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - int progress; - u64 memswlimit; + u64 memswlimit, memlimit; int ret = 0; int children = mem_cgroup_count_children(memcg); u64 curusage, oldusage; + int enlarge; /* * For keeping hierarchical_reclaim simple, how long we should retry @@ -1752,6 +2950,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); + enlarge = 0; while (retry_count) { if (signal_pending(current)) { ret = -EINTR; @@ -1769,6 +2968,11 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, mutex_unlock(&set_limit_mutex); break; } + + memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); + if (memlimit < val) + enlarge = 1; + ret = res_counter_set_limit(&memcg->res, val); if (!ret) { if (memswlimit == val) @@ -1781,8 +2985,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, - false, true); + mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -1790,6 +2994,8 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, else oldusage = curusage; } + if (!ret && enlarge) + memcg_oom_recover(memcg); return ret; } @@ -1798,9 +3004,10 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, unsigned long long val) { int retry_count; - u64 memlimit, oldusage, curusage; + u64 memlimit, memswlimit, oldusage, curusage; int children = mem_cgroup_count_children(memcg); int ret = -EBUSY; + int enlarge = 0; /* see mem_cgroup_resize_res_limit */ retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; @@ -1822,6 +3029,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, mutex_unlock(&set_limit_mutex); break; } + memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); + if (memswlimit < val) + enlarge = 1; ret = res_counter_set_limit(&memcg->memsw, val); if (!ret) { if (memlimit == val) @@ -1834,7 +3044,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); + mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -1842,9 +3054,101 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, else oldusage = curusage; } + if (!ret && enlarge) + memcg_oom_recover(memcg); return ret; } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); + /* + * This loop can run a while, specially if mem_cgroup's continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, + gfp_mask, + MEM_CGROUP_RECLAIM_SOFT); + nr_reclaimed += reclaimed; + spin_lock(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) { + do { + /* + * Loop until we find yet another one. + * + * By the time we get the soft_limit lock + * again, someone might have aded the + * group back on the RB tree. Iterate to + * make sure we get a different mem. + * mem_cgroup_largest_soft_limit_node returns + * NULL if no other cgroup is present on + * the tree + */ + next_mz = + __mem_cgroup_largest_soft_limit_node(mctz); + if (next_mz == mz) { + css_put(&next_mz->mem->css); + next_mz = NULL; + } else /* next_mz == NULL or other memcg */ + break; + } while (1); + } + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->mem->res); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + css_put(&mz->mem->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. + */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->mem->css); + return nr_reclaimed; +} + /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. @@ -1877,7 +3181,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem, pc = list_entry(list->prev, struct page_cgroup, lru); if (busy == pc) { list_move(&pc->lru, list); - busy = 0; + busy = NULL; spin_unlock_irqrestore(&zone->lru_lock, flags); continue; } @@ -1918,7 +3222,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all) if (free_all) goto try_to_free; move_account: - while (mem->res.usage > 0) { + do { ret = -EBUSY; if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) goto out; @@ -1927,7 +3231,9 @@ move_account: goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); + drain_all_stock_sync(); ret = 0; + mem_cgroup_start_move(mem); for_each_node_state(node, N_HIGH_MEMORY) { for (zid = 0; !ret && zid < MAX_NR_ZONES; zid++) { enum lru_list l; @@ -1941,12 +3247,14 @@ move_account: if (ret) break; } + mem_cgroup_end_move(mem); + memcg_oom_recover(mem); /* it seems parent cgroup doesn't have enough mem */ if (ret == -ENOMEM) goto try_to_free; cond_resched(); - } - ret = 0; + /* "ret" should also be checked to ensure all lists are empty. */ + } while (mem->res.usage > 0 || ret); out: css_put(&mem->css); return ret; @@ -1973,16 +3281,13 @@ try_to_free: if (!progress) { nr_retries--; /* maybe some writeback is necessary */ - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } } lru_add_drain(); /* try move_account...there may be some *locked* pages. */ - if (mem->res.usage) - goto move_account; - ret = 0; - goto out; + goto move_account; } int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event) @@ -2009,7 +3314,7 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, cgroup_lock(); /* - * If parent's use_hiearchy is set, we can't make any modifications + * If parent's use_hierarchy is set, we can't make any modifications * in the child subtrees. If it is unset, then the change can * occur, provided the current cgroup has no children. * @@ -2029,20 +3334,63 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, return retval; } + +static u64 mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx) +{ + struct mem_cgroup *iter; + s64 val = 0; + + /* each per cpu's value can be minus.Then, use s64 */ + for_each_mem_cgroup_tree(iter, mem) + val += mem_cgroup_read_stat(iter, idx); + + if (val < 0) /* race ? */ + val = 0; + return val; +} + +static inline u64 mem_cgroup_usage(struct mem_cgroup *mem, bool swap) +{ + u64 val; + + if (!mem_cgroup_is_root(mem)) { + if (!swap) + return res_counter_read_u64(&mem->res, RES_USAGE); + else + return res_counter_read_u64(&mem->memsw, RES_USAGE); + } + + val = mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_CACHE); + val += mem_cgroup_get_recursive_idx_stat(mem, MEM_CGROUP_STAT_RSS); + + if (swap) + val += mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_SWAPOUT); + + return val << PAGE_SHIFT; +} + static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - u64 val = 0; + u64 val; int type, name; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - val = res_counter_read_u64(&mem->res, name); + if (name == RES_USAGE) + val = mem_cgroup_usage(mem, false); + else + val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - val = res_counter_read_u64(&mem->memsw, name); + if (name == RES_USAGE) + val = mem_cgroup_usage(mem, true); + else + val = res_counter_read_u64(&mem->memsw, name); break; default: BUG(); @@ -2066,6 +3414,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, name = MEMFILE_ATTR(cft->private); switch (name) { case RES_LIMIT: + if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ + ret = -EINVAL; + break; + } /* This function does all necessary parse...reuse it */ ret = res_counter_memparse_write_strategy(buffer, &val); if (ret) @@ -2075,6 +3427,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else ret = mem_cgroup_resize_memsw_limit(memcg, val); break; + case RES_SOFT_LIMIT: + ret = res_counter_memparse_write_strategy(buffer, &val); + if (ret) + break; + /* + * For memsw, soft limits are hard to implement in terms + * of semantics, for now, we support soft limits for + * control without swap + */ + if (type == _MEM) + ret = res_counter_set_soft_limit(&memcg->res, val); + else + ret = -EINVAL; + break; default: ret = -EINVAL; /* should be BUG() ? */ break; @@ -2132,17 +3498,52 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) res_counter_reset_failcnt(&mem->memsw); break; } + return 0; } +static u64 mem_cgroup_move_charge_read(struct cgroup *cgrp, + struct cftype *cft) +{ + return mem_cgroup_from_cont(cgrp)->move_charge_at_immigrate; +} + +#ifdef CONFIG_MMU +static int mem_cgroup_move_charge_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + if (val >= (1 << NR_MOVE_TYPE)) + return -EINVAL; + /* + * We check this value several times in both in can_attach() and + * attach(), so we need cgroup lock to prevent this value from being + * inconsistent. + */ + cgroup_lock(); + mem->move_charge_at_immigrate = val; + cgroup_unlock(); + + return 0; +} +#else +static int mem_cgroup_move_charge_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + return -ENOSYS; +} +#endif + /* For read statistics */ enum { MCS_CACHE, MCS_RSS, - MCS_MAPPED_FILE, + MCS_FILE_MAPPED, MCS_PGPGIN, MCS_PGPGOUT, + MCS_SWAP, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -2164,6 +3565,7 @@ struct { {"mapped_file", "total_mapped_file"}, {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, + {"swap", "total_swap"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -2172,22 +3574,26 @@ struct { }; -static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) +static void +mem_cgroup_get_local_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) { - struct mcs_total_stat *s = data; s64 val; /* per cpu stat */ - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE); s->stat[MCS_CACHE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS); s->stat[MCS_RSS] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_MAPPED_FILE); - s->stat[MCS_MAPPED_FILE] += val * PAGE_SIZE; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED); + s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE; + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT); s->stat[MCS_PGPGIN] += val; - val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT); s->stat[MCS_PGPGOUT] += val; + if (do_swap_account) { + val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT); + s->stat[MCS_SWAP] += val * PAGE_SIZE; + } /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); @@ -2200,13 +3606,15 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_ACTIVE_FILE] += val * PAGE_SIZE; val = mem_cgroup_get_local_zonestat(mem, LRU_UNEVICTABLE); s->stat[MCS_UNEVICTABLE] += val * PAGE_SIZE; - return 0; } static void mem_cgroup_get_total_stat(struct mem_cgroup *mem, struct mcs_total_stat *s) { - mem_cgroup_walk_tree(mem, s, mem_cgroup_get_local_stat); + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + mem_cgroup_get_local_stat(iter, s); } static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, @@ -2219,8 +3627,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) + for (i = 0; i < NR_MCS_STAT; i++) { + if (i == MCS_SWAP && !do_swap_account) + continue; cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); + } /* Hierarchical information */ { @@ -2233,9 +3644,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_total_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) + for (i = 0; i < NR_MCS_STAT; i++) { + if (i == MCS_SWAP && !do_swap_account) + continue; cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); - + } #ifdef CONFIG_DEBUG_VM cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); @@ -2308,12 +3721,344 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return 0; } +static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) +{ + struct mem_cgroup_threshold_ary *t; + u64 usage; + int i; + + rcu_read_lock(); + if (!swap) + t = rcu_dereference(memcg->thresholds.primary); + else + t = rcu_dereference(memcg->memsw_thresholds.primary); + + if (!t) + goto unlock; + + usage = mem_cgroup_usage(memcg, swap); + + /* + * current_threshold points to threshold just below usage. + * If it's not true, a threshold was crossed after last + * call of __mem_cgroup_threshold(). + */ + i = t->current_threshold; + + /* + * Iterate backward over array of thresholds starting from + * current_threshold and check if a threshold is crossed. + * If none of thresholds below usage is crossed, we read + * only one element of the array here. + */ + for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) + eventfd_signal(t->entries[i].eventfd, 1); + + /* i = current_threshold + 1 */ + i++; + + /* + * Iterate forward over array of thresholds starting from + * current_threshold+1 and check if a threshold is crossed. + * If none of thresholds above usage is crossed, we read + * only one element of the array here. + */ + for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) + eventfd_signal(t->entries[i].eventfd, 1); + + /* Update current_threshold */ + t->current_threshold = i - 1; +unlock: + rcu_read_unlock(); +} + +static void mem_cgroup_threshold(struct mem_cgroup *memcg) +{ + while (memcg) { + __mem_cgroup_threshold(memcg, false); + if (do_swap_account) + __mem_cgroup_threshold(memcg, true); + + memcg = parent_mem_cgroup(memcg); + } +} + +static int compare_thresholds(const void *a, const void *b) +{ + const struct mem_cgroup_threshold *_a = a; + const struct mem_cgroup_threshold *_b = b; + + return _a->threshold - _b->threshold; +} + +static int mem_cgroup_oom_notify_cb(struct mem_cgroup *mem) +{ + struct mem_cgroup_eventfd_list *ev; + + list_for_each_entry(ev, &mem->oom_notify, list) + eventfd_signal(ev->eventfd, 1); + return 0; +} + +static void mem_cgroup_oom_notify(struct mem_cgroup *mem) +{ + struct mem_cgroup *iter; + + for_each_mem_cgroup_tree(iter, mem) + mem_cgroup_oom_notify_cb(iter); +} + +static int mem_cgroup_usage_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + int type = MEMFILE_TYPE(cft->private); + u64 threshold, usage; + int i, size, ret; + + ret = res_counter_memparse_write_strategy(args, &threshold); + if (ret) + return ret; + + mutex_lock(&memcg->thresholds_lock); + + if (type == _MEM) + thresholds = &memcg->thresholds; + else if (type == _MEMSWAP) + thresholds = &memcg->memsw_thresholds; + else + BUG(); + + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); + + /* Check if a threshold crossed before adding a new one */ + if (thresholds->primary) + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + size = thresholds->primary ? thresholds->primary->size + 1 : 1; + + /* Allocate memory for new array of thresholds */ + new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), + GFP_KERNEL); + if (!new) { + ret = -ENOMEM; + goto unlock; + } + new->size = size; + + /* Copy thresholds (if any) to new array */ + if (thresholds->primary) { + memcpy(new->entries, thresholds->primary->entries, (size - 1) * + sizeof(struct mem_cgroup_threshold)); + } + + /* Add new threshold */ + new->entries[size - 1].eventfd = eventfd; + new->entries[size - 1].threshold = threshold; + + /* Sort thresholds. Registering of new threshold isn't time-critical */ + sort(new->entries, size, sizeof(struct mem_cgroup_threshold), + compare_thresholds, NULL); + + /* Find current threshold */ + new->current_threshold = -1; + for (i = 0; i < size; i++) { + if (new->entries[i].threshold < usage) { + /* + * new->current_threshold will not be used until + * rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } + } + + /* Free old spare buffer and save old primary buffer as spare */ + kfree(thresholds->spare); + thresholds->spare = thresholds->primary; + + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + +unlock: + mutex_unlock(&memcg->thresholds_lock); + + return ret; +} + +static void mem_cgroup_usage_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_thresholds *thresholds; + struct mem_cgroup_threshold_ary *new; + int type = MEMFILE_TYPE(cft->private); + u64 usage; + int i, j, size; + + mutex_lock(&memcg->thresholds_lock); + if (type == _MEM) + thresholds = &memcg->thresholds; + else if (type == _MEMSWAP) + thresholds = &memcg->memsw_thresholds; + else + BUG(); + + /* + * Something went wrong if we trying to unregister a threshold + * if we don't have thresholds + */ + BUG_ON(!thresholds); + + usage = mem_cgroup_usage(memcg, type == _MEMSWAP); + + /* Check if a threshold crossed before removing */ + __mem_cgroup_threshold(memcg, type == _MEMSWAP); + + /* Calculate new number of threshold */ + size = 0; + for (i = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd != eventfd) + size++; + } + + new = thresholds->spare; + + /* Set thresholds array to NULL if we don't have thresholds */ + if (!size) { + kfree(new); + new = NULL; + goto swap_buffers; + } + + new->size = size; + + /* Copy thresholds and find current threshold */ + new->current_threshold = -1; + for (i = 0, j = 0; i < thresholds->primary->size; i++) { + if (thresholds->primary->entries[i].eventfd == eventfd) + continue; + + new->entries[j] = thresholds->primary->entries[i]; + if (new->entries[j].threshold < usage) { + /* + * new->current_threshold will not be used + * until rcu_assign_pointer(), so it's safe to increment + * it here. + */ + ++new->current_threshold; + } + j++; + } + +swap_buffers: + /* Swap primary and spare array */ + thresholds->spare = thresholds->primary; + rcu_assign_pointer(thresholds->primary, new); + + /* To be sure that nobody uses thresholds */ + synchronize_rcu(); + + mutex_unlock(&memcg->thresholds_lock); +} + +static int mem_cgroup_oom_register_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) +{ + struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *event; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + event = kmalloc(sizeof(*event), GFP_KERNEL); + if (!event) + return -ENOMEM; + + mutex_lock(&memcg_oom_mutex); + + event->eventfd = eventfd; + list_add(&event->list, &memcg->oom_notify); + + /* already in OOM ? */ + if (atomic_read(&memcg->oom_lock)) + eventfd_signal(eventfd, 1); + mutex_unlock(&memcg_oom_mutex); + + return 0; +} + +static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp, + struct cftype *cft, struct eventfd_ctx *eventfd) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup_eventfd_list *ev, *tmp; + int type = MEMFILE_TYPE(cft->private); + + BUG_ON(type != _OOM_TYPE); + + mutex_lock(&memcg_oom_mutex); + + list_for_each_entry_safe(ev, tmp, &mem->oom_notify, list) { + if (ev->eventfd == eventfd) { + list_del(&ev->list); + kfree(ev); + } + } + + mutex_unlock(&memcg_oom_mutex); +} + +static int mem_cgroup_oom_control_read(struct cgroup *cgrp, + struct cftype *cft, struct cgroup_map_cb *cb) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + + cb->fill(cb, "oom_kill_disable", mem->oom_kill_disable); + + if (atomic_read(&mem->oom_lock)) + cb->fill(cb, "under_oom", 1); + else + cb->fill(cb, "under_oom", 0); + return 0; +} + +static int mem_cgroup_oom_control_write(struct cgroup *cgrp, + struct cftype *cft, u64 val) +{ + struct mem_cgroup *mem = mem_cgroup_from_cont(cgrp); + struct mem_cgroup *parent; + + /* cannot set to root cgroup and only 0 and 1 are allowed */ + if (!cgrp->parent || !((val == 0) || (val == 1))) + return -EINVAL; + + parent = mem_cgroup_from_cont(cgrp->parent); + + cgroup_lock(); + /* oom-kill-disable is a flag for subhierarchy. */ + if ((parent->use_hierarchy) || + (mem->use_hierarchy && !list_empty(&cgrp->children))) { + cgroup_unlock(); + return -EINVAL; + } + mem->oom_kill_disable = val; + if (!val) + memcg_oom_recover(mem); + cgroup_unlock(); + return 0; +} static struct cftype mem_cgroup_files[] = { { .name = "usage_in_bytes", .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), .read_u64 = mem_cgroup_read, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "max_usage_in_bytes", @@ -2328,6 +4073,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_read, }, { + .name = "soft_limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, + { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, @@ -2351,6 +4102,19 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_swappiness_read, .write_u64 = mem_cgroup_swappiness_write, }, + { + .name = "move_charge_at_immigrate", + .read_u64 = mem_cgroup_move_charge_read, + .write_u64 = mem_cgroup_move_charge_write, + }, + { + .name = "oom_control", + .read_map = mem_cgroup_oom_control_read, + .write_u64 = mem_cgroup_oom_control_write, + .register_event = mem_cgroup_oom_register_event, + .unregister_event = mem_cgroup_oom_unregister_event, + .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), + }, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP @@ -2359,6 +4123,8 @@ static struct cftype memsw_cgroup_files[] = { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), .read_u64 = mem_cgroup_read, + .register_event = mem_cgroup_usage_register_event, + .unregister_event = mem_cgroup_usage_unregister_event, }, { .name = "memsw.max_usage_in_bytes", @@ -2421,6 +4187,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) mz = &pn->zoneinfo[zone]; for_each_lru(l) INIT_LIST_HEAD(&mz->lists[l]); + mz->usage_in_excess = 0; + mz->on_tree = false; + mz->mem = mem; } return 0; } @@ -2430,25 +4199,33 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) kfree(mem->info.nodeinfo[node]); } -static int mem_cgroup_size(void) -{ - int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu); - return sizeof(struct mem_cgroup) + cpustat_size; -} - static struct mem_cgroup *mem_cgroup_alloc(void) { struct mem_cgroup *mem; - int size = mem_cgroup_size(); + int size = sizeof(struct mem_cgroup); + /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) mem = kmalloc(size, GFP_KERNEL); else mem = vmalloc(size); - if (mem) - memset(mem, 0, size); + if (!mem) + return NULL; + + memset(mem, 0, size); + mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!mem->stat) + goto out_free; + spin_lock_init(&mem->pcp_counter_lock); return mem; + +out_free: + if (size < PAGE_SIZE) + kfree(mem); + else + vfree(mem); + return NULL; } /* @@ -2466,12 +4243,14 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) { int node; + mem_cgroup_remove_from_trees(mem); free_css_id(&mem_cgroup_subsys, &mem->css); for_each_node_state(node, N_POSSIBLE) free_mem_cgroup_per_zone_info(mem, node); - if (mem_cgroup_size() < PAGE_SIZE) + free_percpu(mem->stat); + if (sizeof(struct mem_cgroup) < PAGE_SIZE) kfree(mem); else vfree(mem); @@ -2482,9 +4261,9 @@ static void mem_cgroup_get(struct mem_cgroup *mem) atomic_inc(&mem->refcnt); } -static void mem_cgroup_put(struct mem_cgroup *mem) +static void __mem_cgroup_put(struct mem_cgroup *mem, int count) { - if (atomic_dec_and_test(&mem->refcnt)) { + if (atomic_sub_and_test(count, &mem->refcnt)) { struct mem_cgroup *parent = parent_mem_cgroup(mem); __mem_cgroup_free(mem); if (parent) @@ -2492,6 +4271,11 @@ static void mem_cgroup_put(struct mem_cgroup *mem) } } +static void mem_cgroup_put(struct mem_cgroup *mem) +{ + __mem_cgroup_put(mem, 1); +} + /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ @@ -2514,6 +4298,31 @@ static void __init enable_swap_cgroup(void) } #endif +static int mem_cgroup_soft_limit_tree_init(void) +{ + struct mem_cgroup_tree_per_node *rtpn; + struct mem_cgroup_tree_per_zone *rtpz; + int tmp, node, zone; + + for_each_node_state(node, N_POSSIBLE) { + tmp = node; + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); + if (!rtpn) + return 1; + + soft_limit_tree.rb_tree_per_node[node] = rtpn; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + } + return 0; +} + static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { @@ -2528,13 +4337,25 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; + /* root ? */ if (cont->parent == NULL) { + int cpu; enable_swap_cgroup(); parent = NULL; + root_mem_cgroup = mem; + if (mem_cgroup_soft_limit_tree_init()) + goto free_out; + for_each_possible_cpu(cpu) { + struct memcg_stock_pcp *stock = + &per_cpu(memcg_stock, cpu); + INIT_WORK(&stock->work, drain_local_stock); + } + hotcpu_notifier(memcg_cpu_hotplug_callback, 0); } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; + mem->oom_kill_disable = parent->oom_kill_disable; } if (parent && parent->use_hierarchy) { @@ -2553,13 +4374,17 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) } mem->last_scanned_child = 0; spin_lock_init(&mem->reclaim_param_lock); + INIT_LIST_HEAD(&mem->oom_notify); if (parent) mem->swappiness = get_swappiness(parent); atomic_set(&mem->refcnt, 1); + mem->move_charge_at_immigrate = 0; + mutex_init(&mem->thresholds_lock); return &mem->css; free_out: __mem_cgroup_free(mem); + root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -2592,18 +4417,499 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, return ret; } +#ifdef CONFIG_MMU +/* Handlers for move charge at task migration. */ +#define PRECHARGE_COUNT_AT_ONCE 256 +static int mem_cgroup_do_precharge(unsigned long count) +{ + int ret = 0; + int batch_count = PRECHARGE_COUNT_AT_ONCE; + struct mem_cgroup *mem = mc.to; + + if (mem_cgroup_is_root(mem)) { + mc.precharge += count; + /* we don't need css_get for root */ + return ret; + } + /* try to charge at once */ + if (count > 1) { + struct res_counter *dummy; + /* + * "mem" cannot be under rmdir() because we've already checked + * by cgroup_lock_live_cgroup() that it is not removed and we + * are still under the same cgroup_mutex. So we can postpone + * css_get(). + */ + if (res_counter_charge(&mem->res, PAGE_SIZE * count, &dummy)) + goto one_by_one; + if (do_swap_account && res_counter_charge(&mem->memsw, + PAGE_SIZE * count, &dummy)) { + res_counter_uncharge(&mem->res, PAGE_SIZE * count); + goto one_by_one; + } + mc.precharge += count; + return ret; + } +one_by_one: + /* fall back to one by one charge */ + while (count--) { + if (signal_pending(current)) { + ret = -EINTR; + break; + } + if (!batch_count--) { + batch_count = PRECHARGE_COUNT_AT_ONCE; + cond_resched(); + } + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + if (ret || !mem) + /* mem_cgroup_clear_mc() will do uncharge later */ + return -ENOMEM; + mc.precharge++; + } + return ret; +} + +/** + * is_target_pte_for_mc - check a pte whether it is valid for move charge + * @vma: the vma the pte to be checked belongs + * @addr: the address corresponding to the pte to be checked + * @ptent: the pte to be checked + * @target: the pointer the target page or swap ent will be stored(can be NULL) + * + * Returns + * 0(MC_TARGET_NONE): if the pte is not a target for move charge. + * 1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for + * move charge. if @target is not NULL, the page is stored in target->page + * with extra refcnt got(Callers should handle it). + * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a + * target for charge migration. if @target is not NULL, the entry is stored + * in target->ent. + * + * Called with pte lock held. + */ +union mc_target { + struct page *page; + swp_entry_t ent; +}; + +enum mc_target_type { + MC_TARGET_NONE, /* not used */ + MC_TARGET_PAGE, + MC_TARGET_SWAP, +}; + +static struct page *mc_handle_present_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent) +{ + struct page *page = vm_normal_page(vma, addr, ptent); + + if (!page || !page_mapped(page)) + return NULL; + if (PageAnon(page)) { + /* we don't move shared anon */ + if (!move_anon() || page_mapcount(page) > 2) + return NULL; + } else if (!move_file()) + /* we ignore mapcount for file pages */ + return NULL; + if (!get_page_unless_zero(page)) + return NULL; + + return page; +} + +static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + int usage_count; + struct page *page = NULL; + swp_entry_t ent = pte_to_swp_entry(ptent); + + if (!move_anon() || non_swap_entry(ent)) + return NULL; + usage_count = mem_cgroup_count_swap_user(ent, &page); + if (usage_count > 1) { /* we don't move shared anon */ + if (page) + put_page(page); + return NULL; + } + if (do_swap_account) + entry->val = ent.val; + + return page; +} + +static struct page *mc_handle_file_pte(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, swp_entry_t *entry) +{ + struct page *page = NULL; + struct inode *inode; + struct address_space *mapping; + pgoff_t pgoff; + + if (!vma->vm_file) /* anonymous vma */ + return NULL; + if (!move_file()) + return NULL; + + inode = vma->vm_file->f_path.dentry->d_inode; + mapping = vma->vm_file->f_mapping; + if (pte_none(ptent)) + pgoff = linear_page_index(vma, addr); + else /* pte_file(ptent) is true */ + pgoff = pte_to_pgoff(ptent); + + /* page is moved even if it's not RSS of this task(page-faulted). */ + if (!mapping_cap_swap_backed(mapping)) { /* normal file */ + page = find_get_page(mapping, pgoff); + } else { /* shmem/tmpfs file. we should take account of swap too. */ + swp_entry_t ent; + mem_cgroup_get_shmem_target(inode, pgoff, &page, &ent); + if (do_swap_account) + entry->val = ent.val; + } + + return page; +} + +static int is_target_pte_for_mc(struct vm_area_struct *vma, + unsigned long addr, pte_t ptent, union mc_target *target) +{ + struct page *page = NULL; + struct page_cgroup *pc; + int ret = 0; + swp_entry_t ent = { .val = 0 }; + + if (pte_present(ptent)) + page = mc_handle_present_pte(vma, addr, ptent); + else if (is_swap_pte(ptent)) + page = mc_handle_swap_pte(vma, addr, ptent, &ent); + else if (pte_none(ptent) || pte_file(ptent)) + page = mc_handle_file_pte(vma, addr, ptent, &ent); + + if (!page && !ent.val) + return 0; + if (page) { + pc = lookup_page_cgroup(page); + /* + * Do only loose check w/o page_cgroup lock. + * mem_cgroup_move_account() checks the pc is valid or not under + * the lock. + */ + if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) { + ret = MC_TARGET_PAGE; + if (target) + target->page = page; + } + if (!ret || !target) + put_page(page); + } + /* There is a swap entry and a page doesn't exist or isn't charged */ + if (ent.val && !ret && + css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + ret = MC_TARGET_SWAP; + if (target) + target->ent = ent; + } + return ret; +} + +static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->private; + pte_t *pte; + spinlock_t *ptl; + + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; pte++, addr += PAGE_SIZE) + if (is_target_pte_for_mc(vma, addr, *pte, NULL)) + mc.precharge++; /* increment precharge temporarily */ + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + return 0; +} + +static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) +{ + unsigned long precharge; + struct vm_area_struct *vma; + + /* We've already held the mmap_sem */ + for (vma = mm->mmap; vma; vma = vma->vm_next) { + struct mm_walk mem_cgroup_count_precharge_walk = { + .pmd_entry = mem_cgroup_count_precharge_pte_range, + .mm = mm, + .private = vma, + }; + if (is_vm_hugetlb_page(vma)) + continue; + walk_page_range(vma->vm_start, vma->vm_end, + &mem_cgroup_count_precharge_walk); + } + + precharge = mc.precharge; + mc.precharge = 0; + + return precharge; +} + +static int mem_cgroup_precharge_mc(struct mm_struct *mm) +{ + return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + struct mem_cgroup *to = mc.to; + + /* we must uncharge all the leftover precharges from mc.to */ + if (mc.precharge) { + __mem_cgroup_cancel_charge(mc.to, mc.precharge); + mc.precharge = 0; + } + /* + * we didn't uncharge from mc.from at mem_cgroup_move_account(), so + * we must uncharge here. + */ + if (mc.moved_charge) { + __mem_cgroup_cancel_charge(mc.from, mc.moved_charge); + mc.moved_charge = 0; + } + /* we must fixup refcnts and charges */ + if (mc.moved_swap) { + /* uncharge swap account from the old cgroup */ + if (!mem_cgroup_is_root(mc.from)) + res_counter_uncharge(&mc.from->memsw, + PAGE_SIZE * mc.moved_swap); + __mem_cgroup_put(mc.from, mc.moved_swap); + + if (!mem_cgroup_is_root(mc.to)) { + /* + * we charged both to->res and to->memsw, so we should + * uncharge to->res. + */ + res_counter_uncharge(&mc.to->res, + PAGE_SIZE * mc.moved_swap); + } + /* we've already done mem_cgroup_get(mc.to) */ + + mc.moved_swap = 0; + } + if (mc.mm) { + up_read(&mc.mm->mmap_sem); + mmput(mc.mm); + } + spin_lock(&mc.lock); + mc.from = NULL; + mc.to = NULL; + spin_unlock(&mc.lock); + mc.moving_task = NULL; + mc.mm = NULL; + mem_cgroup_end_move(from); + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static int mem_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + int ret = 0; + struct mem_cgroup *mem = mem_cgroup_from_cont(cgroup); + + if (mem->move_charge_at_immigrate) { + struct mm_struct *mm; + struct mem_cgroup *from = mem_cgroup_from_task(p); + + VM_BUG_ON(from == mem); + + mm = get_task_mm(p); + if (!mm) + return 0; + /* We move charges only when we move a owner of the mm */ + if (mm->owner == p) { + /* + * We do all the move charge works under one mmap_sem to + * avoid deadlock with down_write(&mmap_sem) + * -> try_charge() -> if (mc.moving_task) -> sleep. + */ + down_read(&mm->mmap_sem); + + VM_BUG_ON(mc.from); + VM_BUG_ON(mc.to); + VM_BUG_ON(mc.precharge); + VM_BUG_ON(mc.moved_charge); + VM_BUG_ON(mc.moved_swap); + VM_BUG_ON(mc.moving_task); + VM_BUG_ON(mc.mm); + + mem_cgroup_start_move(from); + spin_lock(&mc.lock); + mc.from = from; + mc.to = mem; + mc.precharge = 0; + mc.moved_charge = 0; + mc.moved_swap = 0; + spin_unlock(&mc.lock); + mc.moving_task = current; + mc.mm = mm; + + ret = mem_cgroup_precharge_mc(mm); + if (ret) + mem_cgroup_clear_mc(); + /* We call up_read() and mmput() in clear_mc(). */ + } else + mmput(mm); + } + return ret; +} + +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + mem_cgroup_clear_mc(); +} + +static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + int ret = 0; + struct vm_area_struct *vma = walk->private; + pte_t *pte; + spinlock_t *ptl; + +retry: + pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); + for (; addr != end; addr += PAGE_SIZE) { + pte_t ptent = *(pte++); + union mc_target target; + int type; + struct page *page; + struct page_cgroup *pc; + swp_entry_t ent; + + if (!mc.precharge) + break; + + type = is_target_pte_for_mc(vma, addr, ptent, &target); + switch (type) { + case MC_TARGET_PAGE: + page = target.page; + if (isolate_lru_page(page)) + goto put; + pc = lookup_page_cgroup(page); + if (!mem_cgroup_move_account(pc, + mc.from, mc.to, false)) { + mc.precharge--; + /* we uncharge from mc.from later. */ + mc.moved_charge++; + } + putback_lru_page(page); +put: /* is_target_pte_for_mc() gets the page */ + put_page(page); + break; + case MC_TARGET_SWAP: + ent = target.ent; + if (!mem_cgroup_move_swap_account(ent, + mc.from, mc.to, false)) { + mc.precharge--; + /* we fixup refcnts and charges later. */ + mc.moved_swap++; + } + break; + default: + break; + } + } + pte_unmap_unlock(pte - 1, ptl); + cond_resched(); + + if (addr != end) { + /* + * We have consumed all precharges we got in can_attach(). + * We try charge one by one, but don't do any additional + * charges to mc.to if we have failed in charge once in attach() + * phase. + */ + ret = mem_cgroup_do_precharge(1); + if (!ret) + goto retry; + } + + return ret; +} + +static void mem_cgroup_move_charge(struct mm_struct *mm) +{ + struct vm_area_struct *vma; + + lru_add_drain_all(); + /* We've already held the mmap_sem */ + for (vma = mm->mmap; vma; vma = vma->vm_next) { + int ret; + struct mm_walk mem_cgroup_move_charge_walk = { + .pmd_entry = mem_cgroup_move_charge_pte_range, + .mm = mm, + .private = vma, + }; + if (is_vm_hugetlb_page(vma)) + continue; + ret = walk_page_range(vma->vm_start, vma->vm_end, + &mem_cgroup_move_charge_walk); + if (ret) + /* + * means we have consumed all precharges and failed in + * doing additional charge. Just abandon here. + */ + break; + } +} + static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p) + struct task_struct *p, + bool threadgroup) +{ + if (!mc.mm) + /* no need to move charge */ + return; + + mem_cgroup_move_charge(mc.mm); + mem_cgroup_clear_mc(); +} +#else /* !CONFIG_MMU */ +static int mem_cgroup_can_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) +{ + return 0; +} +static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss, + struct cgroup *cgroup, + struct task_struct *p, + bool threadgroup) { - mutex_lock(&memcg_tasklist); - /* - * FIXME: It's better to move charges of this process from old - * memcg to new memcg. But it's just on TODO-List now. - */ - mutex_unlock(&memcg_tasklist); } +static void mem_cgroup_move_task(struct cgroup_subsys *ss, + struct cgroup *cont, + struct cgroup *old_cont, + struct task_struct *p, + bool threadgroup) +{ +} +#endif struct cgroup_subsys mem_cgroup_subsys = { .name = "memory", @@ -2612,16 +4918,28 @@ struct cgroup_subsys mem_cgroup_subsys = { .pre_destroy = mem_cgroup_pre_destroy, .destroy = mem_cgroup_destroy, .populate = mem_cgroup_populate, + .can_attach = mem_cgroup_can_attach, + .cancel_attach = mem_cgroup_cancel_attach, .attach = mem_cgroup_move_task, .early_init = 0, .use_id = 1, }; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +static int __init enable_swap_account(char *s) +{ + /* consider enabled if no parameter or 1 is given */ + if (!s || !strcmp(s, "1")) + really_do_swap_account = 1; + else if (!strcmp(s, "0")) + really_do_swap_account = 0; + return 1; +} +__setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - really_do_swap_account = 0; + enable_swap_account("0"); return 1; } __setup("noswapaccount", disable_swap_account); diff --git a/mm/memory-failure.c b/mm/memory-failure.c new file mode 100644 index 0000000..46ab2c0 --- /dev/null +++ b/mm/memory-failure.c @@ -0,0 +1,1467 @@ +/* + * Copyright (C) 2008, 2009 Intel Corporation + * Authors: Andi Kleen, Fengguang Wu + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + * + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a multi-bit ECC memory or cache + * failure. + * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. + */ + +/* + * Notebook: + * - hugetlb needs more code + * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages + * - pass bad pages to kdump next kernel + */ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/kernel-page-flags.h> +#include <linux/sched.h> +#include <linux/ksm.h> +#include <linux/rmap.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/backing-dev.h> +#include <linux/migrate.h> +#include <linux/page-isolation.h> +#include <linux/suspend.h> +#include <linux/slab.h> +#include <linux/swapops.h> +#include <linux/hugetlb.h> +#include <linux/memory_hotplug.h> +#include "internal.h" + +int sysctl_memory_failure_early_kill __read_mostly = 0; + +int sysctl_memory_failure_recovery __read_mostly = 1; + +atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); + +#if defined(CONFIG_HWPOISON_INJECT) || defined(CONFIG_HWPOISON_INJECT_MODULE) + +u32 hwpoison_filter_enable = 0; +u32 hwpoison_filter_dev_major = ~0U; +u32 hwpoison_filter_dev_minor = ~0U; +u64 hwpoison_filter_flags_mask; +u64 hwpoison_filter_flags_value; +EXPORT_SYMBOL_GPL(hwpoison_filter_enable); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_major); +EXPORT_SYMBOL_GPL(hwpoison_filter_dev_minor); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_mask); +EXPORT_SYMBOL_GPL(hwpoison_filter_flags_value); + +static int hwpoison_filter_dev(struct page *p) +{ + struct address_space *mapping; + dev_t dev; + + if (hwpoison_filter_dev_major == ~0U && + hwpoison_filter_dev_minor == ~0U) + return 0; + + /* + * page_mapping() does not accept slab pages. + */ + if (PageSlab(p)) + return -EINVAL; + + mapping = page_mapping(p); + if (mapping == NULL || mapping->host == NULL) + return -EINVAL; + + dev = mapping->host->i_sb->s_dev; + if (hwpoison_filter_dev_major != ~0U && + hwpoison_filter_dev_major != MAJOR(dev)) + return -EINVAL; + if (hwpoison_filter_dev_minor != ~0U && + hwpoison_filter_dev_minor != MINOR(dev)) + return -EINVAL; + + return 0; +} + +static int hwpoison_filter_flags(struct page *p) +{ + if (!hwpoison_filter_flags_mask) + return 0; + + if ((stable_page_flags(p) & hwpoison_filter_flags_mask) == + hwpoison_filter_flags_value) + return 0; + else + return -EINVAL; +} + +/* + * This allows stress tests to limit test scope to a collection of tasks + * by putting them under some memcg. This prevents killing unrelated/important + * processes such as /sbin/init. Note that the target task may share clean + * pages with init (eg. libc text), which is harmless. If the target task + * share _dirty_ pages with another task B, the test scheme must make sure B + * is also included in the memcg. At last, due to race conditions this filter + * can only guarantee that the page either belongs to the memcg tasks, or is + * a freed page. + */ +#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +u64 hwpoison_filter_memcg; +EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); +static int hwpoison_filter_task(struct page *p) +{ + struct mem_cgroup *mem; + struct cgroup_subsys_state *css; + unsigned long ino; + + if (!hwpoison_filter_memcg) + return 0; + + mem = try_get_mem_cgroup_from_page(p); + if (!mem) + return -EINVAL; + + css = mem_cgroup_css(mem); + /* root_mem_cgroup has NULL dentries */ + if (!css->cgroup->dentry) + return -EINVAL; + + ino = css->cgroup->dentry->d_inode->i_ino; + css_put(css); + + if (ino != hwpoison_filter_memcg) + return -EINVAL; + + return 0; +} +#else +static int hwpoison_filter_task(struct page *p) { return 0; } +#endif + +int hwpoison_filter(struct page *p) +{ + if (!hwpoison_filter_enable) + return 0; + + if (hwpoison_filter_dev(p)) + return -EINVAL; + + if (hwpoison_filter_flags(p)) + return -EINVAL; + + if (hwpoison_filter_task(p)) + return -EINVAL; + + return 0; +} +#else +int hwpoison_filter(struct page *p) +{ + return 0; +} +#endif + +EXPORT_SYMBOL_GPL(hwpoison_filter); + +/* + * Send all the processes who have the page mapped an ``action optional'' + * signal. + */ +static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, + unsigned long pfn, struct page *page) +{ + struct siginfo si; + int ret; + + printk(KERN_ERR + "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", + pfn, t->comm, t->pid); + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_MCEERR_AO; + si.si_addr = (void *)addr; +#ifdef __ARCH_SI_TRAPNO + si.si_trapno = trapno; +#endif + si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + /* + * Don't use force here, it's convenient if the signal + * can be temporarily blocked. + * This could cause a loop when the user sets SIGBUS + * to SIG_IGN, but hopefully noone will do that? + */ + ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + if (ret < 0) + printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); + return ret; +} + +/* + * When a unknown page type is encountered drain as many buffers as possible + * in the hope to turn the page into a LRU or free page, which we can handle. + */ +void shake_page(struct page *p, int access) +{ + if (!PageSlab(p)) { + lru_add_drain_all(); + if (PageLRU(p)) + return; + drain_all_pages(); + if (PageLRU(p) || is_free_buddy_page(p)) + return; + } + + /* + * Only all shrink_slab here (which would also + * shrink other caches) if access is not potentially fatal. + */ + if (access) { + int nr; + do { + nr = shrink_slab(1000, GFP_KERNEL, 1000); + if (page_count(p) == 1) + break; + } while (nr > 10); + } +} +EXPORT_SYMBOL_GPL(shake_page); + +/* + * Kill all processes that have a poisoned page mapped and then isolate + * the page. + * + * General strategy: + * Find all processes having the page mapped and kill them. + * But we keep a page reference around so that the page is not + * actually freed yet. + * Then stash the page away + * + * There's no convenient way to get back to mapped processes + * from the VMAs. So do a brute-force search over all + * running processes. + * + * Remember that machine checks are not common (or rather + * if they are common you have other problems), so this shouldn't + * be a performance issue. + * + * Also there are some races possible while we get from the + * error detection to actually handle it. + */ + +struct to_kill { + struct list_head nd; + struct task_struct *tsk; + unsigned long addr; + char addr_valid; +}; + +/* + * Failure handling: if we can't find or can't kill a process there's + * not much we can do. We just print a message and ignore otherwise. + */ + +/* + * Schedule a process for later kill. + * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * TBD would GFP_NOIO be enough? + */ +static void add_to_kill(struct task_struct *tsk, struct page *p, + struct vm_area_struct *vma, + struct list_head *to_kill, + struct to_kill **tkc) +{ + struct to_kill *tk; + + if (*tkc) { + tk = *tkc; + *tkc = NULL; + } else { + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); + if (!tk) { + printk(KERN_ERR + "MCE: Out of memory while machine check handling\n"); + return; + } + } + tk->addr = page_address_in_vma(p, vma); + tk->addr_valid = 1; + + /* + * In theory we don't have to kill when the page was + * munmaped. But it could be also a mremap. Since that's + * likely very rare kill anyways just out of paranoia, but use + * a SIGKILL because the error is not contained anymore. + */ + if (tk->addr == -EFAULT) { + pr_info("MCE: Unable to find user space address %lx in %s\n", + page_to_pfn(p), tsk->comm); + tk->addr_valid = 0; + } + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Kill the processes that have been collected earlier. + * + * Only do anything when DOIT is set, otherwise just free the list + * (this is used for clean pages which do not need killing) + * Also when FAIL is set do a force kill because something went + * wrong earlier. + */ +static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, + int fail, struct page *page, unsigned long pfn) +{ + struct to_kill *tk, *next; + + list_for_each_entry_safe (tk, next, to_kill, nd) { + if (doit) { + /* + * In case something went wrong with munmapping + * make sure the process doesn't catch the + * signal and then access the memory. Just kill it. + */ + if (fail || tk->addr_valid == 0) { + printk(KERN_ERR + "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); + force_sig(SIGKILL, tk->tsk); + } + + /* + * In theory the process could have mapped + * something else on the address in-between. We could + * check for that, but we need to tell the + * process anyways. + */ + else if (kill_proc_ao(tk->tsk, tk->addr, trapno, + pfn, page) < 0) + printk(KERN_ERR + "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); + } + put_task_struct(tk->tsk); + kfree(tk); + } +} + +static int task_early_kill(struct task_struct *tsk) +{ + if (!tsk->mm) + return 0; + if (tsk->flags & PF_MCE_PROCESS) + return !!(tsk->flags & PF_MCE_EARLY); + return sysctl_memory_failure_early_kill; +} + +/* + * Collect processes when the error hit an anonymous page. + */ +static void collect_procs_anon(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct anon_vma *av; + + read_lock(&tasklist_lock); + av = page_lock_anon_vma(page); + if (av == NULL) /* Not actually mapped anymore */ + goto out; + for_each_process (tsk) { + struct anon_vma_chain *vmac; + + if (!task_early_kill(tsk)) + continue; + list_for_each_entry(vmac, &av->head, same_anon_vma) { + vma = vmac->vma; + if (!page_mapped_in_vma(page, vma)) + continue; + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + page_unlock_anon_vma(av); +out: + read_unlock(&tasklist_lock); +} + +/* + * Collect processes when the error hit a file mapped page. + */ +static void collect_procs_file(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct prio_tree_iter iter; + struct address_space *mapping = page->mapping; + + /* + * A note on the locking order between the two locks. + * We don't rely on this particular order. + * If you have some other code that needs a different order + * feel free to switch them around. Or add a reverse link + * from mm_struct to task_struct, then this could be all + * done without taking tasklist_lock and looping over all tasks. + */ + + read_lock(&tasklist_lock); + spin_lock(&mapping->i_mmap_lock); + for_each_process(tsk) { + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!task_early_kill(tsk)) + continue; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + pgoff) { + /* + * Send early kill signal to tasks where a vma covers + * the page but the corrupted page is not necessarily + * mapped it in its pte. + * Assume applications who requested early kill want + * to be informed of all such data corruptions. + */ + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + spin_unlock(&mapping->i_mmap_lock); + read_unlock(&tasklist_lock); +} + +/* + * Collect the processes who have the corrupted page mapped to kill. + * This is done in two steps for locking reasons. + * First preallocate one tokill structure outside the spin locks, + * so that we can kill at least one process reasonably reliable. + */ +static void collect_procs(struct page *page, struct list_head *tokill) +{ + struct to_kill *tk; + + if (!page->mapping) + return; + + tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); + if (!tk) + return; + if (PageAnon(page)) + collect_procs_anon(page, tokill, &tk); + else + collect_procs_file(page, tokill, &tk); + kfree(tk); +} + +/* + * Error handlers for various types of pages. + */ + +enum outcome { + IGNORED, /* Error: cannot be handled */ + FAILED, /* Error: handling failed */ + DELAYED, /* Will be handled later */ + RECOVERED, /* Successfully recovered */ +}; + +static const char *action_name[] = { + [IGNORED] = "Ignored", + [FAILED] = "Failed", + [DELAYED] = "Delayed", + [RECOVERED] = "Recovered", +}; + +/* + * XXX: It is possible that a page is isolated from LRU cache, + * and then kept in swap cache or failed to remove from page cache. + * The page count will stop it from being freed by unpoison. + * Stress tests should be aware of this memory leak problem. + */ +static int delete_from_lru_cache(struct page *p) +{ + if (!isolate_lru_page(p)) { + /* + * Clear sensible page flags, so that the buddy system won't + * complain when the page is unpoison-and-freed. + */ + ClearPageActive(p); + ClearPageUnevictable(p); + /* + * drop the page count elevated by isolate_lru_page() + */ + page_cache_release(p); + return 0; + } + return -EIO; +} + +/* + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we + * could be more sophisticated. + */ +static int me_kernel(struct page *p, unsigned long pfn) +{ + return IGNORED; +} + +/* + * Page in unknown state. Do nothing. + */ +static int me_unknown(struct page *p, unsigned long pfn) +{ + printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + return FAILED; +} + +/* + * Clean (or cleaned) page cache page. + */ +static int me_pagecache_clean(struct page *p, unsigned long pfn) +{ + int err; + int ret = FAILED; + struct address_space *mapping; + + delete_from_lru_cache(p); + + /* + * For anonymous pages we're done the only reference left + * should be the one m_f() holds. + */ + if (PageAnon(p)) + return RECOVERED; + + /* + * Now truncate the page in the page cache. This is really + * more like a "temporary hole punch" + * Don't do this for block devices when someone else + * has a reference, because it could be file system metadata + * and that's not safe to truncate. + */ + mapping = page_mapping(p); + if (!mapping) { + /* + * Page has been teared down in the meanwhile + */ + return FAILED; + } + + /* + * Truncation is a bit tricky. Enable it per file system for now. + * + * Open: to take i_mutex or not for this? Right now we don't. + */ + if (mapping->a_ops->error_remove_page) { + err = mapping->a_ops->error_remove_page(mapping, p); + if (err != 0) { + printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_info("MCE %#lx: failed to release buffers\n", pfn); + } else { + ret = RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = RECOVERED; + else + printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", + pfn); + } + return ret; +} + +/* + * Dirty cache page page + * Issues: when the error hit a hole page the error is not properly + * propagated. + */ +static int me_pagecache_dirty(struct page *p, unsigned long pfn) +{ + struct address_space *mapping = page_mapping(p); + + SetPageError(p); + /* TBD: print more information about the file. */ + if (mapping) { + /* + * IO error will be reported by write(), fsync(), etc. + * who check the mapping. + * This way the application knows that something went + * wrong with its dirty file data. + * + * There's one open issue: + * + * The EIO will be only reported on the next IO + * operation and then cleared through the IO map. + * Normally Linux has two mechanisms to pass IO error + * first through the AS_EIO flag in the address space + * and then through the PageError flag in the page. + * Since we drop pages on memory failure handling the + * only mechanism open to use is through AS_AIO. + * + * This has the disadvantage that it gets cleared on + * the first operation that returns an error, while + * the PageError bit is more sticky and only cleared + * when the page is reread or dropped. If an + * application assumes it will always get error on + * fsync, but does other operations on the fd before + * and the page is dropped inbetween then the error + * will not be properly reported. + * + * This can already happen even without hwpoisoned + * pages: first on metadata IO errors (which only + * report through AS_EIO) or when the page is dropped + * at the wrong time. + * + * So right now we assume that the application DTRT on + * the first EIO, but we're not worse than other parts + * of the kernel. + */ + mapping_set_error(mapping, EIO); + } + + return me_pagecache_clean(p, pfn); +} + +/* + * Clean and dirty swap cache. + * + * Dirty swap cache page is tricky to handle. The page could live both in page + * cache and swap cache(ie. page is freshly swapped in). So it could be + * referenced concurrently by 2 types of PTEs: + * normal PTEs and swap PTEs. We try to handle them consistently by calling + * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, + * and then + * - clear dirty bit to prevent IO + * - remove from LRU + * - but keep in the swap cache, so that when we return to it on + * a later page fault, we know the application is accessing + * corrupted data and shall be killed (we installed simple + * interception code in do_swap_page to catch it). + * + * Clean swap cache pages can be directly isolated. A later page fault will + * bring in the known good data from disk. + */ +static int me_swapcache_dirty(struct page *p, unsigned long pfn) +{ + ClearPageDirty(p); + /* Trigger EIO in shmem: */ + ClearPageUptodate(p); + + if (!delete_from_lru_cache(p)) + return DELAYED; + else + return FAILED; +} + +static int me_swapcache_clean(struct page *p, unsigned long pfn) +{ + delete_from_swap_cache(p); + + if (!delete_from_lru_cache(p)) + return RECOVERED; + else + return FAILED; +} + +/* + * Huge pages. Needs work. + * Issues: + * - Error on hugepage is contained in hugepage unit (not in raw page unit.) + * To narrow down kill region to one page, we need to break up pmd. + */ +static int me_huge_page(struct page *p, unsigned long pfn) +{ + int res = 0; + struct page *hpage = compound_head(p); + /* + * We can safely recover from error on free or reserved (i.e. + * not in-use) hugepage by dequeuing it from freelist. + * To check whether a hugepage is in-use or not, we can't use + * page->lru because it can be used in other hugepage operations, + * such as __unmap_hugepage_range() and gather_surplus_pages(). + * So instead we use page_mapping() and PageAnon(). + * We assume that this function is called with page lock held, + * so there is no race between isolation and mapping/unmapping. + */ + if (!(page_mapping(hpage) || PageAnon(hpage))) { + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; + } + return DELAYED; +} + +/* + * Various page states we can handle. + * + * A page state is defined by its current page->flags bits. + * The table matches them in order and calls the right handler. + * + * This is quite tricky because we can access page at any time + * in its live cycle, so all accesses have to be extremly careful. + * + * This is not complete. More states could be added. + * For any missing state don't attempt recovery. + */ + +#define dirty (1UL << PG_dirty) +#define sc (1UL << PG_swapcache) +#define unevict (1UL << PG_unevictable) +#define mlock (1UL << PG_mlocked) +#define writeback (1UL << PG_writeback) +#define lru (1UL << PG_lru) +#define swapbacked (1UL << PG_swapbacked) +#define head (1UL << PG_head) +#define tail (1UL << PG_tail) +#define compound (1UL << PG_compound) +#define slab (1UL << PG_slab) +#define reserved (1UL << PG_reserved) + +static struct page_state { + unsigned long mask; + unsigned long res; + char *msg; + int (*action)(struct page *p, unsigned long pfn); +} error_states[] = { + { reserved, reserved, "reserved kernel", me_kernel }, + /* + * free pages are specially detected outside this table: + * PG_buddy pages only make a small fraction of all free pages. + */ + + /* + * Could in theory check if slab page is free or if we can drop + * currently unused objects without touching them. But just + * treat it as standard kernel for now. + */ + { slab, slab, "kernel slab", me_kernel }, + +#ifdef CONFIG_PAGEFLAGS_EXTENDED + { head, head, "huge", me_huge_page }, + { tail, tail, "huge", me_huge_page }, +#else + { compound, compound, "huge", me_huge_page }, +#endif + + { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "swapcache", me_swapcache_clean }, + + { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, + { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + + { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, + { mlock, mlock, "mlocked LRU", me_pagecache_clean }, + + { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru, "clean LRU", me_pagecache_clean }, + + /* + * Catchall entry: must be at end. + */ + { 0, 0, "unknown page state", me_unknown }, +}; + +#undef dirty +#undef sc +#undef unevict +#undef mlock +#undef writeback +#undef lru +#undef swapbacked +#undef head +#undef tail +#undef compound +#undef slab +#undef reserved + +static void action_result(unsigned long pfn, char *msg, int result) +{ + struct page *page = pfn_to_page(pfn); + + printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", + pfn, + PageDirty(page) ? "dirty " : "", + msg, action_name[result]); +} + +static int page_action(struct page_state *ps, struct page *p, + unsigned long pfn) +{ + int result; + int count; + + result = ps->action(p, pfn); + action_result(pfn, ps->msg, result); + + count = page_count(p) - 1; + if (ps->action == me_swapcache_dirty && result == DELAYED) + count--; + if (count != 0) { + printk(KERN_ERR + "MCE %#lx: %s page still referenced by %d users\n", + pfn, ps->msg, count); + result = FAILED; + } + + /* Could do more checks here if page looks ok */ + /* + * Could adjust zone counters here to correct for the missing page. + */ + + return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; +} + +/* + * Do all that is necessary to remove user space mappings. Unmap + * the pages and send SIGBUS to the processes if the data was dirty. + */ +static int hwpoison_user_mappings(struct page *p, unsigned long pfn, + int trapno) +{ + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + struct address_space *mapping; + LIST_HEAD(tokill); + int ret; + int kill = 1; + struct page *hpage = compound_head(p); + + if (PageReserved(p) || PageSlab(p)) + return SWAP_SUCCESS; + + /* + * This check implies we don't kill processes if their pages + * are in the swap cache early. Those are always late kills. + */ + if (!page_mapped(hpage)) + return SWAP_SUCCESS; + + if (PageKsm(p)) + return SWAP_FAIL; + + if (PageSwapCache(p)) { + printk(KERN_ERR + "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + ttu |= TTU_IGNORE_HWPOISON; + } + + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + * XXX: the dirty test could be racy: set_page_dirty() may not always + * be called inside page lock (it's recommended but not enforced). + */ + mapping = page_mapping(hpage); + if (!PageDirty(hpage) && mapping && + mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(hpage)) { + SetPageDirty(hpage); + } else { + kill = 0; + ttu |= TTU_IGNORE_HWPOISON; + printk(KERN_INFO + "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + /* + * First collect all the processes that have the page + * mapped in dirty form. This has to be done before try_to_unmap, + * because ttu takes the rmap data structures down. + * + * Error handling: We ignore errors here because + * there's nothing that can be done. + */ + if (kill) + collect_procs(hpage, &tokill); + + ret = try_to_unmap(hpage, ttu); + if (ret != SWAP_SUCCESS) + printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(hpage)); + + /* + * Now that the dirty bit has been propagated to the + * struct page and all unmaps done we can decide if + * killing is needed or not. Only kill when the page + * was dirty, otherwise the tokill list is merely + * freed. When there was a problem unmapping earlier + * use a more force-full uncatchable kill to prevent + * any accesses to the poisoned memory. + */ + kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + ret != SWAP_SUCCESS, p, pfn); + + return ret; +} + +static void set_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + SetPageHWPoison(hpage + i); +} + +static void clear_page_hwpoison_huge_page(struct page *hpage) +{ + int i; + int nr_pages = 1 << compound_order(hpage); + for (i = 0; i < nr_pages; i++) + ClearPageHWPoison(hpage + i); +} + +int __memory_failure(unsigned long pfn, int trapno, int flags) +{ + struct page_state *ps; + struct page *p; + struct page *hpage; + int res; + unsigned int nr_pages; + + if (!sysctl_memory_failure_recovery) + panic("Memory failure from trap %d on page %lx", trapno, pfn); + + if (!pfn_valid(pfn)) { + printk(KERN_ERR + "MCE %#lx: memory outside kernel control\n", + pfn); + return -ENXIO; + } + + p = pfn_to_page(pfn); + hpage = compound_head(p); + if (TestSetPageHWPoison(p)) { + printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn); + return 0; + } + + nr_pages = 1 << compound_order(hpage); + atomic_long_add(nr_pages, &mce_bad_pages); + + /* + * We need/can do nothing about count=0 pages. + * 1) it's a free page, and therefore in safe hand: + * prep_new_page() will be the gate keeper. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. + * Implies some kernel user: cannot stop them from + * R/W the page; let's pray that the page has been + * used and will be freed some time later. + * In fact it's dangerous to directly bump up page count from 0, + * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + */ + if (!(flags & MF_COUNT_INCREASED) && + !get_page_unless_zero(hpage)) { + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy", DELAYED); + return 0; + } else if (PageHuge(hpage)) { + /* + * Check "just unpoisoned", "filter hit", and + * "race with other subpage." + */ + lock_page_nosync(hpage); + if (!PageHWPoison(hpage) + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &mce_bad_pages); + return 0; + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, "free huge", + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; + } else { + action_result(pfn, "high order kernel", IGNORED); + return -EBUSY; + } + } + + /* + * We ignore non-LRU pages for good reasons. + * - PG_locked is only well defined for LRU pages and a few others + * - to avoid races with __set_page_locked() + * - to avoid races with __SetPageSlab*() (and more non-atomic ops) + * The check (unnecessarily) ignores LRU pages being isolated and + * walked by the page reclaim code, however that's not a big loss. + */ + if (!PageLRU(p) && !PageHuge(p)) + shake_page(p, 0); + if (!PageLRU(p) && !PageHuge(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", DELAYED); + return 0; + } + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; + } + + /* + * Lock the page and wait for writeback to finish. + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ + lock_page_nosync(hpage); + + /* + * unpoison always clear PG_hwpoison inside page lock + */ + if (!PageHWPoison(p)) { + printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn); + res = 0; + goto out; + } + if (hwpoison_filter(p)) { + if (TestClearPageHWPoison(p)) + atomic_long_sub(nr_pages, &mce_bad_pages); + unlock_page(hpage); + put_page(hpage); + return 0; + } + + /* + * For error on the tail page, we should set PG_hwpoison + * on the head page to show that the hugepage is hwpoisoned + */ + if (PageTail(p) && TestSetPageHWPoison(hpage)) { + action_result(pfn, "hugepage already hardware poisoned", + IGNORED); + unlock_page(hpage); + put_page(hpage); + return 0; + } + /* + * Set PG_hwpoison on all pages in an error hugepage, + * because containment is done in hugepage unit for now. + * Since we have done TestSetPageHWPoison() for the head page with + * page lock held, we can safely set PG_hwpoison bits on tail pages. + */ + if (PageHuge(p)) + set_page_hwpoison_huge_page(hpage); + + wait_on_page_writeback(p); + + /* + * Now take care of user space mappings. + * Abort on fail: __remove_from_page_cache() assumes unmapped page. + */ + if (hwpoison_user_mappings(p, pfn, trapno) != SWAP_SUCCESS) { + printk(KERN_ERR "MCE %#lx: cannot unmap page, give up\n", pfn); + res = -EBUSY; + goto out; + } + + /* + * Torn down by someone else? + */ + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + action_result(pfn, "already truncated LRU", IGNORED); + res = -EBUSY; + goto out; + } + + res = -EBUSY; + for (ps = error_states;; ps++) { + if ((p->flags & ps->mask) == ps->res) { + res = page_action(ps, p, pfn); + break; + } + } +out: + unlock_page(hpage); + return res; +} +EXPORT_SYMBOL_GPL(__memory_failure); + +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +void memory_failure(unsigned long pfn, int trapno) +{ + __memory_failure(pfn, trapno, 0); +} + +/** + * unpoison_memory - Unpoison a previously poisoned page + * @pfn: Page number of the to be unpoisoned page + * + * Software-unpoison a page that has been poisoned by + * memory_failure() earlier. + * + * This is only done on the software-level, so it only works + * for linux injected failures, not real hardware failures + * + * Returns 0 for success, otherwise -errno. + */ +int unpoison_memory(unsigned long pfn) +{ + struct page *page; + struct page *p; + int freeit = 0; + unsigned int nr_pages; + + if (!pfn_valid(pfn)) + return -ENXIO; + + p = pfn_to_page(pfn); + page = compound_head(p); + + if (!PageHWPoison(p)) { + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); + return 0; + } + + nr_pages = 1 << compound_order(page); + + if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } + if (TestClearPageHWPoison(p)) + atomic_long_sub(nr_pages, &mce_bad_pages); + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); + return 0; + } + + lock_page_nosync(page); + /* + * This test is racy because PG_hwpoison is set outside of page lock. + * That's acceptable because that won't trigger kernel panic. Instead, + * the PG_hwpoison page will be caught and isolated on the entrance to + * the free buddy page pool. + */ + if (TestClearPageHWPoison(page)) { + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); + atomic_long_sub(nr_pages, &mce_bad_pages); + freeit = 1; + if (PageHuge(page)) + clear_page_hwpoison_huge_page(page); + } + unlock_page(page); + + put_page(page); + if (freeit) + put_page(page); + + return 0; +} +EXPORT_SYMBOL(unpoison_memory); + +static struct page *new_page(struct page *p, unsigned long private, int **x) +{ + int nid = page_to_nid(p); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + nid); + else + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); +} + +/* + * Safely get reference count of an arbitrary page. + * Returns 0 for a free page, -EIO for a zero refcount page + * that is not free, and 1 for any other page type. + * For 1 the page is returned with increased page count, otherwise not. + */ +static int get_any_page(struct page *p, unsigned long pfn, int flags) +{ + int ret; + + if (flags & MF_COUNT_INCREASED) + return 1; + + /* + * The lock_memory_hotplug prevents a race with memory hotplug. + * This is a big hammer, a better would be nicer. + */ + lock_memory_hotplug(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. + */ + set_migratetype_isolate(p); + /* + * When the target page is a free hugepage, just remove it + * from free hugepage list. + */ + if (!get_page_unless_zero(compound_head(p))) { + if (PageHuge(p)) { + pr_info("get_any_page: %#lx free huge page\n", pfn); + ret = dequeue_hwpoisoned_huge_page(compound_head(p)); + } else if (is_free_buddy_page(p)) { + pr_info("get_any_page: %#lx free buddy page\n", pfn); + /* Set hwpoison bit while page is still isolated */ + SetPageHWPoison(p); + ret = 0; + } else { + pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pfn, p->flags); + ret = -EIO; + } + } else { + /* Not a free page */ + ret = 1; + } + unset_migratetype_isolate(p); + unlock_memory_hotplug(); + return ret; +} + +static int soft_offline_huge_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + if (PageHWPoison(hpage)) { + put_page(hpage); + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); + return -EBUSY; + } + + /* Keep page count to indicate a given hugepage is isolated. */ + + list_add(&hpage->lru, &pagelist); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + putback_lru_pages(&pagelist); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + return ret; + } +done: + if (!PageHWPoison(hpage)) + atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + /* keep elevated page count for bad page */ + return ret; +} + +/** + * soft_offline_page - Soft offline a page. + * @page: page to offline + * @flags: flags. Same as memory_failure(). + * + * Returns 0 on success, otherwise negated errno. + * + * Soft offline a page, by migration or invalidation, + * without killing anything. This is for the case when + * a page is not corrupted yet (so it's still valid to access), + * but has had a number of corrected errors and is better taken + * out. + * + * The actual policy on when to do that is maintained by + * user space. + * + * This should never impact any application or cause data loss, + * however it might take some time. + * + * This is not a 100% solution for all memory, but tries to be + * ``good enough'' for the majority of memory. + */ +int soft_offline_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + + if (PageHuge(page)) + return soft_offline_huge_page(page, flags); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + /* + * Page cache page we can handle? + */ + if (!PageLRU(page)) { + /* + * Try to free it. + */ + put_page(page); + shake_page(page, 1); + + /* + * Did it turn free? + */ + ret = get_any_page(page, pfn, 0); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + } + if (!PageLRU(page)) { + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", + pfn, page->flags); + return -EIO; + } + + lock_page(page); + wait_on_page_writeback(page); + + /* + * Synchronized using the page lock with memory_failure() + */ + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + pr_info("soft offline: %#lx page already poisoned\n", pfn); + return -EBUSY; + } + + /* + * Try to invalidate first. This should work for + * non dirty unmapped page cache pages. + */ + ret = invalidate_inode_page(page); + unlock_page(page); + + /* + * Drop count because page migration doesn't like raised + * counts. The page could get re-allocated, but if it becomes + * LRU the isolation will just fail. + * RED-PEN would be better to keep it isolated here, but we + * would need to fix isolation locking first. + */ + put_page(page); + if (ret == 1) { + ret = 0; + pr_info("soft_offline: %#lx: invalidated\n", pfn); + goto done; + } + + /* + * Simple invalidation didn't work. + * Try to migrate to a new page instead. migrate.c + * handles a large number of cases for us. + */ + ret = isolate_lru_page(page); + if (!ret) { + LIST_HEAD(pagelist); + + list_add(&page->lru, &pagelist); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + } + } else { + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pfn, ret, page_count(page), page->flags); + } + if (ret) + return ret; + +done: + atomic_long_add(1, &mce_bad_pages); + SetPageHWPoison(page); + /* keep elevated page count for bad page */ + return ret; +} + +/* + * The caller must hold current->mm->mmap_sem in read mode. + */ +int is_hwpoison_address(unsigned long addr) +{ + pgd_t *pgdp; + pud_t pud, *pudp; + pmd_t pmd, *pmdp; + pte_t pte, *ptep; + swp_entry_t entry; + + pgdp = pgd_offset(current->mm, addr); + if (!pgd_present(*pgdp)) + return 0; + pudp = pud_offset(pgdp, addr); + pud = *pudp; + if (!pud_present(pud) || pud_large(pud)) + return 0; + pmdp = pmd_offset(pudp, addr); + pmd = *pmdp; + if (!pmd_present(pmd) || pmd_large(pmd)) + return 0; + ptep = pte_offset_map(pmdp, addr); + pte = *ptep; + pte_unmap(ptep); + if (!is_swap_pte(pte)) + return 0; + entry = pte_to_swp_entry(pte); + return is_hwpoison_entry(entry); +} +EXPORT_SYMBOL_GPL(is_hwpoison_address); diff --git a/mm/memory.c b/mm/memory.c index 98bcb90..02e48aa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -45,6 +45,7 @@ #include <linux/swap.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/module.h> #include <linux/delayacct.h> @@ -55,7 +56,9 @@ #include <linux/kallsyms.h> #include <linux/swapops.h> #include <linux/elf.h> +#include <linux/gfp.h> +#include <asm/io.h> #include <asm/pgalloc.h> #include <asm/uaccess.h> #include <asm/tlb.h> @@ -106,6 +109,89 @@ static int __init disable_randmaps(char *s) } __setup("norandmaps", disable_randmaps); +unsigned long zero_pfn __read_mostly; +unsigned long highest_memmap_pfn __read_mostly; + +/* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +static int __init init_zero_pfn(void) +{ + zero_pfn = page_to_pfn(ZERO_PAGE(0)); + return 0; +} +core_initcall(init_zero_pfn); + + +#if defined(SPLIT_RSS_COUNTING) + +static void __sync_task_rss_stat(struct task_struct *task, struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + if (task->rss_stat.count[i]) { + add_mm_counter(mm, i, task->rss_stat.count[i]); + task->rss_stat.count[i] = 0; + } + } + task->rss_stat.events = 0; +} + +static void add_mm_counter_fast(struct mm_struct *mm, int member, int val) +{ + struct task_struct *task = current; + + if (likely(task->mm == mm)) + task->rss_stat.count[member] += val; + else + add_mm_counter(mm, member, val); +} +#define inc_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, 1) +#define dec_mm_counter_fast(mm, member) add_mm_counter_fast(mm, member, -1) + +/* sync counter once per 64 page faults */ +#define TASK_RSS_EVENTS_THRESH (64) +static void check_sync_rss_stat(struct task_struct *task) +{ + if (unlikely(task != current)) + return; + if (unlikely(task->rss_stat.events++ > TASK_RSS_EVENTS_THRESH)) + __sync_task_rss_stat(task, task->mm); +} + +unsigned long get_mm_counter(struct mm_struct *mm, int member) +{ + long val = 0; + + /* + * Don't use task->mm here...for avoiding to use task_get_mm().. + * The caller must guarantee task->mm is not invalid. + */ + val = atomic_long_read(&mm->rss_stat.count[member]); + /* + * counter is updated in asynchronous manner and may go to minus. + * But it's never be expected number for users. + */ + if (val < 0) + return 0; + return (unsigned long)val; +} + +void sync_mm_rss(struct task_struct *task, struct mm_struct *mm) +{ + __sync_task_rss_stat(task, mm); +} +#else + +#define inc_mm_counter_fast(mm, member) inc_mm_counter(mm, member) +#define dec_mm_counter_fast(mm, member) dec_mm_counter(mm, member) + +static void check_sync_rss_stat(struct task_struct *task) +{ +} + +#endif /* * If a p?d_bad entry is found while walking page tables, report @@ -135,11 +221,12 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, + unsigned long addr) { pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); - pte_free_tlb(tlb, token); + pte_free_tlb(tlb, token, addr); tlb->mm->nr_ptes--; } @@ -157,7 +244,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - free_pte_range(tlb, pmd); + free_pte_range(tlb, pmd, addr); } while (pmd++, addr = next, addr != end); start &= PUD_MASK; @@ -173,7 +260,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, pmd = pmd_offset(pud, start); pud_clear(pud); - pmd_free_tlb(tlb, pmd); + pmd_free_tlb(tlb, pmd, start); } static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, @@ -206,7 +293,7 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, pud = pud_offset(pgd, start); pgd_clear(pgd); - pud_free_tlb(tlb, pud); + pud_free_tlb(tlb, pud, start); } /* @@ -220,7 +307,6 @@ void free_pgd_range(struct mmu_gather *tlb, { pgd_t *pgd; unsigned long next; - unsigned long start; /* * The next few lines have given us lots of grief... @@ -264,7 +350,6 @@ void free_pgd_range(struct mmu_gather *tlb, if (addr > end - 1) return; - start = addr; pgd = pgd_offset(tlb->mm, addr); do { next = pgd_addr_end(addr, end); @@ -282,9 +367,10 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr = vma->vm_start; /* - * Hide vma from rmap and vmtruncate before freeing pgtables + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables */ - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); if (is_vm_hugetlb_page(vma)) { @@ -298,7 +384,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, && !is_vm_hugetlb_page(next)) { vma = next; next = vma->vm_next; - anon_vma_unlink(vma); + unlink_anon_vmas(vma); unlink_file_vma(vma); } free_pgd_range(tlb, addr, vma->vm_end, @@ -360,12 +446,20 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) return 0; } -static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) +static inline void init_rss_vec(int *rss) +{ + memset(rss, 0, sizeof(int) * NR_MM_COUNTERS); +} + +static inline void add_mm_rss_vec(struct mm_struct *mm, int *rss) { - if (file_rss) - add_mm_counter(mm, file_rss, file_rss); - if (anon_rss) - add_mm_counter(mm, anon_rss, anon_rss); + int i; + + if (current->mm == mm) + sync_mm_rss(current, mm); + for (i = 0; i < NR_MM_COUNTERS; i++) + if (rss[i]) + add_mm_counter(mm, i, rss[i]); } /* @@ -414,12 +508,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, "BUG: Bad page map in process %s pte:%08llx pmd:%08llx\n", current->comm, (long long)pte_val(pte), (long long)pmd_val(*pmd)); - if (page) { - printk(KERN_ALERT - "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", - page, (void *)page->flags, page_count(page), - page_mapcount(page), page->mapping, page->index); - } + if (page) + dump_page(page); printk(KERN_ALERT "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n", (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index); @@ -441,6 +531,20 @@ static inline int is_cow_mapping(unsigned int flags) return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; } +#ifndef is_zero_pfn +static inline int is_zero_pfn(unsigned long pfn) +{ + return pfn == zero_pfn; +} +#endif + +#ifndef my_zero_pfn +static inline unsigned long my_zero_pfn(unsigned long addr) +{ + return zero_pfn; +} +#endif + /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -496,7 +600,9 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (HAVE_PTE_SPECIAL) { if (likely(!pte_special(pte))) goto check_pfn; - if (!(vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))) + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) + return NULL; + if (!is_zero_pfn(pfn)) print_bad_pte(vma, addr, pte, NULL); return NULL; } @@ -518,6 +624,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } + if (is_zero_pfn(pfn)) + return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -538,7 +646,7 @@ out: * covered by this vma. */ -static inline void +static inline unsigned long copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma, unsigned long addr, int *rss) @@ -552,7 +660,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (!pte_file(pte)) { swp_entry_t entry = pte_to_swp_entry(pte); - swap_duplicate(entry); + if (swap_duplicate(entry) < 0) + return entry.val; + /* make sure dst_mm is on swapoff's mmlist. */ if (unlikely(list_empty(&dst_mm->mmlist))) { spin_lock(&mmlist_lock); @@ -561,7 +671,9 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, &src_mm->mmlist); spin_unlock(&mmlist_lock); } - if (is_write_migration_entry(entry) && + if (likely(!non_swap_entry(entry))) + rss[MM_SWAPENTS]++; + else if (is_write_migration_entry(entry) && is_cow_mapping(vm_flags)) { /* * COW mappings require pages in both parent @@ -595,31 +707,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, page = vm_normal_page(vma, addr, pte); if (page) { get_page(page); - page_dup_rmap(page, vma, addr); - rss[!!PageAnon(page)]++; + page_dup_rmap(page); + if (PageAnon(page)) + rss[MM_ANONPAGES]++; + else + rss[MM_FILEPAGES]++; } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); + return 0; } static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, unsigned long addr, unsigned long end) { + pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; spinlock_t *src_ptl, *dst_ptl; int progress = 0; - int rss[2]; + int rss[NR_MM_COUNTERS]; + swp_entry_t entry = (swp_entry_t){0}; again: - rss[1] = rss[0] = 0; + init_rss_vec(rss); + dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl); if (!dst_pte) return -ENOMEM; - src_pte = pte_offset_map_nested(src_pmd, addr); + src_pte = pte_offset_map(src_pmd, addr); src_ptl = pte_lockptr(src_mm, src_pmd); spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + orig_src_pte = src_pte; + orig_dst_pte = dst_pte; arch_enter_lazy_mmu_mode(); do { @@ -637,16 +758,25 @@ again: progress++; continue; } - copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma, addr, rss); + entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, + vma, addr, rss); + if (entry.val) + break; progress += 8; } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); spin_unlock(src_ptl); - pte_unmap_nested(src_pte - 1); - add_mm_rss(dst_mm, rss[0], rss[1]); - pte_unmap_unlock(dst_pte - 1, dst_ptl); + pte_unmap(orig_src_pte); + add_mm_rss_vec(dst_mm, rss); + pte_unmap_unlock(orig_dst_pte, dst_ptl); cond_resched(); + + if (entry.val) { + if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) + return -ENOMEM; + progress = 0; + } if (addr != end) goto again; return 0; @@ -766,8 +896,9 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct mm_struct *mm = tlb->mm; pte_t *pte; spinlock_t *ptl; - int file_rss = 0; - int anon_rss = 0; + int rss[NR_MM_COUNTERS]; + + init_rss_vec(rss); pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -813,14 +944,14 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, set_pte_at(mm, addr, pte, pgoff_to_pte(page->index)); if (PageAnon(page)) - anon_rss--; + rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && likely(!VM_SequentialReadHint(vma))) mark_page_accessed(page); - file_rss--; + rss[MM_FILEPAGES]--; } page_remove_rmap(page); if (unlikely(page_mapcount(page) < 0)) @@ -837,13 +968,18 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, if (pte_file(ptent)) { if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) print_bad_pte(vma, addr, ptent, NULL); - } else if - (unlikely(!free_swap_and_cache(pte_to_swp_entry(ptent)))) - print_bad_pte(vma, addr, ptent, NULL); + } else { + swp_entry_t entry = pte_to_swp_entry(ptent); + + if (!non_swap_entry(entry)) + rss[MM_SWAPENTS]--; + if (unlikely(!free_swap_and_cache(entry))) + print_bad_pte(vma, addr, ptent, NULL); + } pte_clear_not_present_full(mm, addr, pte, tlb->fullmm); } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0)); - add_mm_rss(mm, file_rss, anon_rss); + add_mm_rss_vec(mm, rss); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); @@ -906,6 +1042,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, details = NULL; BUG_ON(addr >= end); + mem_cgroup_uncharge_start(); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); do { @@ -918,6 +1055,7 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, zap_work, details); } while (pgd++, addr = next, (addr != end && *zap_work > 0)); tlb_end_vma(tlb, vma); + mem_cgroup_uncharge_end(); return addr; } @@ -1087,8 +1225,17 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/* - * Do a quick page-table lookup for a single page. +/** + * follow_page - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * + * @flags can have FOLL_ flags set, defined in <linux/mm.h> + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). */ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -1141,9 +1288,14 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto no_page; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; + page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) - goto bad_page; + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } if (flags & FOLL_GET) get_page(page); @@ -1171,65 +1323,46 @@ no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) return page; - /* Fall through to ZERO_PAGE handling */ + no_page_table: /* * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate page tables. + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. */ - if (flags & FOLL_ANON) { - page = ZERO_PAGE(0); - if (flags & FOLL_GET) - get_page(page); - BUG_ON(flags & FOLL_WRITE); - } + if ((flags & FOLL_DUMP) && + (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); return page; } -/* Can we do the FOLL_ANON optimization? */ -static inline int use_zero_page(struct vm_area_struct *vma) -{ - /* - * We don't want to optimize FOLL_ANON for make_pages_present() - * when it tries to page in a VM_LOCKED region. As to VM_SHARED, - * we want to get the page from the page tables to make sure - * that we serialize and update with any other user of that - * mapping. - */ - if (vma->vm_flags & (VM_LOCKED | VM_SHARED)) - return 0; - /* - * And if we have a fault routine, it's not an anonymous region. - */ - return !vma->vm_ops || !vma->vm_ops->fault; -} - - - int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, unsigned int gup_flags, + struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags = 0; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); - int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL); + unsigned long vm_flags; - if (len <= 0) + if (nr_pages <= 0) return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + /* * Require read or write permissions. - * If 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (gup_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (gup_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); i = 0; do { struct vm_area_struct *vma; - unsigned int foll_flags; vma = find_extend_vma(mm, start); if (!vma && in_gate_area(tsk, start)) { @@ -1241,7 +1374,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pte_t *pte; /* user gate pages are read-only */ - if (!ignore && write) + if (gup_flags & FOLL_WRITE) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1259,64 +1392,66 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - struct page *page = vm_normal_page(gate_vma, start, *pte); + struct page *page; + + page = vm_normal_page(gate_vma, start, *pte); + if (!page) { + if (!(gup_flags & FOLL_DUMP) && + is_zero_pfn(pte_pfn(*pte))) + page = pte_page(*pte); + else { + pte_unmap(pte); + return i ? : -EFAULT; + } + } pages[i] = page; - if (page) - get_page(page); + get_page(page); } pte_unmap(pte); if (vmas) vmas[i] = gate_vma; i++; start += PAGE_SIZE; - len--; + nr_pages--; continue; } if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) || - (!ignore && !(vm_flags & vma->vm_flags))) + !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &len, i, write); + &start, &nr_pages, i, gup_flags); continue; } - foll_flags = FOLL_TOUCH; - if (pages) - foll_flags |= FOLL_GET; - if (!write && use_zero_page(vma)) - foll_flags |= FOLL_ANON; - do { struct page *page; + unsigned int foll_flags = gup_flags; /* * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory, unless - * current is handling munlock--e.g., on exit. In - * that case, we are not allocating memory. Rather, - * we're only unlocking already resident/mapped pages. + * pages and potentially allocating memory. */ - if (unlikely(!ignore_sigkill && - fatal_signal_pending(current))) + if (unlikely(fatal_signal_pending(current))) return i ? i : -ERESTARTSYS; - if (write) - foll_flags |= FOLL_WRITE; - cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; - /* FOLL_WRITE matches FAULT_FLAG_WRITE! */ - ret = handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); + ret = handle_mm_fault(mm, vma, start, + (foll_flags & FOLL_WRITE) ? + FAULT_FLAG_WRITE : 0); + if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - else if (ret & VM_FAULT_SIGBUS) + if (ret & + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| + VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } @@ -1355,9 +1490,9 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vmas[i] = vma; i++; start += PAGE_SIZE; - len--; - } while (len && start < vma->vm_end); - } while (len); + nr_pages--; + } while (nr_pages && start < vma->vm_end); + } while (nr_pages); return i; } @@ -1366,7 +1501,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * @tsk: task_struct of target task * @mm: mm_struct of target mm * @start: starting user address - * @len: number of pages from start to pin + * @nr_pages: number of pages from start to pin * @write: whether pages will be written to by the caller * @force: whether to force write access even if user mapping is * readonly. This will result in the page being COWed even @@ -1378,7 +1513,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * Or NULL if the caller does not require them. * * Returns number of pages pinned. This may be fewer than the number - * requested. If len is 0 or negative, returns 0. If no pages + * requested. If nr_pages is 0 or negative, returns 0. If no pages * were pinned, returns -errno. Each page returned must be released * with a put_page() call when it is finished with. vmas will only * remain valid while mmap_sem is held. @@ -1412,24 +1547,51 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, * See also get_user_pages_fast, for performance critical applications. */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { - int flags = 0; + int flags = FOLL_TOUCH; + if (pages) + flags |= FOLL_GET; if (write) - flags |= GUP_FLAGS_WRITE; + flags |= FOLL_WRITE; if (force) - flags |= GUP_FLAGS_FORCE; + flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } - EXPORT_SYMBOL(get_user_pages); -pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. + */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ + +pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { pgd_t * pgd = pgd_offset(mm, addr); @@ -1471,7 +1633,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ get_page(page); - inc_mm_counter(mm, file_rss); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); set_pte_at(mm, addr, pte, mk_pte(page, prot)); @@ -1537,7 +1699,7 @@ static int insert_pfn(struct vm_area_struct *vma, unsigned long addr, /* Ok, finally just insert the thing.. */ entry = pte_mkspecial(pfn_pte(pfn, prot)); set_pte_at(mm, addr, pte, entry); - update_mmu_cache(vma, addr, entry); /* XXX: why not for insert_page? */ + update_mmu_cache(vma, addr, pte); /* XXX: why not for insert_page? */ retval = 0; out_unlock: @@ -1606,7 +1768,8 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * If we don't have pte special, then we have to use the pfn_valid() * based VM_MIXEDMAP scheme (see vm_normal_page), and thus we *must* * refcount the page if pfn_valid is true (hence insert_page rather - * than insert_pfn). + * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP + * without pte special, it would there be refcounted as a normal page. */ if (!HAVE_PTE_SPECIAL && pfn_valid(pfn)) { struct page *page; @@ -1781,10 +1944,10 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, token = pmd_pgtable(*pmd); do { - err = fn(pte, token, addr, data); + err = fn(pte++, token, addr, data); if (err) break; - } while (pte++, addr += PAGE_SIZE, addr != end); + } while (addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); @@ -1844,11 +2007,10 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, { pgd_t *pgd; unsigned long next; - unsigned long start = addr, end = addr + size; + unsigned long end = addr + size; int err; BUG_ON(addr >= end); - mmu_notifier_invalidate_range_start(mm, start, end); pgd = pgd_offset(mm, addr); do { next = pgd_addr_end(addr, end); @@ -1856,7 +2018,7 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr, if (err) break; } while (pgd++, addr = next, addr != end); - mmu_notifier_invalidate_range_end(mm, start, end); + return err; } EXPORT_SYMBOL_GPL(apply_to_page_range); @@ -1918,7 +2080,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) - memset(kaddr, 0, PAGE_SIZE); + clear_page(kaddr); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(dst); } else @@ -1946,6 +2108,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) + __releases(ptl) { struct page *old_page, *new_page; pte_t entry; @@ -1972,7 +2135,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(old_page)) { + if (PageAnon(old_page) && !PageKsm(old_page)) { if (!trylock_page(old_page)) { page_cache_get(old_page); pte_unmap_unlock(page_table, ptl); @@ -1987,6 +2150,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, page_cache_release(old_page); } reuse = reuse_swap_page(old_page); + if (reuse) + /* + * The page is all ours. Move it to our anon_vma so + * the rmap code will not search our parent or siblings. + * Protected against the rmap code by the page lock. + */ + page_move_anon_rmap(old_page, vma, address); unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2059,7 +2229,7 @@ reuse: entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); ret |= VM_FAULT_WRITE; goto unlock; } @@ -2073,10 +2243,19 @@ gotten: if (unlikely(anon_vma_prepare(vma))) goto oom; - VM_BUG_ON(old_page == ZERO_PAGE(0)); - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (!new_page) - goto oom; + + if (is_zero_pfn(pte_pfn(orig_pte))) { + new_page = alloc_zeroed_user_highpage_movable(vma, address); + if (!new_page) + goto oom; + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!new_page) + goto oom; + cow_user_page(new_page, old_page, address, vma); + } + __SetPageUptodate(new_page); + /* * Don't let another task, with possibly unlocked vma, * keep the mlocked page. @@ -2086,8 +2265,6 @@ gotten: clear_page_mlock(old_page); unlock_page(old_page); } - cow_user_page(new_page, old_page, address, vma); - __SetPageUptodate(new_page); if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2099,11 +2276,11 @@ gotten: if (likely(pte_same(*page_table, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { - dec_mm_counter(mm, file_rss); - inc_mm_counter(mm, anon_rss); + dec_mm_counter_fast(mm, MM_FILEPAGES); + inc_mm_counter_fast(mm, MM_ANONPAGES); } } else - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -2113,10 +2290,15 @@ gotten: * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, address, page_table); + ptep_clear_flush(vma, address, page_table); page_add_new_anon_rmap(new_page, vma, address); - set_pte_at(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); + /* + * We call the notify macro here because, when using secondary + * mmu page tables (such as kvm shadow page tables), we want the + * new page to be mapped directly into the secondary page table. + */ + set_pte_at_notify(mm, address, page_table, entry); + update_mmu_cache(vma, address, page_table); if (old_page) { /* * Only after switching the pte to the new page may @@ -2358,7 +2540,7 @@ restart: * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE - * boundary. Note that this is different from vmtruncate(), which + * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded @@ -2409,63 +2591,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @offset: file offset to start truncating - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - if (inode->i_size < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - } else { - struct address_space *mapping = inode->i_mapping; - - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. - */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - i_size_write(inode, offset); - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; - -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -} -EXPORT_SYMBOL(vmtruncate); - int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) { struct address_space *mapping = inode->i_mapping; @@ -2500,24 +2625,33 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned int flags, pte_t orig_pte) { spinlock_t *ptl; - struct page *page; + struct page *page, *swapcache = NULL; swp_entry_t entry; pte_t pte; + int locked; struct mem_cgroup *ptr = NULL; + int exclusive = 0; int ret = 0; if (!pte_unmap_same(mm, pmd, page_table, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); - if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + } else if (is_hwpoison_entry(entry)) { + ret = VM_FAULT_HWPOISON; + } else { + print_bad_pte(vma, address, orig_pte, NULL); + ret = VM_FAULT_SIGBUS; + } goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ + grab_swap_token(mm); /* Contend for token _before_ read-in */ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { @@ -2535,10 +2669,43 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + } else if (PageHWPoison(page)) { + /* + * hwpoisoned dirty swapcache pages are kept for killing + * owner processes (which may be unknown at hwpoison time) + */ + ret = VM_FAULT_HWPOISON; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto out_release; } - lock_page(page); + locked = lock_page_or_retry(page, mm, flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (!locked) { + ret |= VM_FAULT_RETRY; + goto out_release; + } + + /* + * Make sure try_to_free_swap or reuse_swap_page or swapoff did not + * release the swapcache from under us. The page pin, and pte_same + * test below, are not enough to exclude that. Even if it is still + * swapcache, we need to check that the page's swap has not changed. + */ + if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) + goto out_page; + + if (ksm_might_need_to_copy(page, vma, address)) { + swapcache = page; + page = ksm_does_need_to_copy(page, vma, address); + + if (unlikely(!page)) { + ret = VM_FAULT_OOM; + page = swapcache; + swapcache = NULL; + goto out_page; + } + } if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) { ret = VM_FAULT_OOM; @@ -2571,15 +2738,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, * discarded at swap_free(). */ - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); + dec_mm_counter_fast(mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); if ((flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); flags &= ~FAULT_FLAG_WRITE; + ret |= VM_FAULT_WRITE; + exclusive = 1; } flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); - page_add_anon_rmap(page, vma, address); + do_page_add_anon_rmap(page, vma, address, exclusive); /* It's better to call commit-charge after rmap is established */ mem_cgroup_commit_charge_swapin(page, ptr); @@ -2587,6 +2757,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) try_to_free_swap(page); unlock_page(page); + if (swapcache) { + /* + * Hold the lock to avoid the swap entry to be reused + * until we take the PT lock for the pte_same() check + * (to avoid false positives from pte_same). For + * further safety release the lock after the swap_free + * so that the swap count won't change under a + * parallel locked swapcache. + */ + unlock_page(swapcache); + page_cache_release(swapcache); + } if (flags & FAULT_FLAG_WRITE) { ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte); @@ -2596,7 +2778,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, pte); + update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); out: @@ -2606,11 +2788,50 @@ out_nomap: pte_unmap_unlock(page_table, ptl); out_page: unlock_page(page); +out_release: page_cache_release(page); + if (swapcache) { + unlock_page(swapcache); + page_cache_release(swapcache); + } return ret; } /* + * This is like a special single-page "expand_{down|up}wards()", + * except we must first make sure that 'address{-|+}PAGE_SIZE' + * doesn't hit another vma. + */ +static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address) +{ + address &= PAGE_MASK; + if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) { + struct vm_area_struct *prev = vma->vm_prev; + + /* + * Is there a mapping abutting this one below? + * + * That's only ok if it's the same stack mapping + * that has gotten split.. + */ + if (prev && prev->vm_end == address) + return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM; + + expand_stack(vma, address - PAGE_SIZE); + } + if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) { + struct vm_area_struct *next = vma->vm_next; + + /* As VM_GROWSDOWN but s/below/above/ */ + if (next && next->vm_start == address + PAGE_SIZE) + return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM; + + expand_upwards(vma, address + PAGE_SIZE); + } + return 0; +} + +/* * We enter with non-exclusive mmap_sem (to exclude vma changes, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. @@ -2623,9 +2844,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; pte_t entry; - /* Allocate our own private page. */ pte_unmap(page_table); + /* Check if we need to add a guard page to the stack */ + if (check_stack_guard_page(vma, address) < 0) + return VM_FAULT_SIGBUS; + + /* Use the zero-page for reads */ + if (!(flags & FAULT_FLAG_WRITE)) { + entry = pte_mkspecial(pfn_pte(my_zero_pfn(address), + vma->vm_page_prot)); + page_table = pte_offset_map_lock(mm, pmd, address, &ptl); + if (!pte_none(*page_table)) + goto unlock; + goto setpte; + } + + /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; page = alloc_zeroed_user_highpage_movable(vma, address); @@ -2637,17 +2872,20 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, goto oom_free_page; entry = mk_pte(page, vma->vm_page_prot); - entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (vma->vm_flags & VM_WRITE) + entry = pte_mkwrite(pte_mkdirty(entry)); page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (!pte_none(*page_table)) goto release; - inc_mm_counter(mm, anon_rss); + + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); +setpte: set_pte_at(mm, address, page_table, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); unlock: pte_unmap_unlock(page_table, ptl); return 0; @@ -2695,9 +2933,16 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, vmf.page = NULL; ret = vma->vm_ops->fault(vma, &vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | + VM_FAULT_RETRY))) return ret; + if (unlikely(PageHWPoison(vmf.page))) { + if (ret & VM_FAULT_LOCKED) + unlock_page(vmf.page); + return VM_FAULT_HWPOISON; + } + /* * For consistency in subsequent calls, make the faulted page always * locked. @@ -2789,10 +3034,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (anon) { - inc_mm_counter(mm, anon_rss); + inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); } else { - inc_mm_counter(mm, file_rss); + inc_mm_counter_fast(mm, MM_FILEPAGES); page_add_file_rmap(page); if (flags & FAULT_FLAG_WRITE) { dirty_page = page; @@ -2802,7 +3047,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, page_table); } else { if (charged) mem_cgroup_uncharge_page(page); @@ -2882,7 +3127,7 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * Page table corrupted: show pte and kill process. */ print_bad_pte(vma, address, orig_pte, NULL); - return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; } pgoff = pte_to_pgoff(orig_pte); @@ -2939,7 +3184,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, } entry = pte_mkyoung(entry); if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(vma, address, entry); + update_mmu_cache(vma, address, pte); } else { /* * This is needed only for protection faults but the arch code @@ -2948,7 +3193,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, * with threads. */ if (flags & FAULT_FLAG_WRITE) - flush_tlb_page(vma, address); + flush_tlb_fix_spurious_fault(vma, address); } unlock: pte_unmap_unlock(pte, ptl); @@ -2970,6 +3215,9 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGFAULT); + /* do counter updates before entering really critical section. */ + check_sync_rss_stat(current); + if (unlikely(is_vm_hugetlb_page(vma))) return hugetlb_fault(mm, vma, address, flags); @@ -3103,7 +3351,7 @@ int in_gate_area_no_task(unsigned long addr) #endif /* __HAVE_ARCH_GATE_AREA */ -static int follow_pte(struct mm_struct *mm, unsigned long address, +static int __follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, spinlock_t **ptlp) { pgd_t *pgd; @@ -3140,6 +3388,17 @@ out: return -EINVAL; } +static inline int follow_pte(struct mm_struct *mm, unsigned long address, + pte_t **ptepp, spinlock_t **ptlp) +{ + int res; + + /* (void) is needed to make gcc happy */ + (void) __cond_lock(*ptlp, + !(res = __follow_pte(mm, address, ptepp, ptlp))); + return res; +} + /** * follow_pfn - look up PFN at a user virtual address * @vma: memory mapping diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4412a6..2c6523a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,11 +26,31 @@ #include <linux/migrate.h> #include <linux/page-isolation.h> #include <linux/pfn.h> +#include <linux/suspend.h> +#include <linux/mm_inline.h> +#include <linux/firmware-map.h> #include <asm/tlbflush.h> #include "internal.h" +DEFINE_MUTEX(mem_hotplug_mutex); + +void lock_memory_hotplug(void) +{ + mutex_lock(&mem_hotplug_mutex); + + /* for exclusive hibernation if CONFIG_HIBERNATION=y */ + lock_system_sleep(); +} + +void unlock_memory_hotplug(void) +{ + unlock_system_sleep(); + mutex_unlock(&mem_hotplug_mutex); +} + + /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) { @@ -70,7 +90,9 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) atomic_inc(&page->_count); } -void put_page_bootmem(struct page *page) +/* reference to __meminit __free_pages_bootmem is valid + * so use __ref to tell modpost not to generate a warning */ +void __ref put_page_bootmem(struct page *page) { int type; @@ -339,8 +361,11 @@ EXPORT_SYMBOL_GPL(__remove_pages); void online_page(struct page *page) { + unsigned long pfn = page_to_pfn(page); + totalram_pages++; - num_physpages++; + if (pfn >= num_physpages) + num_physpages = pfn + 1; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) @@ -407,12 +432,14 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ + mutex_lock(&zonelists_mutex); if (!populated_zone(zone)) need_zonelists_rebuild = 1; - ret = walk_memory_resource(pfn, nr_pages, &onlined_pages, + ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + mutex_unlock(&zonelists_mutex); printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); @@ -421,7 +448,12 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; + if (need_zonelists_rebuild) + build_all_zonelists(zone); + else + zone_pcp_update(zone); + mutex_unlock(&zonelists_mutex); setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); if (onlined_pages) { @@ -429,10 +461,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); } - if (need_zonelists_rebuild) - build_all_zonelists(); - else - vm_total_pages = nr_free_pagecache_pages(); + vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); @@ -443,7 +472,8 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ -static pg_data_t *hotadd_new_pgdat(int nid, u64 start) +/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ +static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { struct pglist_data *pgdat; unsigned long zones_size[MAX_NR_ZONES] = {0}; @@ -472,6 +502,29 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } +/* + * called by cpu_up() to online a node without onlined memory. + */ +int mem_online_node(int nid) +{ + pg_data_t *pgdat; + int ret; + + lock_memory_hotplug(); + pgdat = hotadd_new_pgdat(nid, 0); + if (pgdat) { + ret = -ENOMEM; + goto out; + } + node_set_online(nid); + ret = register_one_node(nid); + BUG_ON(ret); + +out: + unlock_memory_hotplug(); + return ret; +} + /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */ int __ref add_memory(int nid, u64 start, u64 size) { @@ -480,14 +533,18 @@ int __ref add_memory(int nid, u64 start, u64 size) struct resource *res; int ret; + lock_memory_hotplug(); + res = register_memory_resource(start, size); + ret = -EEXIST; if (!res) - return -EEXIST; + goto out; if (!node_online(nid)) { pgdat = hotadd_new_pgdat(nid, start); + ret = -ENOMEM; if (!pgdat) - return -ENOMEM; + goto out; new_pgdat = 1; } @@ -510,7 +567,11 @@ int __ref add_memory(int nid, u64 start, u64 size) BUG_ON(ret); } - return ret; + /* create new memmap entry */ + firmware_map_add_hotplug(start, start + size, "System RAM"); + + goto out; + error: /* rollback pgdat allocation and others */ if (new_pgdat) @@ -518,6 +579,8 @@ error: if (res) release_memory_resource(res); +out: + unlock_memory_hotplug(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -538,45 +601,32 @@ static inline int pageblock_free(struct page *page) /* Return the start of the next active pageblock after a given page */ static struct page *next_active_pageblock(struct page *page) { - int pageblocks_stride; - /* Ensure the starting page is pageblock-aligned */ BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1)); - /* Move forward by at least 1 * pageblock_nr_pages */ - pageblocks_stride = 1; - /* If the entire pageblock is free, move to the end of free page */ - if (pageblock_free(page)) - pageblocks_stride += page_order(page) - pageblock_order; + if (pageblock_free(page)) { + int order; + /* be careful. we don't have locks, page_order can be changed.*/ + order = page_order(page); + if ((order < MAX_ORDER) && (order >= pageblock_order)) + return page + (1 << order); + } - return page + (pageblocks_stride * pageblock_nr_pages); + return page + pageblock_nr_pages; } /* Checks if this range of memory is likely to be hot-removable. */ int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { - int type; struct page *page = pfn_to_page(start_pfn); struct page *end_page = page + nr_pages; /* Check the starting page of each pageblock within the range */ for (; page < end_page; page = next_active_pageblock(page)) { - type = get_pageblock_migratetype(page); - - /* - * A pageblock containing MOVABLE or free pages is considered - * removable - */ - if (type != MIGRATE_MOVABLE && !pageblock_free(page)) - return 0; - - /* - * A pageblock starting with a PageReserved page is not - * considered removable. - */ - if (PageReserved(page)) + if (!is_pageblock_removable_nolock(page)) return 0; + cond_resched(); } /* All pageblocks in the memory block are likely to be hot-removable */ @@ -613,7 +663,7 @@ static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn) * Scanning pfn is much easier than scanning lru list. * Scan pfn from start to end and Find LRU page. */ -int scan_lru_pages(unsigned long start, unsigned long end) +static unsigned long scan_lru_pages(unsigned long start, unsigned long end) { unsigned long pfn; struct page *page; @@ -659,30 +709,34 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (!ret) { /* Success */ list_add_tail(&page->lru, &source); move_pages--; + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + } else { +#ifdef CONFIG_DEBUG_VM + printk(KERN_ALERT "removing pfn %lx from LRU failed\n", + pfn); + dump_page(page); +#endif /* Becasue we don't have big zone->lock. we should check this again here. */ - if (page_count(page)) + if (page_count(page)) { not_managed++; -#ifdef CONFIG_DEBUG_VM - printk(KERN_INFO "removing from LRU failed" - " %lx/%d/%lx\n", - pfn, page_count(page), page->flags); -#endif + ret = -EBUSY; + break; + } } } - ret = -EBUSY; - if (not_managed) { - if (!list_empty(&source)) + if (!list_empty(&source)) { + if (not_managed) { + putback_lru_pages(&source); + goto out; + } + /* this function returns # of failed pages */ + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + if (ret) putback_lru_pages(&source); - goto out; } - ret = 0; - if (list_empty(&source)) - goto out; - /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0); - out: return ret; } @@ -701,7 +755,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages, static void offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { - walk_memory_resource(start_pfn, end_pfn - start_pfn, NULL, + walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL, offline_isolated_pages_cb); } @@ -727,14 +781,14 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) long offlined = 0; int ret; - ret = walk_memory_resource(start_pfn, end_pfn - start_pfn, &offlined, + ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined, check_pages_isolated_cb); if (ret < 0) offlined = (long)ret; return offlined; } -int offline_pages(unsigned long start_pfn, +static int offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { unsigned long pfn, nr_pages, expire; @@ -754,6 +808,8 @@ int offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; + lock_memory_hotplug(); + zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; @@ -761,7 +817,7 @@ int offline_pages(unsigned long start_pfn, /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn); if (ret) - return ret; + goto out; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; @@ -789,7 +845,6 @@ repeat: ret = 0; if (drain) { lru_add_drain_all(); - flush_scheduled_work(); cond_resched(); drain_all_pages(); } @@ -811,7 +866,6 @@ repeat: } /* drain all zone's lru pagevec, this is asyncronous... */ lru_add_drain_all(); - flush_scheduled_work(); yield(); /* drain pcp pages , this is synchrouns. */ drain_all_pages(); @@ -831,15 +885,19 @@ repeat: zone->present_pages -= offlined_pages; zone->zone_pgdat->node_present_pages -= offlined_pages; totalram_pages -= offlined_pages; - num_physpages -= offlined_pages; setup_per_zone_wmarks(); calculate_zone_inactive_ratio(zone); + if (!node_present_pages(node)) { + node_clear_state(node, N_HIGH_MEMORY); + kswapd_stop(node); + } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); + unlock_memory_hotplug(); return 0; failed_removal: @@ -849,6 +907,8 @@ failed_removal: /* pushback to free area */ undo_isolate_page_range(start_pfn, end_pfn); +out: + unlock_memory_hotplug(); return ret; } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e08e2c4..11ff260 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -73,7 +73,6 @@ #include <linux/sched.h> #include <linux/nodemask.h> #include <linux/cpuset.h> -#include <linux/gfp.h> #include <linux/slab.h> #include <linux/string.h> #include <linux/module.h> @@ -85,10 +84,12 @@ #include <linux/seq_file.h> #include <linux/proc_fs.h> #include <linux/migrate.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/syscalls.h> #include <linux/ctype.h> +#include <linux/mm_inline.h> #include <asm/tlbflush.h> #include <asm/uaccess.h> @@ -118,7 +119,22 @@ struct mempolicy default_policy = { static const struct mempolicy_operations { int (*create)(struct mempolicy *pol, const nodemask_t *nodes); - void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes); + /* + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ + void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step); } mpol_ops[MPOL_MAX]; /* Check that the nodemask contains at least one populated zone */ @@ -126,9 +142,6 @@ static int is_valid_nodemask(const nodemask_t *nodemask) { int nd, k; - /* Check that there is something useful in this mask */ - k = policy_zone; - for_each_node_mask(nd, *nodemask) { struct zone *z; @@ -144,7 +157,7 @@ static int is_valid_nodemask(const nodemask_t *nodemask) static inline int mpol_store_user_nodemask(const struct mempolicy *pol) { - return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES); + return pol->flags & MPOL_MODE_FLAGS; } static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig, @@ -191,25 +204,27 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes) * Must be called holding task's alloc_lock to protect task's mems_allowed * and mempolicy. May also be called holding the mmap_semaphore for write. */ -static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) +static int mpol_set_nodemask(struct mempolicy *pol, + const nodemask_t *nodes, struct nodemask_scratch *nsc) { - nodemask_t cpuset_context_nmask; int ret; /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */ if (pol == NULL) return 0; + /* Check N_HIGH_MEMORY */ + nodes_and(nsc->mask1, + cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]); VM_BUG_ON(!nodes); if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes)) nodes = NULL; /* explicit local allocation */ else { if (pol->flags & MPOL_F_RELATIVE_NODES) - mpol_relative_nodemask(&cpuset_context_nmask, nodes, - &cpuset_current_mems_allowed); + mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1); else - nodes_and(cpuset_context_nmask, *nodes, - cpuset_current_mems_allowed); + nodes_and(nsc->mask2, *nodes, nsc->mask1); + if (mpol_store_user_nodemask(pol)) pol->w.user_nodemask = *nodes; else @@ -217,8 +232,10 @@ static int mpol_set_nodemask(struct mempolicy *pol, const nodemask_t *nodes) cpuset_current_mems_allowed; } - ret = mpol_ops[pol->mode].create(pol, - nodes ? &cpuset_context_nmask : NULL); + if (nodes) + ret = mpol_ops[pol->mode].create(pol, &nsc->mask2); + else + ret = mpol_ops[pol->mode].create(pol, NULL); return ret; } @@ -272,12 +289,19 @@ void __mpol_put(struct mempolicy *p) kmem_cache_free(policy_cache, p); } -static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes) +static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { } -static void mpol_rebind_nodemask(struct mempolicy *pol, - const nodemask_t *nodes) +/* + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -286,12 +310,31 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, else if (pol->flags & MPOL_F_RELATIVE_NODES) mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes); else { - nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed, - *nodes); - pol->w.cpuset_mems_allowed = *nodes; + /* + * if step == 1, we use ->w.cpuset_mems_allowed to cache the + * result + */ + if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) { + nodes_remap(tmp, pol->v.nodes, + pol->w.cpuset_mems_allowed, *nodes); + pol->w.cpuset_mems_allowed = step ? tmp : *nodes; + } else if (step == MPOL_REBIND_STEP2) { + tmp = pol->w.cpuset_mems_allowed; + pol->w.cpuset_mems_allowed = *nodes; + } else + BUG(); } - pol->v.nodes = tmp; + if (nodes_empty(tmp)) + tmp = *nodes; + + if (step == MPOL_REBIND_STEP1) + nodes_or(pol->v.nodes, pol->v.nodes, tmp); + else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2) + pol->v.nodes = tmp; + else + BUG(); + if (!node_isset(current->il_next, tmp)) { current->il_next = next_node(current->il_next, tmp); if (current->il_next >= MAX_NUMNODES) @@ -302,7 +345,8 @@ static void mpol_rebind_nodemask(struct mempolicy *pol, } static void mpol_rebind_preferred(struct mempolicy *pol, - const nodemask_t *nodes) + const nodemask_t *nodes, + enum mpol_rebind_step step) { nodemask_t tmp; @@ -325,16 +369,45 @@ static void mpol_rebind_preferred(struct mempolicy *pol, } } -/* Migrate a policy to a different set of nodes */ -static void mpol_rebind_policy(struct mempolicy *pol, - const nodemask_t *newmask) +/* + * mpol_rebind_policy - Migrate a policy to a different set of nodes + * + * If read-side task has no lock to protect task->mempolicy, write-side + * task will rebind the task->mempolicy by two step. The first step is + * setting all the newly nodes, and the second step is cleaning all the + * disallowed nodes. In this way, we can avoid finding no node to alloc + * page. + * If we have a lock to protect task->mempolicy in read-side, we do + * rebind directly. + * + * step: + * MPOL_REBIND_ONCE - do rebind work at once + * MPOL_REBIND_STEP1 - set all the newly nodes + * MPOL_REBIND_STEP2 - clean all the disallowed nodes + */ +static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask, + enum mpol_rebind_step step) { if (!pol) return; - if (!mpol_store_user_nodemask(pol) && + if (!mpol_store_user_nodemask(pol) && step == 0 && nodes_equal(pol->w.cpuset_mems_allowed, *newmask)) return; - mpol_ops[pol->mode].rebind(pol, newmask); + + if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING)) + return; + + if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING)) + BUG(); + + if (step == MPOL_REBIND_STEP1) + pol->flags |= MPOL_F_REBINDING; + else if (step == MPOL_REBIND_STEP2) + pol->flags &= ~MPOL_F_REBINDING; + else if (step >= MPOL_REBIND_NSTEP) + BUG(); + + mpol_ops[pol->mode].rebind(pol, newmask, step); } /* @@ -344,9 +417,10 @@ static void mpol_rebind_policy(struct mempolicy *pol, * Called with task's alloc_lock held. */ -void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new, + enum mpol_rebind_step step) { - mpol_rebind_policy(tsk->mempolicy, new); + mpol_rebind_policy(tsk->mempolicy, new, step); } /* @@ -361,7 +435,7 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) down_write(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) - mpol_rebind_policy(vma->vm_policy, new); + mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE); up_write(&mm->mmap_sem); } @@ -408,17 +482,11 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page) continue; /* - * The check for PageReserved here is important to avoid - * handling zero pages and other pages that may have been - * marked special by the system. - * - * If the PageReserved would not be checked here then f.e. - * the location of the zero page could have an influence - * on MPOL_MF_STRICT, zero pages would be counted for - * the per node stats, and there would be useless attempts - * to put zero pages on the migration list. + * vm_normal_page() filters out zero pages, but there might + * still be PageReserved pages to skip, perhaps in a VDSO. + * And we cannot move PageKsm pages sensibly or safely yet. */ - if (PageReserved(page)) + if (PageReserved(page) || PageKsm(page)) continue; nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -563,24 +631,50 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) } /* Step 2: apply policy to a range and do splits. */ -static int mbind_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct mempolicy *new) +static int mbind_range(struct mm_struct *mm, unsigned long start, + unsigned long end, struct mempolicy *new_pol) { struct vm_area_struct *next; - int err; + struct vm_area_struct *prev; + struct vm_area_struct *vma; + int err = 0; + pgoff_t pgoff; + unsigned long vmstart; + unsigned long vmend; - err = 0; - for (; vma && vma->vm_start < end; vma = next) { + vma = find_vma_prev(mm, start, &prev); + if (!vma || vma->vm_start > start) + return -EFAULT; + + for (; vma && vma->vm_start < end; prev = vma, vma = next) { next = vma->vm_next; - if (vma->vm_start < start) - err = split_vma(vma->vm_mm, vma, start, 1); - if (!err && vma->vm_end > end) - err = split_vma(vma->vm_mm, vma, end, 0); - if (!err) - err = policy_vma(vma, new); + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + + pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); + prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, + vma->anon_vma, vma->vm_file, pgoff, new_pol); + if (prev) { + vma = prev; + next = vma->vm_next; + continue; + } + if (vma->vm_start != vmstart) { + err = split_vma(vma->vm_mm, vma, vmstart, 1); + if (err) + goto out; + } + if (vma->vm_end != vmend) { + err = split_vma(vma->vm_mm, vma, vmend, 0); + if (err) + goto out; + } + err = policy_vma(vma, new_pol); if (err) - break; + goto out; } + + out: return err; } @@ -620,12 +714,17 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, { struct mempolicy *new, *old; struct mm_struct *mm = current->mm; + NODEMASK_SCRATCH(scratch); int ret; - new = mpol_new(mode, flags, nodes); - if (IS_ERR(new)) - return PTR_ERR(new); + if (!scratch) + return -ENOMEM; + new = mpol_new(mode, flags, nodes); + if (IS_ERR(new)) { + ret = PTR_ERR(new); + goto out; + } /* * prevent changing our mempolicy while show_numa_maps() * is using it. @@ -635,13 +734,13 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, if (mm) down_write(&mm->mmap_sem); task_lock(current); - ret = mpol_set_nodemask(new, nodes); + ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { task_unlock(current); if (mm) up_write(&mm->mmap_sem); mpol_put(new); - return ret; + goto out; } old = current->mempolicy; current->mempolicy = new; @@ -654,7 +753,10 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, up_write(&mm->mmap_sem); mpol_put(old); - return 0; + ret = 0; +out: + NODEMASK_SCRATCH_FREE(scratch); + return ret; } /* @@ -772,9 +874,13 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, err = 0; if (nmask) { - task_lock(current); - get_policy_nodemask(pol, nmask); - task_unlock(current); + if (mpol_store_user_nodemask(pol)) { + *nmask = pol->w.user_nodemask; + } else { + task_lock(current); + get_policy_nodemask(pol, nmask); + task_unlock(current); + } } out: @@ -797,6 +903,8 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if (!isolate_lru_page(page)) { list_add_tail(&page->lru, pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } } } @@ -816,15 +924,21 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, nodemask_t nmask; LIST_HEAD(pagelist); int err = 0; + struct vm_area_struct *vma; nodes_clear(nmask); node_set(source, nmask); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + vma = check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); + if (IS_ERR(vma)) + return PTR_ERR(vma); - if (!list_empty(&pagelist)) - err = migrate_pages(&pagelist, new_node_page, dest); + if (!list_empty(&pagelist)) { + err = migrate_pages(&pagelist, new_node_page, dest, 0); + if (err) + putback_lru_pages(&pagelist); + } return err; } @@ -852,36 +966,36 @@ int do_migrate_pages(struct mm_struct *mm, if (err) goto out; -/* - * Find a 'source' bit set in 'tmp' whose corresponding 'dest' - * bit in 'to' is not also set in 'tmp'. Clear the found 'source' - * bit in 'tmp', and return that <source, dest> pair for migration. - * The pair of nodemasks 'to' and 'from' define the map. - * - * If no pair of bits is found that way, fallback to picking some - * pair of 'source' and 'dest' bits that are not the same. If the - * 'source' and 'dest' bits are the same, this represents a node - * that will be migrating to itself, so no pages need move. - * - * If no bits are left in 'tmp', or if all remaining bits left - * in 'tmp' correspond to the same bit in 'to', return false - * (nothing left to migrate). - * - * This lets us pick a pair of nodes to migrate between, such that - * if possible the dest node is not already occupied by some other - * source node, minimizing the risk of overloading the memory on a - * node that would happen if we migrated incoming memory to a node - * before migrating outgoing memory source that same node. - * - * A single scan of tmp is sufficient. As we go, we remember the - * most recent <s, d> pair that moved (s != d). If we find a pair - * that not only moved, but what's better, moved to an empty slot - * (d is not set in tmp), then we break out then, with that pair. - * Otherwise when we finish scannng from_tmp, we at least have the - * most recent <s, d> pair that moved. If we get all the way through - * the scan of tmp without finding any node that moved, much less - * moved to an empty node, then there is nothing left worth migrating. - */ + /* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that <source, dest> pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent <s, d> pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent <s, d> pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ tmp = *from_nodes; while (!nodes_empty(tmp)) { @@ -1012,17 +1126,24 @@ static long do_mbind(unsigned long start, unsigned long len, err = migrate_prep(); if (err) - return err; + goto mpol_out; } - down_write(&mm->mmap_sem); - task_lock(current); - err = mpol_set_nodemask(new, nmask); - task_unlock(current); - if (err) { - up_write(&mm->mmap_sem); - mpol_put(new); - return err; + { + NODEMASK_SCRATCH(scratch); + if (scratch) { + down_write(&mm->mmap_sem); + task_lock(current); + err = mpol_set_nodemask(new, nmask, scratch); + task_unlock(current); + if (err) + up_write(&mm->mmap_sem); + } else + err = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); } + if (err) + goto mpol_out; + vma = check_range(mm, start, end, nmask, flags | MPOL_MF_INVERT, &pagelist); @@ -1030,17 +1151,22 @@ static long do_mbind(unsigned long start, unsigned long len, if (!IS_ERR(vma)) { int nr_failed = 0; - err = mbind_range(vma, start, end, new); + err = mbind_range(mm, start, end, new); - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma); + (unsigned long)vma, 0); + if (nr_failed) + putback_lru_pages(&pagelist); + } if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; - } + } else + putback_lru_pages(&pagelist); up_write(&mm->mmap_sem); + mpol_out: mpol_put(new); return err; } @@ -1158,33 +1284,45 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, const unsigned long __user *, new_nodes) { const struct cred *cred = current_cred(), *tcred; - struct mm_struct *mm; + struct mm_struct *mm = NULL; struct task_struct *task; - nodemask_t old; - nodemask_t new; nodemask_t task_nodes; int err; + nodemask_t *old; + nodemask_t *new; + NODEMASK_SCRATCH(scratch); - err = get_nodes(&old, old_nodes, maxnode); + if (!scratch) + return -ENOMEM; + + old = &scratch->mask1; + new = &scratch->mask2; + + err = get_nodes(old, old_nodes, maxnode); if (err) - return err; + goto out; - err = get_nodes(&new, new_nodes, maxnode); + err = get_nodes(new, new_nodes, maxnode); if (err) - return err; + goto out; /* Find the mm_struct */ + rcu_read_lock(); read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); - return -ESRCH; + rcu_read_unlock(); + err = -ESRCH; + goto out; } mm = get_task_mm(task); read_unlock(&tasklist_lock); + rcu_read_unlock(); + err = -EINVAL; if (!mm) - return -EINVAL; + goto out; /* * Check if this process has the right to modify the specified @@ -1205,12 +1343,12 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ - if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { + if (!nodes_subset(*new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } - if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { + if (!nodes_subset(*new, node_states[N_HIGH_MEMORY])) { err = -EINVAL; goto out; } @@ -1219,10 +1357,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, if (err) goto out; - err = do_migrate_pages(mm, &old, &new, + err = do_migrate_pages(mm, old, new, capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: - mmput(mm); + if (mm) + mmput(mm); + NODEMASK_SCRATCH_FREE(scratch); + return err; } @@ -1396,15 +1537,13 @@ static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy) /* * Normally, MPOL_BIND allocations are node-local within the * allowed nodemask. However, if __GFP_THISNODE is set and the - * current node is part of the mask, we use the zonelist for + * current node isn't part of the mask, we use the zonelist for * the first node in the mask instead. */ if (unlikely(gfp & __GFP_THISNODE) && unlikely(!node_isset(nd, policy->v.nodes))) nd = first_node(policy->v.nodes); break; - case MPOL_INTERLEAVE: /* should not happen */ - break; default: BUG(); } @@ -1461,7 +1600,7 @@ unsigned slab_node(struct mempolicy *policy) (void)first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes, &zone); - return zone->node; + return zone ? zone->node : numa_node_id(); } default: @@ -1524,6 +1663,8 @@ static inline unsigned interleave_nid(struct mempolicy *pol, * to the struct mempolicy for conditional unref after allocation. * If the effective policy is 'BIND, returns a pointer to the mempolicy's * @nodemask for filtering the zonelist. + * + * Must be protected by get_mems_allowed() */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, gfp_t gfp_flags, struct mempolicy **mpol, @@ -1544,8 +1685,101 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, } return zl; } + +/* + * init_nodemask_of_mempolicy + * + * If the current task's mempolicy is "default" [NULL], return 'false' + * to indicate default policy. Otherwise, extract the policy nodemask + * for 'bind' or 'interleave' policy into the argument nodemask, or + * initialize the argument nodemask to contain the single node for + * 'preferred' or 'local' policy and return 'true' to indicate presence + * of non-default mempolicy. + * + * We don't bother with reference counting the mempolicy [mpol_get/put] + * because the current task is examining it's own mempolicy and a task's + * mempolicy is only ever changed by the task itself. + * + * N.B., it is the caller's responsibility to free a returned nodemask. + */ +bool init_nodemask_of_mempolicy(nodemask_t *mask) +{ + struct mempolicy *mempolicy; + int nid; + + if (!(mask && current->mempolicy)) + return false; + + task_lock(current); + mempolicy = current->mempolicy; + switch (mempolicy->mode) { + case MPOL_PREFERRED: + if (mempolicy->flags & MPOL_F_LOCAL) + nid = numa_node_id(); + else + nid = mempolicy->v.preferred_node; + init_nodemask_of_node(mask, nid); + break; + + case MPOL_BIND: + /* Fall through */ + case MPOL_INTERLEAVE: + *mask = mempolicy->v.nodes; + break; + + default: + BUG(); + } + task_unlock(current); + + return true; +} #endif +/* + * mempolicy_nodemask_intersects + * + * If tsk's mempolicy is "default" [NULL], return 'true' to indicate default + * policy. Otherwise, check for intersection between mask and the policy + * nodemask for 'bind' or 'interleave' policy. For 'perferred' or 'local' + * policy, always return true since it may allocate elsewhere on fallback. + * + * Takes task_lock(tsk) to prevent freeing of its mempolicy. + */ +bool mempolicy_nodemask_intersects(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct mempolicy *mempolicy; + bool ret = true; + + if (!mask) + return ret; + task_lock(tsk); + mempolicy = tsk->mempolicy; + if (!mempolicy) + goto out; + + switch (mempolicy->mode) { + case MPOL_PREFERRED: + /* + * MPOL_PREFERRED and MPOL_F_LOCAL are only preferred nodes to + * allocate from, they may fallback to other nodes when oom. + * Thus, it's possible for tsk to have allocated memory from + * nodes in mask. + */ + break; + case MPOL_BIND: + case MPOL_INTERLEAVE: + ret = nodes_intersects(mempolicy->v.nodes, *mask); + break; + default: + BUG(); + } +out: + task_unlock(tsk); + return ret; +} + /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, @@ -1588,13 +1822,17 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; + struct page *page; + get_mems_allowed(); if (unlikely(pol->mode == MPOL_INTERLEAVE)) { unsigned nid; nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - return alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, 0, nid); + put_mems_allowed(); + return page; } zl = policy_zonelist(gfp, pol); if (unlikely(mpol_needs_cond_ref(pol))) { @@ -1604,12 +1842,15 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); + put_mems_allowed(); return page; } /* * fast path: default or task policy */ - return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } /** @@ -1634,18 +1875,23 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) struct page *alloc_pages_current(gfp_t gfp, unsigned order) { struct mempolicy *pol = current->mempolicy; + struct page *page; if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; + get_mems_allowed(); /* * No reference counting needed for current->mempolicy * nor system default_policy */ if (pol->mode == MPOL_INTERLEAVE) - return alloc_page_interleave(gfp, order, interleave_nodes(pol)); - return __alloc_pages_nodemask(gfp, order, + page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); + else + page = __alloc_pages_nodemask(gfp, order, policy_zonelist(gfp, pol), policy_nodemask(gfp, pol)); + put_mems_allowed(); + return page; } EXPORT_SYMBOL(alloc_pages_current); @@ -1655,6 +1901,9 @@ EXPORT_SYMBOL(alloc_pages_current); * with the mems_allowed returned by cpuset_mems_allowed(). This * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). + * + * current's mempolicy may be rebinded by the other task(the task that changes + * cpuset's mems), so we needn't do rebind work for current task. */ /* Slow path of a mempolicy duplicate */ @@ -1664,11 +1913,24 @@ struct mempolicy *__mpol_dup(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + + /* task's mempolicy is protected by alloc_lock */ + if (old == current->mempolicy) { + task_lock(current); + *new = *old; + task_unlock(current); + } else + *new = *old; + + rcu_read_lock(); if (current_cpuset_is_being_rebound()) { nodemask_t mems = cpuset_mems_allowed(current); - mpol_rebind_policy(old, &mems); + if (new->flags & MPOL_F_REBINDING) + mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2); + else + mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE); } - *new = *old; + rcu_read_unlock(); atomic_set(&new->refcnt, 1); return new; } @@ -1695,16 +1957,6 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, return tompol; } -static int mpol_match_intent(const struct mempolicy *a, - const struct mempolicy *b) -{ - if (a->flags != b->flags) - return 0; - if (!mpol_store_user_nodemask(a)) - return 1; - return nodes_equal(a->w.user_nodemask, b->w.user_nodemask); -} - /* Slow path of a mempolicy comparison */ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) { @@ -1712,8 +1964,12 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b) return 0; if (a->mode != b->mode) return 0; - if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b)) + if (a->flags != b->flags) return 0; + if (mpol_store_user_nodemask(a)) + if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) + return 0; + switch (a->mode) { case MPOL_BIND: /* Fall through */ @@ -1891,6 +2147,7 @@ restart: * Install non-NULL @mpol in inode's shared policy rb-tree. * On entry, the current task has a reference on a non-NULL @mpol. * This must be released on exit. + * This is called at get_inode() calls and we can use GFP_KERNEL. */ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) { @@ -1902,28 +2159,32 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) if (mpol) { struct vm_area_struct pvma; struct mempolicy *new; + NODEMASK_SCRATCH(scratch); + if (!scratch) + goto put_mpol; /* contextualize the tmpfs mount point mempolicy */ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask); - if (IS_ERR(new)) { - mpol_put(mpol); /* drop our ref on sb mpol */ - return; /* no valid nodemask intersection */ - } + if (IS_ERR(new)) + goto free_scratch; /* no valid nodemask intersection */ task_lock(current); - ret = mpol_set_nodemask(new, &mpol->w.user_nodemask); + ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch); task_unlock(current); - mpol_put(mpol); /* drop our ref on sb mpol */ - if (ret) { - mpol_put(new); - return; - } + if (ret) + goto put_new; /* Create pseudo-vma that contains just the policy */ memset(&pvma, 0, sizeof(struct vm_area_struct)); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ + +put_new: mpol_put(new); /* drop initial ref */ +free_scratch: + NODEMASK_SCRATCH_FREE(scratch); +put_mpol: + mpol_put(mpol); /* drop our incoming ref on sb mpol */ } } @@ -2028,9 +2289,15 @@ void numa_default_policy(void) * "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag * Used only for mpol_parse_str() and mpol_to_str() */ -#define MPOL_LOCAL (MPOL_INTERLEAVE + 1) -static const char * const policy_types[] = - { "default", "prefer", "bind", "interleave", "local" }; +#define MPOL_LOCAL MPOL_MAX +static const char * const policy_modes[] = +{ + [MPOL_DEFAULT] = "default", + [MPOL_PREFERRED] = "prefer", + [MPOL_BIND] = "bind", + [MPOL_INTERLEAVE] = "interleave", + [MPOL_LOCAL] = "local" +}; #ifdef CONFIG_TMPFS @@ -2055,12 +2322,11 @@ static const char * const policy_types[] = int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) { struct mempolicy *new = NULL; - unsigned short uninitialized_var(mode); + unsigned short mode; unsigned short uninitialized_var(mode_flags); nodemask_t nodes; char *nodelist = strchr(str, ':'); char *flags = strchr(str, '='); - int i; int err = 1; if (nodelist) { @@ -2076,13 +2342,12 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) if (flags) *flags++ = '\0'; /* terminate mode string */ - for (i = 0; i <= MPOL_LOCAL; i++) { - if (!strcmp(str, policy_types[i])) { - mode = i; + for (mode = 0; mode <= MPOL_LOCAL; mode++) { + if (!strcmp(str, policy_modes[mode])) { break; } } - if (i > MPOL_LOCAL) + if (mode > MPOL_LOCAL) goto out; switch (mode) { @@ -2094,8 +2359,8 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) char *rest = nodelist; while (isdigit(*rest)) rest++; - if (!*rest) - err = 0; + if (*rest) + goto out; } break; case MPOL_INTERLEAVE: @@ -2104,7 +2369,6 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) */ if (!nodelist) nodes = node_states[N_HIGH_MEMORY]; - err = 0; break; case MPOL_LOCAL: /* @@ -2114,11 +2378,19 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) goto out; mode = MPOL_PREFERRED; break; - - /* - * case MPOL_BIND: mpol_new() enforces non-empty nodemask. - * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags. - */ + case MPOL_DEFAULT: + /* + * Insist on a empty nodelist + */ + if (!nodelist) + err = 0; + goto out; + case MPOL_BIND: + /* + * Insist on a nodelist + */ + if (!nodelist) + goto out; } mode_flags = 0; @@ -2132,25 +2404,32 @@ int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context) else if (!strcmp(flags, "relative")) mode_flags |= MPOL_F_RELATIVE_NODES; else - err = 1; + goto out; } new = mpol_new(mode, mode_flags, &nodes); if (IS_ERR(new)) - err = 1; - else { - int ret; + goto out; - task_lock(current); - ret = mpol_set_nodemask(new, &nodes); - task_unlock(current); - if (ret) - err = 1; - else if (no_context) { - /* save for contextualization */ - new->w.user_nodemask = nodes; + if (no_context) { + /* save for contextualization */ + new->w.user_nodemask = nodes; + } else { + int ret; + NODEMASK_SCRATCH(scratch); + if (scratch) { + task_lock(current); + ret = mpol_set_nodemask(new, &nodes, scratch); + task_unlock(current); + } else + ret = -ENOMEM; + NODEMASK_SCRATCH_FREE(scratch); + if (ret) { + mpol_put(new); + goto out; } } + err = 0; out: /* Restore string for error message */ @@ -2219,11 +2498,11 @@ int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context) BUG(); } - l = strlen(policy_types[mode]); + l = strlen(policy_modes[mode]); if (buffer + maxlen < p + l + 1) return -ENOSPC; - strcpy(p, policy_types[mode]); + strcpy(p, policy_modes[mode]); p += l; if (flags & MPOL_MODE_FLAGS) { diff --git a/mm/mempool.c b/mm/mempool.c index a46eb1b..1a3bc3d 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -303,18 +303,11 @@ EXPORT_SYMBOL(mempool_free_slab); */ void *mempool_kmalloc(gfp_t gfp_mask, void *pool_data) { - size_t size = (size_t)(long)pool_data; + size_t size = (size_t)pool_data; return kmalloc(size, gfp_mask); } EXPORT_SYMBOL(mempool_kmalloc); -void *mempool_kzalloc(gfp_t gfp_mask, void *pool_data) -{ - size_t size = (size_t) pool_data; - return kzalloc(size, gfp_mask); -} -EXPORT_SYMBOL(mempool_kzalloc); - void mempool_kfree(void *element, void *pool_data) { kfree(element); diff --git a/mm/migrate.c b/mm/migrate.c index 939888f..6ae8a66 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -21,6 +21,7 @@ #include <linux/mm_inline.h> #include <linux/nsproxy.h> #include <linux/pagevec.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/topology.h> #include <linux/cpu.h> @@ -31,6 +32,10 @@ #include <linux/security.h> #include <linux/memcontrol.h> #include <linux/syscalls.h> +#include <linux/hugetlb.h> +#include <linux/gfp.h> + +#include <asm/tlbflush.h> #include "internal.h" @@ -38,7 +43,8 @@ /* * migrate_prep() needs to be called before we start compiling a list of pages - * to be migrated using isolate_lru_page(). + * to be migrated using isolate_lru_page(). If scheduling work on other CPUs is + * undesirable, use migrate_prep_local() */ int migrate_prep(void) { @@ -53,31 +59,36 @@ int migrate_prep(void) return 0; } +/* Do the necessary work of migrate_prep but not if it involves other CPUs */ +int migrate_prep_local(void) +{ + lru_add_drain(); + + return 0; +} + /* * Add isolated pages on the list back to the LRU under page lock * to avoid leaking evictable pages back onto unevictable list. - * - * returns the number of pages put back. */ -int putback_lru_pages(struct list_head *l) +void putback_lru_pages(struct list_head *l) { struct page *page; struct page *page2; - int count = 0; list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); - count++; } - return count; } /* * Restore a potential migration pte to a working pte entry */ -static void remove_migration_pte(struct vm_area_struct *vma, - struct page *old, struct page *new) +static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, + unsigned long addr, void *old) { struct mm_struct *mm = vma->vm_mm; swp_entry_t entry; @@ -86,107 +97,73 @@ static void remove_migration_pte(struct vm_area_struct *vma, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long addr = page_address_in_vma(new, vma); - if (addr == -EFAULT) - return; + if (unlikely(PageHuge(new))) { + ptep = huge_pte_offset(mm, addr); + if (!ptep) + goto out; + ptl = &mm->page_table_lock; + } else { + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - return; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - return; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - return; + ptep = pte_offset_map(pmd, addr); - ptep = pte_offset_map(pmd, addr); + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + goto out; + } - if (!is_swap_pte(*ptep)) { - pte_unmap(ptep); - return; - } + ptl = pte_lockptr(mm, pmd); + } - ptl = pte_lockptr(mm, pmd); spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) - goto out; + goto unlock; entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry) || migration_entry_to_page(entry) != old) - goto out; + if (!is_migration_entry(entry) || + migration_entry_to_page(entry) != old) + goto unlock; get_page(new); pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); +#ifdef CONFIG_HUGETLB_PAGE + if (PageHuge(new)) + pte = pte_mkhuge(pte); +#endif flush_cache_page(vma, addr, pte_pfn(pte)); set_pte_at(mm, addr, ptep, pte); - if (PageAnon(new)) + if (PageHuge(new)) { + if (PageAnon(new)) + hugepage_add_anon_rmap(new, vma, addr); + else + page_dup_rmap(new); + } else if (PageAnon(new)) page_add_anon_rmap(new, vma, addr); else page_add_file_rmap(new); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, addr, pte); - -out: + update_mmu_cache(vma, addr, ptep); +unlock: pte_unmap_unlock(ptep, ptl); -} - -/* - * Note that remove_file_migration_ptes will only work on regular mappings, - * Nonlinear mappings do not use migration entries. - */ -static void remove_file_migration_ptes(struct page *old, struct page *new) -{ - struct vm_area_struct *vma; - struct address_space *mapping = page_mapping(new); - struct prio_tree_iter iter; - pgoff_t pgoff = new->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - if (!mapping) - return; - - spin_lock(&mapping->i_mmap_lock); - - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) - remove_migration_pte(vma, old, new); - - spin_unlock(&mapping->i_mmap_lock); -} - -/* - * Must hold mmap_sem lock on at least one of the vmas containing - * the page so that the anon_vma cannot vanish. - */ -static void remove_anon_migration_ptes(struct page *old, struct page *new) -{ - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned long mapping; - - mapping = (unsigned long)new->mapping; - - if (!mapping || (mapping & PAGE_MAPPING_ANON) == 0) - return; - - /* - * We hold the mmap_sem lock. So no need to call page_lock_anon_vma. - */ - anon_vma = (struct anon_vma *) (mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) - remove_migration_pte(vma, old, new); - - spin_unlock(&anon_vma->lock); +out: + return SWAP_AGAIN; } /* @@ -195,10 +172,7 @@ static void remove_anon_migration_ptes(struct page *old, struct page *new) */ static void remove_migration_ptes(struct page *old, struct page *new) { - if (PageAnon(new)) - remove_anon_migration_ptes(old, new); - else - remove_file_migration_ptes(old, new); + rmap_walk(new, remove_migration_pte, old); } /* @@ -270,7 +244,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, pslot = radix_tree_lookup_slot(&mapping->page_tree, page_index(page)); - expected_count = 2 + !!page_has_private(page); + expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || (struct page *)radix_tree_deref_slot(pslot) != page) { spin_unlock_irq(&mapping->tree_lock); @@ -312,20 +286,69 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ __dec_zone_page_state(page, NR_FILE_PAGES); __inc_zone_page_state(newpage, NR_FILE_PAGES); - + if (PageSwapBacked(page)) { + __dec_zone_page_state(page, NR_SHMEM); + __inc_zone_page_state(newpage, NR_SHMEM); + } spin_unlock_irq(&mapping->tree_lock); return 0; } /* - * Copy the page to its new location + * The expected number of remaining references is the same as that + * of migrate_page_move_mapping(). */ -static void migrate_page_copy(struct page *newpage, struct page *page) +int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) { - int anon; + int expected_count; + void **pslot; + + if (!mapping) { + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } + + spin_lock_irq(&mapping->tree_lock); + + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(page)); + + expected_count = 2 + page_has_private(page); + if (page_count(page) != expected_count || + (struct page *)radix_tree_deref_slot(pslot) != page) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + get_page(newpage); + + radix_tree_replace_slot(pslot, newpage); + + page_unfreeze_refs(page, expected_count); - copy_highpage(newpage, page); + __put_page(page); + + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* + * Copy the page to its new location + */ +void migrate_page_copy(struct page *newpage, struct page *page) +{ + if (PageHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); if (PageError(page)) SetPageError(newpage); @@ -336,8 +359,8 @@ static void migrate_page_copy(struct page *newpage, struct page *page) if (TestClearPageActive(page)) { VM_BUG_ON(PageUnevictable(page)); SetPageActive(newpage); - } else - unevictable_migrate_page(newpage, page); + } else if (TestClearPageUnevictable(page)) + SetPageUnevictable(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -356,12 +379,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) } mlock_migrate_page(newpage, page); + ksm_migrate_page(newpage, page); ClearPageSwapCache(page); ClearPagePrivate(page); set_page_private(page, 0); - /* page->mapping contains a flag for PageAnon() */ - anon = PageAnon(page); page->mapping = NULL; /* @@ -477,7 +499,6 @@ static int writeout(struct address_space *mapping, struct page *page) .nr_to_write = 1, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1 }; int rc; @@ -540,7 +561,8 @@ static int fallback_migrate_page(struct address_space *mapping, * < 0 - error code * == 0 - success */ -static int move_to_new_page(struct page *newpage, struct page *page) +static int move_to_new_page(struct page *newpage, struct page *page, + int remap_swapcache) { struct address_space *mapping; int rc; @@ -575,10 +597,12 @@ static int move_to_new_page(struct page *newpage, struct page *page) else rc = fallback_migrate_page(mapping, newpage, page); - if (!rc) { - remove_migration_ptes(page, newpage); - } else + if (rc) { newpage->mapping = NULL; + } else { + if (remap_swapcache) + remove_migration_ptes(page, newpage); + } unlock_page(newpage); @@ -590,14 +614,16 @@ static int move_to_new_page(struct page *newpage, struct page *page) * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force) + struct page *page, int force, int offlining) { int rc = 0; int *result = NULL; struct page *newpage = get_new_page(page, private, &result); + int remap_swapcache = 1; int rcu_locked = 0; int charge = 0; - struct mem_cgroup *mem; + struct mem_cgroup *mem = NULL; + struct anon_vma *anon_vma = NULL; if (!newpage) return -ENOMEM; @@ -616,8 +642,22 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, lock_page(page); } + /* + * Only memory hotplug's offline_pages() caller has locked out KSM, + * and can safely migrate a KSM page. The other cases have skipped + * PageKsm along with PageReserved - but it is only now when we have + * the page lock that we can be certain it will not go KSM beneath us + * (KSM will not upgrade a page from PageAnon to PageKsm when it sees + * its pagecount raised, but only here do we take the page lock which + * serializes that). + */ + if (PageKsm(page) && !offlining) { + rc = -EBUSY; + goto unlock; + } + /* charge against new page */ - charge = mem_cgroup_prepare_migration(page, &mem); + charge = mem_cgroup_prepare_migration(page, newpage, &mem); if (charge == -ENOMEM) { rc = -ENOMEM; goto unlock; @@ -640,6 +680,34 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (PageAnon(page)) { rcu_read_lock(); rcu_locked = 1; + + /* Determine how to safely use anon_vma */ + if (!page_mapped(page)) { + if (!PageSwapCache(page)) + goto rcu_unlock; + + /* + * We cannot be sure that the anon_vma of an unmapped + * swapcache page is safe to use because we don't + * know in advance if the VMA that this page belonged + * to still exists. If the VMA and others sharing the + * data have been freed, then the anon_vma could + * already be invalid. + * + * To avoid this possibility, swapcache pages get + * migrated but are not remapped when migration + * completes + */ + remap_swapcache = 0; + } else { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + anon_vma = page_anon_vma(page); + get_anon_vma(anon_vma); + } } /* @@ -664,19 +732,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * needs to be effective. */ try_to_free_buffers(page); + goto rcu_unlock; } - goto rcu_unlock; + goto skip_unmap; } /* Establish migration ptes or remove ptes */ - try_to_unmap(page, 1); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); +skip_unmap: if (!page_mapped(page)) - rc = move_to_new_page(newpage, page); + rc = move_to_new_page(newpage, page, remap_swapcache); - if (rc) + if (rc && remap_swapcache) remove_migration_ptes(page, page); rcu_unlock: + + /* Drop an anon_vma reference if we took one */ + if (anon_vma) + drop_anon_vma(anon_vma); + if (rcu_locked) rcu_read_unlock(); uncharge: @@ -693,6 +768,8 @@ unlock: * restored. */ list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); putback_lru_page(page); } @@ -714,6 +791,92 @@ move_newpage: } /* + * Counterpart of unmap_and_move_page() for hugepage migration. + * + * This function doesn't wait the completion of hugepage I/O + * because there is no race between I/O and migration for hugepage. + * Note that currently hugepage I/O occurs only in direct I/O + * where no lock is held and PG_writeback is irrelevant, + * and writeback status of all subpages are counted in the reference + * count of the head page (i.e. if all subpages of a 2MB hugepage are + * under direct I/O, the reference of the head page is 512 and a bit more.) + * This means that when we try to migrate hugepage whose subpages are + * doing direct I/O, some references remain after try_to_unmap() and + * hugepage migration fails without data corruption. + * + * There is also no race when direct I/O is issued on the page under migration, + * because then pte is replaced with migration swap entry and direct I/O code + * will wait in the page fault for migration to complete. + */ +static int unmap_and_move_huge_page(new_page_t get_new_page, + unsigned long private, struct page *hpage, + int force, int offlining) +{ + int rc = 0; + int *result = NULL; + struct page *new_hpage = get_new_page(hpage, private, &result); + int rcu_locked = 0; + struct anon_vma *anon_vma = NULL; + + if (!new_hpage) + return -ENOMEM; + + rc = -EAGAIN; + + if (!trylock_page(hpage)) { + if (!force) + goto out; + lock_page(hpage); + } + + if (PageAnon(hpage)) { + rcu_read_lock(); + rcu_locked = 1; + + if (page_mapped(hpage)) { + anon_vma = page_anon_vma(hpage); + atomic_inc(&anon_vma->external_refcount); + } + } + + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + + if (!page_mapped(hpage)) + rc = move_to_new_page(new_hpage, hpage, 1); + + if (rc) + remove_migration_ptes(hpage, hpage); + + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, + &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } + + if (rcu_locked) + rcu_read_unlock(); +out: + unlock_page(hpage); + + if (rc != -EAGAIN) { + list_del(&hpage->lru); + put_page(hpage); + } + + put_page(new_hpage); + + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(new_hpage); + } + return rc; +} + +/* * migrate_pages * * The function takes one list of pages to migrate and a function @@ -722,13 +885,14 @@ move_newpage: * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. All pages will be - * returned to the LRU or freed. + * or no retryable pages exist anymore. + * Caller should call putback_lru_pages to return pages to the LRU + * or free list. * * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private) + new_page_t get_new_page, unsigned long private, int offlining) { int retry = 1; int nr_failed = 0; @@ -748,7 +912,7 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2); + page, pass > 2, offlining); switch(rc) { case -ENOMEM: @@ -770,7 +934,51 @@ out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - putback_lru_pages(from); + if (rc) + return rc; + + return nr_failed + retry; +} + +int migrate_huge_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private, int offlining) +{ + int retry = 1; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int rc; + + for (pass = 0; pass < 10 && retry; pass++) { + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + cond_resched(); + + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, offlining); + + switch(rc) { + case -ENOMEM: + goto out; + case -EAGAIN: + retry++; + break; + case 0: + break; + default: + /* Permanent failure */ + nr_failed++; + break; + } + } + } + rc = 0; +out: + + list_for_each_entry_safe(page, page2, from, lru) + put_page(page); if (rc) return rc; @@ -831,7 +1039,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, err = -EFAULT; vma = find_vma(mm, pp->addr); - if (!vma || !vma_migratable(vma)) + if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; page = follow_page(vma, pp->addr, FOLL_GET); @@ -844,7 +1052,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!page) goto set_status; - if (PageReserved(page)) /* Check for zero page */ + /* Use PageReserved to check for zero page */ + if (PageReserved(page) || PageKsm(page)) goto put_and_set; pp->page = page; @@ -862,8 +1071,11 @@ static int do_move_page_to_node_array(struct mm_struct *mm, goto put_and_set; err = isolate_lru_page(page); - if (!err) + if (!err) { list_add_tail(&page->lru, &pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + } put_and_set: /* * Either remove the duplicate refcount from @@ -876,9 +1088,12 @@ set_status: } err = 0; - if (!list_empty(&pagelist)) + if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm); + (unsigned long)pm, 0); + if (err) + putback_lru_pages(&pagelist); + } up_read(&mm->mmap_sem); return err; @@ -937,6 +1152,9 @@ static int do_pages_move(struct mm_struct *mm, struct task_struct *task, goto out_pm; err = -ENODEV; + if (node < 0 || node >= MAX_NUMNODES) + goto out_pm; + if (!node_state(node, N_HIGH_MEMORY)) goto out_pm; @@ -988,7 +1206,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, int err = -EFAULT; vma = find_vma(mm, addr); - if (!vma) + if (!vma || addr < vma->vm_start) goto set_status; page = follow_page(vma, addr, 0); @@ -999,7 +1217,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, err = -ENOENT; /* Use PageReserved to check for zero page */ - if (!page || PageReserved(page)) + if (!page || PageReserved(page) || PageKsm(page)) goto set_status; err = page_to_nid(page); @@ -1024,33 +1242,27 @@ static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, #define DO_PAGES_STAT_CHUNK_NR 16 const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; int chunk_status[DO_PAGES_STAT_CHUNK_NR]; - unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; - int err; - for (i = 0; i < nr_pages; i += chunk_nr) { - if (chunk_nr + i > nr_pages) - chunk_nr = nr_pages - i; + while (nr_pages) { + unsigned long chunk_nr; - err = copy_from_user(chunk_pages, &pages[i], - chunk_nr * sizeof(*chunk_pages)); - if (err) { - err = -EFAULT; - goto out; - } + chunk_nr = nr_pages; + if (chunk_nr > DO_PAGES_STAT_CHUNK_NR) + chunk_nr = DO_PAGES_STAT_CHUNK_NR; + + if (copy_from_user(chunk_pages, pages, chunk_nr * sizeof(*chunk_pages))) + break; do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); - err = copy_to_user(&status[i], chunk_status, - chunk_nr * sizeof(*chunk_status)); - if (err) { - err = -EFAULT; - goto out; - } - } - err = 0; + if (copy_to_user(status, chunk_status, chunk_nr * sizeof(*status))) + break; -out: - return err; + pages += chunk_nr; + status += chunk_nr; + nr_pages -= chunk_nr; + } + return nr_pages ? -EFAULT : 0; } /* diff --git a/mm/mincore.c b/mm/mincore.c index 8cb508f..9ac42dc 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -7,17 +7,52 @@ /* * The mincore() system call. */ -#include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/mman.h> #include <linux/syscalls.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/hugetlb.h> #include <asm/uaccess.h> #include <asm/pgtable.h> +static void mincore_hugetlb_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ +#ifdef CONFIG_HUGETLB_PAGE + struct hstate *h; + + h = hstate_vma(vma); + while (1) { + unsigned char present; + pte_t *ptep; + /* + * Huge pages are always in RAM for now, but + * theoretically it needs to be checked. + */ + ptep = huge_pte_offset(current->mm, + addr & huge_page_mask(h)); + present = ptep && !huge_pte_none(huge_ptep_get(ptep)); + while (1) { + *vec = present; + vec++; + addr += PAGE_SIZE; + if (addr == end) + return; + /* check hugepage border */ + if (!(addr & ~huge_page_mask(h))) + break; + } + } +#else + BUG(); +#endif +} + /* * Later we can get more picky about what "in core" means precisely. * For now, simply check to see if the page is in the page cache, @@ -48,109 +83,150 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff) return present; } -/* - * Do a chunk of "sys_mincore()". We've already checked - * all the arguments, we hold the mmap semaphore: we should - * just return the amount of info we're asked for. - */ -static long do_mincore(unsigned long addr, unsigned char *vec, unsigned long pages) +static void mincore_unmapped_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep; - spinlock_t *ptl; - unsigned long nr; + unsigned long nr = (end - addr) >> PAGE_SHIFT; int i; - pgoff_t pgoff; - struct vm_area_struct *vma = find_vma(current->mm, addr); - - /* - * find_vma() didn't find anything above us, or we're - * in an unmapped hole in the address space: ENOMEM. - */ - if (!vma || addr < vma->vm_start) - return -ENOMEM; - - /* - * Calculate how many pages there are left in the last level of the - * PTE array for our address. - */ - nr = PTRS_PER_PTE - ((addr >> PAGE_SHIFT) & (PTRS_PER_PTE-1)); - /* - * Don't overrun this vma - */ - nr = min(nr, (vma->vm_end - addr) >> PAGE_SHIFT); + if (vma->vm_file) { + pgoff_t pgoff; - /* - * Don't return more than the caller asked for - */ - nr = min(nr, pages); + pgoff = linear_page_index(vma, addr); + for (i = 0; i < nr; i++, pgoff++) + vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); + } else { + for (i = 0; i < nr; i++) + vec[i] = 0; + } +} - pgd = pgd_offset(vma->vm_mm, addr); - if (pgd_none_or_clear_bad(pgd)) - goto none_mapped; - pud = pud_offset(pgd, addr); - if (pud_none_or_clear_bad(pud)) - goto none_mapped; - pmd = pmd_offset(pud, addr); - if (pmd_none_or_clear_bad(pmd)) - goto none_mapped; +static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + spinlock_t *ptl; + pte_t *ptep; ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - for (i = 0; i < nr; i++, ptep++, addr += PAGE_SIZE) { - unsigned char present; + do { pte_t pte = *ptep; + pgoff_t pgoff; - if (pte_present(pte)) { - present = 1; - - } else if (pte_none(pte)) { - if (vma->vm_file) { - pgoff = linear_page_index(vma, addr); - present = mincore_page(vma->vm_file->f_mapping, - pgoff); - } else - present = 0; - - } else if (pte_file(pte)) { + next = addr + PAGE_SIZE; + if (pte_none(pte)) + mincore_unmapped_range(vma, addr, next, vec); + else if (pte_present(pte)) + *vec = 1; + else if (pte_file(pte)) { pgoff = pte_to_pgoff(pte); - present = mincore_page(vma->vm_file->f_mapping, pgoff); - + *vec = mincore_page(vma->vm_file->f_mapping, pgoff); } else { /* pte is a swap entry */ swp_entry_t entry = pte_to_swp_entry(pte); + if (is_migration_entry(entry)) { /* migration entries are always uptodate */ - present = 1; + *vec = 1; } else { #ifdef CONFIG_SWAP pgoff = entry.val; - present = mincore_page(&swapper_space, pgoff); + *vec = mincore_page(&swapper_space, pgoff); #else WARN_ON(1); - present = 1; + *vec = 1; #endif } } + vec++; + } while (ptep++, addr = next, addr != end); + pte_unmap_unlock(ptep - 1, ptl); +} - vec[i] = present; - } - pte_unmap_unlock(ptep-1, ptl); +static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pmd_t *pmd; - return nr; + pmd = pmd_offset(pud, addr); + do { + next = pmd_addr_end(addr, end); + if (pmd_none_or_clear_bad(pmd)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pte_range(vma, pmd, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pmd++, addr = next, addr != end); +} -none_mapped: - if (vma->vm_file) { - pgoff = linear_page_index(vma, addr); - for (i = 0; i < nr; i++, pgoff++) - vec[i] = mincore_page(vma->vm_file->f_mapping, pgoff); - } else { - for (i = 0; i < nr; i++) - vec[i] = 0; +static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pud_t *pud; + + pud = pud_offset(pgd, addr); + do { + next = pud_addr_end(addr, end); + if (pud_none_or_clear_bad(pud)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pmd_range(vma, pud, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pud++, addr = next, addr != end); +} + +static void mincore_page_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + unsigned long next; + pgd_t *pgd; + + pgd = pgd_offset(vma->vm_mm, addr); + do { + next = pgd_addr_end(addr, end); + if (pgd_none_or_clear_bad(pgd)) + mincore_unmapped_range(vma, addr, next, vec); + else + mincore_pud_range(vma, pgd, addr, next, vec); + vec += (next - addr) >> PAGE_SHIFT; + } while (pgd++, addr = next, addr != end); +} + +/* + * Do a chunk of "sys_mincore()". We've already checked + * all the arguments, we hold the mmap semaphore: we should + * just return the amount of info we're asked for. + */ +static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *vec) +{ + struct vm_area_struct *vma; + unsigned long end; + + vma = find_vma(current->mm, addr); + if (!vma || addr < vma->vm_start) + return -ENOMEM; + + end = min(vma->vm_end, addr + (pages << PAGE_SHIFT)); + + if (is_vm_hugetlb_page(vma)) { + mincore_hugetlb_page_range(vma, addr, end, vec); + return (end - addr) >> PAGE_SHIFT; } - return nr; + end = pmd_addr_end(addr, end); + + if (is_vm_hugetlb_page(vma)) + mincore_hugetlb_page_range(vma, addr, end, vec); + else + mincore_page_range(vma, addr, end, vec); + + return (end - addr) >> PAGE_SHIFT; } /* @@ -210,7 +286,7 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, * the temporary buffer size. */ down_read(¤t->mm->mmap_sem); - retval = do_mincore(start, tmp, min(pages, PAGE_SIZE)); + retval = do_mincore(start, min(pages, PAGE_SIZE), tmp); up_read(¤t->mm->mmap_sem); if (retval <= 0) @@ -25,7 +25,7 @@ int can_do_mlock(void) { if (capable(CAP_IPC_LOCK)) return 1; - if (current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur != 0) + if (rlimit(RLIMIT_MEMLOCK) != 0) return 1; return 0; } @@ -88,25 +88,22 @@ void mlock_vma_page(struct page *page) } } -/* - * called from munlock()/munmap() path with page supposedly on the LRU. +/** + * munlock_vma_page - munlock a vma page + * @page - page to be unlocked * - * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked - * [in try_to_munlock()] and then attempt to isolate the page. We must - * isolate the page to keep others from messing with its unevictable - * and mlocked state while trying to munlock. However, we pre-clear the - * mlocked state anyway as we might lose the isolation race and we might - * not get another chance to clear PageMlocked. If we successfully - * isolate the page and try_to_munlock() detects other VM_LOCKED vmas - * mapping the page, it will restore the PageMlocked state, unless the page - * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), - * perhaps redundantly. - * If we lose the isolation race, and the page is mapped by other VM_LOCKED - * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() - * either of which will restore the PageMlocked state by calling - * mlock_vma_page() above, if it can grab the vma's mmap sem. + * called from munlock()/munmap() path with page supposedly on the LRU. + * When we munlock a page, because the vma where we found the page is being + * munlock()ed or munmap()ed, we want to check whether other vmas hold the + * page locked so that we can leave it on the unevictable lru list and not + * bother vmscan with it. However, to walk the page's rmap list in + * try_to_munlock() we must isolate the page from the LRU. If some other + * task has removed the page from the LRU, we won't be able to do that. + * So we clear the PageMlocked as we might not get another chance. If we + * can't isolate the page, we leave it for putback_lru_page() and vmscan + * [page_referenced()/try_to_unmap()] to deal with. */ -static void munlock_vma_page(struct page *page) +void munlock_vma_page(struct page *page) { BUG_ON(!PageLocked(page)); @@ -117,18 +114,18 @@ static void munlock_vma_page(struct page *page) /* * did try_to_unlock() succeed or punt? */ - if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + if (ret != SWAP_MLOCK) count_vm_event(UNEVICTABLE_PGMUNLOCKED); putback_lru_page(page); } else { /* - * We lost the race. let try_to_unmap() deal - * with it. At least we get the page state and - * mlock stats right. However, page is still on - * the noreclaim list. We'll fix that up when - * the page is eventually freed or we scan the - * noreclaim list. + * Some other task has removed the page from the LRU. + * putback_lru_page() will take care of removing the + * page from the unevictable list, if necessary. + * vmscan [page_referenced()] will move the page back + * to the unevictable list if some other vma has it + * mlocked. */ if (PageUnevictable(page)) count_vm_event(UNEVICTABLE_PGSTRANDED); @@ -138,50 +135,50 @@ static void munlock_vma_page(struct page *page) } } +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return (vma->vm_flags & VM_GROWSDOWN) && + (vma->vm_start == addr) && + !vma_stack_continue(vma->vm_prev, addr); +} + /** - * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. + * __mlock_vma_pages_range() - mlock a range of pages in the vma. * @vma: target vma * @start: start address * @end: end address - * @mlock: 0 indicate munlock, otherwise mlock. * - * If @mlock == 0, unlock an mlocked range; - * else mlock the range of pages. This takes care of making the pages present , - * too. + * This takes care of making the pages present too. * * return 0 on success, negative error code on error. * * vma->vm_mm->mmap_sem must be held for at least read. */ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - int mlock) + unsigned long start, unsigned long end) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; int ret = 0; - int gup_flags = 0; + int gup_flags; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); VM_BUG_ON(start < vma->vm_start); VM_BUG_ON(end > vma->vm_end); - VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && - (atomic_read(&mm->mm_users) != 0)); - - /* - * mlock: don't page populate if vma has PROT_NONE permission. - * munlock: always do munlock although the vma has PROT_NONE - * permission, or SIGKILL is pending. - */ - if (!mlock) - gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS | - GUP_FLAGS_IGNORE_SIGKILL; + VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); + gup_flags = FOLL_TOUCH | FOLL_GET; if (vma->vm_flags & VM_WRITE) - gup_flags |= GUP_FLAGS_WRITE; + gup_flags |= FOLL_WRITE; + + /* We don't try to access the guard page of a stack vma */ + if (stack_guard_page(vma, start)) { + addr += PAGE_SIZE; + nr_pages--; + } while (nr_pages > 0) { int i; @@ -201,51 +198,45 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, * This can happen for, e.g., VM_NONLINEAR regions before * a page has been allocated and mapped at a given offset, * or for addresses that map beyond end of a file. - * We'll mlock the the pages if/when they get faulted in. + * We'll mlock the pages if/when they get faulted in. */ if (ret < 0) break; - if (ret == 0) { - /* - * We know the vma is there, so the only time - * we cannot get a single page should be an - * error (ret < 0) case. - */ - WARN_ON(1); - break; - } lru_add_drain(); /* push cached pages to LRU */ for (i = 0; i < ret; i++) { struct page *page = pages[i]; - lock_page(page); - /* - * Because we lock page here and migration is blocked - * by the elevated reference, we need only check for - * page truncation (file-cache only). - */ if (page->mapping) { - if (mlock) + /* + * That preliminary check is mainly to avoid + * the pointless overhead of lock_page on the + * ZERO_PAGE: which might bounce very badly if + * there is contention. However, we're still + * dirtying its cacheline with get/put_page: + * we'll add another __get_user_pages flag to + * avoid it if that case turns out to matter. + */ + lock_page(page); + /* + * Because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) mlock_vma_page(page); - else - munlock_vma_page(page); + unlock_page(page); } - unlock_page(page); - put_page(page); /* ref from get_user_pages() */ - - /* - * here we assume that get_user_pages() has given us - * a list of virtually contiguous pages. - */ - addr += PAGE_SIZE; /* for next get_user_pages() */ - nr_pages--; + put_page(page); /* ref from get_user_pages() */ } + + addr += ret * PAGE_SIZE; + nr_pages -= ret; ret = 0; } - return ret; /* count entire vma as locked_vm */ + return ret; /* 0 or negative error code */ } /* @@ -289,7 +280,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end, 1); + __mlock_vma_pages_range(vma, start, end); /* Hide errors from mmap() and other callers */ return 0; @@ -310,7 +301,6 @@ no_mlock: return nr_pages; /* error or pages NOT mlocked */ } - /* * munlock_vma_pages_range() - munlock all pages in the vma range.' * @vma - vma containing range to be munlock()ed. @@ -330,10 +320,38 @@ no_mlock: * free them. This will result in freeing mlocked pages. */ void munlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end) { + unsigned long addr; + + lru_add_drain(); vma->vm_flags &= ~VM_LOCKED; - __mlock_vma_pages_range(vma, start, end, 0); + + for (addr = start; addr < end; addr += PAGE_SIZE) { + struct page *page; + /* + * Although FOLL_DUMP is intended for get_dump_page(), + * it just so happens that its special treatment of the + * ZERO_PAGE (returning an error instead of doing get_page) + * suits munlock very well (and if somehow an abnormal page + * has sneaked into the range, we won't oops here: great). + */ + page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); + if (page && !IS_ERR(page)) { + lock_page(page); + /* + * Like in __mlock_vma_pages_range(), + * because we lock page here and migration is + * blocked by the elevated reference, we need + * only check for file-cache page truncation. + */ + if (page->mapping) + munlock_vma_page(page); + unlock_page(page); + put_page(page); + } + cond_resched(); + } } /* @@ -400,18 +418,14 @@ success: * It's okay if try_to_unmap_one unmaps a page just after we * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - vma->vm_flags = newflags; if (lock) { - ret = __mlock_vma_pages_range(vma, start, end, 1); - - if (ret > 0) { - mm->locked_vm -= ret; - ret = 0; - } else - ret = __mlock_posix_error_return(ret); /* translate if needed */ + vma->vm_flags = newflags; + ret = __mlock_vma_pages_range(vma, start, end); + if (ret < 0) + ret = __mlock_posix_error_return(ret); } else { - __mlock_vma_pages_range(vma, start, end, 0); + munlock_vma_pages_range(vma, start, end); } out: @@ -486,7 +500,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) locked = len >> PAGE_SHIFT; locked += current->mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; /* check against resource limits */ @@ -549,7 +563,7 @@ SYSCALL_DEFINE1(mlockall, int, flags) down_write(¤t->mm->mmap_sem); - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; ret = -ENOMEM; @@ -583,7 +597,7 @@ int user_shm_lock(size_t size, struct user_struct *user) int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); if (lock_limit == RLIM_INFINITY) allowed = 1; lock_limit >>= PAGE_SHIFT; @@ -606,44 +620,3 @@ void user_shm_unlock(size_t size, struct user_struct *user) spin_unlock(&shmlock_user_lock); free_uid(user); } - -int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, - size_t size) -{ - unsigned long lim, vm, pgsz; - int error = -ENOMEM; - - pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(&mm->mmap_sem); - - lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = mm->total_vm + pgsz; - if (lim < vm) - goto out; - - lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = mm->locked_vm + pgsz; - if (lim < vm) - goto out; - - mm->total_vm += pgsz; - mm->locked_vm += pgsz; - - error = 0; - out: - up_write(&mm->mmap_sem); - return error; -} - -void refund_locked_memory(struct mm_struct *mm, size_t size) -{ - unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - - down_write(&mm->mmap_sem); - - mm->total_vm -= pgsz; - mm->locked_vm -= pgsz; - - up_write(&mm->mmap_sem); -} @@ -20,7 +20,6 @@ #include <linux/fs.h> #include <linux/personality.h> #include <linux/security.h> -#include <linux/ima.h> #include <linux/hugetlb.h> #include <linux/profile.h> #include <linux/module.h> @@ -28,7 +27,8 @@ #include <linux/mempolicy.h> #include <linux/rmap.h> #include <linux/mmu_notifier.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> +#include <linux/audit.h> #include <asm/uaccess.h> #include <asm/cacheflush.h> @@ -88,9 +88,6 @@ int sysctl_overcommit_ratio = 50; /* default is 50% */ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT; struct percpu_counter vm_committed_as; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - /* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to @@ -269,7 +266,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * segment grow beyond its set limit the in case where the limit is * not page aligned -Ram Gupta */ - rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; + rlim = rlimit(RLIMIT_DATA); if (rlim < RLIM_INFINITY && (brk - mm->start_brk) + (mm->end_data - mm->start_data) > rlim) goto out; @@ -392,17 +389,23 @@ static inline void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent) { + struct vm_area_struct *next; + + vma->vm_prev = prev; if (prev) { - vma->vm_next = prev->vm_next; + next = prev->vm_next; prev->vm_next = vma; } else { mm->mmap = vma; if (rb_parent) - vma->vm_next = rb_entry(rb_parent, + next = rb_entry(rb_parent, struct vm_area_struct, vm_rb); else - vma->vm_next = NULL; + next = NULL; } + vma->vm_next = next; + if (next) + next->vm_prev = vma; } void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, @@ -441,7 +444,6 @@ __vma_link(struct mm_struct *mm, struct vm_area_struct *vma, { __vma_link_list(mm, vma, prev, rb_parent); __vma_link_rb(mm, vma, rb_link, rb_parent); - __anon_vma_link(vma); } static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, @@ -457,12 +459,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mapping->i_mmap_lock); vma->vm_truncate_count = mapping->truncate_count; } - anon_vma_lock(vma); __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); - anon_vma_unlock(vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -490,7 +490,11 @@ static inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - prev->vm_next = vma->vm_next; + struct vm_area_struct *next = vma->vm_next; + + prev->vm_next = next; + if (next) + next->vm_prev = prev; rb_erase(&vma->vm_rb, &mm->mm_rb); if (mm->mmap_cache == vma) mm->mmap_cache = prev; @@ -503,7 +507,7 @@ __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, * are necessary. The "insert" vma (if any) is to be inserted * before we drop the necessary locks. */ -void vma_adjust(struct vm_area_struct *vma, unsigned long start, +int vma_adjust(struct vm_area_struct *vma, unsigned long start, unsigned long end, pgoff_t pgoff, struct vm_area_struct *insert) { struct mm_struct *mm = vma->vm_mm; @@ -511,12 +515,14 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *importer = NULL; struct address_space *mapping = NULL; struct prio_tree_root *root = NULL; - struct file *file = vma->vm_file; struct anon_vma *anon_vma = NULL; + struct file *file = vma->vm_file; long adjust_next = 0; int remove_next = 0; if (next && !insert) { + struct vm_area_struct *exporter = NULL; + if (end >= next->vm_end) { /* * vma expands, overlapping all the next, and @@ -524,7 +530,7 @@ void vma_adjust(struct vm_area_struct *vma, unsigned long start, */ again: remove_next = 1 + (end > next->vm_end); end = next->vm_end; - anon_vma = next->anon_vma; + exporter = next; importer = vma; } else if (end > next->vm_start) { /* @@ -532,7 +538,7 @@ again: remove_next = 1 + (end > next->vm_end); * mprotect case 5 shifting the boundary up. */ adjust_next = (end - next->vm_start) >> PAGE_SHIFT; - anon_vma = next->anon_vma; + exporter = next; importer = vma; } else if (end < vma->vm_end) { /* @@ -541,9 +547,20 @@ again: remove_next = 1 + (end > next->vm_end); * mprotect case 4 shifting the boundary down. */ adjust_next = - ((vma->vm_end - end) >> PAGE_SHIFT); - anon_vma = next->anon_vma; + exporter = vma; importer = next; } + + /* + * Easily overlooked: when mprotect shifts the boundary, + * make sure the expanding vma has anon_vma set if the + * shrinking vma had, to cover any anon pages imported. + */ + if (exporter && exporter->anon_vma && !importer->anon_vma) { + if (anon_vma_clone(importer, exporter)) + return -ENOMEM; + importer->anon_vma = exporter->anon_vma; + } } if (file) { @@ -572,22 +589,14 @@ again: remove_next = 1 + (end > next->vm_end); } /* - * When changing only vma->vm_end, we don't really need - * anon_vma lock: but is that case worth optimizing out? + * When changing only vma->vm_end, we don't really need anon_vma + * lock. This is a fairly rare case by itself, but the anon_vma + * lock may be shared between many sibling processes. Skipping + * the lock for brk adjustments makes a difference sometimes. */ - if (vma->anon_vma) + if (vma->anon_vma && (insert || importer || start != vma->vm_start)) { anon_vma = vma->anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - /* - * Easily overlooked: when mprotect shifts the boundary, - * make sure the expanding vma has anon_vma set if the - * shrinking vma had, to cover any anon pages imported. - */ - if (importer && !importer->anon_vma) { - importer->anon_vma = anon_vma; - __anon_vma_link(importer); - } + anon_vma_lock(anon_vma); } if (root) { @@ -620,8 +629,6 @@ again: remove_next = 1 + (end > next->vm_end); __vma_unlink(mm, next, vma); if (file) __remove_shared_vm_struct(next, file, mapping); - if (next->anon_vma) - __anon_vma_merge(vma, next); } else if (insert) { /* * split_vma has split insert from vma, and needs @@ -632,7 +639,7 @@ again: remove_next = 1 + (end > next->vm_end); } if (anon_vma) - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -642,6 +649,8 @@ again: remove_next = 1 + (end > next->vm_end); if (next->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(mm); } + if (next->anon_vma) + anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); @@ -657,10 +666,9 @@ again: remove_next = 1 + (end > next->vm_end); } validate_mm(mm); -} -/* Flags that can be inherited from an existing mapping when merging */ -#define VM_MERGEABLE_FLAGS (VM_CAN_NONLINEAR) + return 0; +} /* * If the vma has a ->close operation then the driver probably needs to release @@ -669,7 +677,8 @@ again: remove_next = 1 + (end > next->vm_end); static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { - if ((vma->vm_flags ^ vm_flags) & ~VM_MERGEABLE_FLAGS) + /* VM_CAN_NONLINEAR may get set later by f_op->mmap() */ + if ((vma->vm_flags ^ vm_flags) & ~VM_CAN_NONLINEAR) return 0; if (vma->vm_file != file) return 0; @@ -765,6 +774,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; + int err; /* * We later require that vma->vm_flags == vm_flags, @@ -798,11 +808,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, is_mergeable_anon_vma(prev->anon_vma, next->anon_vma)) { /* cases 1, 6 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, next->vm_end, prev->vm_pgoff, NULL); } else /* cases 2, 5, 7 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, end, prev->vm_pgoff, NULL); + if (err) + return NULL; return prev; } @@ -814,11 +826,13 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen)) { if (prev && addr < prev->vm_end) /* case 4 */ - vma_adjust(prev, prev->vm_start, + err = vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL); else /* cases 3, 8 */ - vma_adjust(area, addr, next->vm_end, + err = vma_adjust(area, addr, next->vm_end, next->vm_pgoff - pglen, NULL); + if (err) + return NULL; return area; } @@ -826,6 +840,61 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, } /* + * Rough compatbility check to quickly see if it's even worth looking + * at sharing an anon_vma. + * + * They need to have the same vm_file, and the flags can only differ + * in things that mprotect may change. + * + * NOTE! The fact that we share an anon_vma doesn't _have_ to mean that + * we can merge the two vma's. For example, we refuse to merge a vma if + * there is a vm_ops->close() function, because that indicates that the + * driver is doing some kind of reference counting. But that doesn't + * really matter for the anon_vma sharing case. + */ +static int anon_vma_compatible(struct vm_area_struct *a, struct vm_area_struct *b) +{ + return a->vm_end == b->vm_start && + mpol_equal(vma_policy(a), vma_policy(b)) && + a->vm_file == b->vm_file && + !((a->vm_flags ^ b->vm_flags) & ~(VM_READ|VM_WRITE|VM_EXEC)) && + b->vm_pgoff == a->vm_pgoff + ((b->vm_start - a->vm_start) >> PAGE_SHIFT); +} + +/* + * Do some basic sanity checking to see if we can re-use the anon_vma + * from 'old'. The 'a'/'b' vma's are in VM order - one of them will be + * the same as 'old', the other will be the new one that is trying + * to share the anon_vma. + * + * NOTE! This runs with mm_sem held for reading, so it is possible that + * the anon_vma of 'old' is concurrently in the process of being set up + * by another page fault trying to merge _that_. But that's ok: if it + * is being set up, that automatically means that it will be a singleton + * acceptable for merging, so we can do all of this optimistically. But + * we do that ACCESS_ONCE() to make sure that we never re-load the pointer. + * + * IOW: that the "list_is_singular()" test on the anon_vma_chain only + * matters for the 'stable anon_vma' case (ie the thing we want to avoid + * is to return an anon_vma that is "complex" due to having gone through + * a fork). + * + * We also make sure that the two vma's are compatible (adjacent, + * and with the same memory policies). That's all stable, even with just + * a read lock on the mm_sem. + */ +static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_area_struct *a, struct vm_area_struct *b) +{ + if (anon_vma_compatible(a, b)) { + struct anon_vma *anon_vma = ACCESS_ONCE(old->anon_vma); + + if (anon_vma && list_is_singular(&old->anon_vma_chain)) + return anon_vma; + } + return NULL; +} + +/* * find_mergeable_anon_vma is used by anon_vma_prepare, to check * neighbouring vmas for a suitable anon_vma, before it goes off * to allocate a new anon_vma. It checks because a repetitive @@ -835,28 +904,16 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { + struct anon_vma *anon_vma; struct vm_area_struct *near; - unsigned long vm_flags; near = vma->vm_next; if (!near) goto try_prev; - /* - * Since only mprotect tries to remerge vmas, match flags - * which might be mprotected into each other later on. - * Neither mlock nor madvise tries to remerge at present, - * so leave their flags as obstructing a merge. - */ - vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); - vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); - - if (near->anon_vma && vma->vm_end == near->vm_start && - mpol_equal(vma_policy(vma), vma_policy(near)) && - can_vma_merge_before(near, vm_flags, - NULL, vma->vm_file, vma->vm_pgoff + - ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT))) - return near->anon_vma; + anon_vma = reusable_anon_vma(near, vma, near); + if (anon_vma) + return anon_vma; try_prev: /* * It is potentially slow to have to call find_vma_prev here. @@ -869,14 +926,9 @@ try_prev: if (!near) goto none; - vm_flags = vma->vm_flags & ~(VM_READ|VM_WRITE|VM_EXEC); - vm_flags |= near->vm_flags & (VM_READ|VM_WRITE|VM_EXEC); - - if (near->anon_vma && near->vm_end == vma->vm_start && - mpol_equal(vma_policy(near), vma_policy(vma)) && - can_vma_merge_after(near, vm_flags, - NULL, vma->vm_file, vma->vm_pgoff)) - return near->anon_vma; + anon_vma = reusable_anon_vma(near, near, vma); + if (anon_vma) + return anon_vma; none: /* * There's no absolute need to look only at touching neighbours: @@ -908,7 +960,7 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, #endif /* CONFIG_PROC_FS */ /* - * The caller must hold down_write(current->mm->mmap_sem). + * The caller must hold down_write(¤t->mm->mmap_sem). */ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, @@ -937,13 +989,9 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (!(flags & MAP_FIXED)) addr = round_hint_to_min(addr); - error = arch_mmap_check(addr, len, flags); - if (error) - return error; - /* Careful about overflows.. */ len = PAGE_ALIGN(len); - if (!len || len > TASK_SIZE) + if (!len) return -ENOMEM; /* offset overflow? */ @@ -968,18 +1016,16 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; - if (flags & MAP_LOCKED) { + if (flags & MAP_LOCKED) if (!can_do_mlock()) return -EPERM; - vm_flags |= VM_LOCKED; - } /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; locked = len >> PAGE_SHIFT; locked += mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -1050,14 +1096,76 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, error = security_file_mmap(file, reqprot, prot, flags, addr, 0); if (error) return error; - error = ima_file_mmap(file, prot); - if (error) - return error; return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + if (!(flags & MAP_ANONYMOUS)) { + audit_mmap_fd(fd, flags); + if (unlikely(flags & MAP_HUGETLB)) + return -EINVAL; + file = fget(fd); + if (!file) + goto out; + } else if (flags & MAP_HUGETLB) { + struct user_struct *user = NULL; + /* + * VM_NORESERVE is used because the reservations will be + * taken when vm_ops->mmap() is called + * A dummy user value is used because we are not locking + * memory so no accounting is necessary + */ + len = ALIGN(len, huge_page_size(&default_hstate)); + file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE); + if (IS_ERR(file)) + return PTR_ERR(file); + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + /* * Some shared mappigns will want the pages marked read-only * to track write events. If so, we'll downgrade vm_page_prot @@ -1180,6 +1288,7 @@ munmap_back: vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; + INIT_LIST_HEAD(&vma->anon_vma_chain); if (file) { error = -EINVAL; @@ -1198,23 +1307,35 @@ munmap_back: goto unmap_and_free_vma; if (vm_flags & VM_EXECUTABLE) added_exe_file_vma(mm); + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + addr = vma->vm_start; + pgoff = vma->vm_pgoff; + vm_flags = vma->vm_flags; } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) goto free_vma; } - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - */ - addr = vma->vm_start; - pgoff = vma->vm_pgoff; - vm_flags = vma->vm_flags; + if (vma_wants_writenotify(vma)) { + pgprot_t pprot = vma->vm_page_prot; - if (vma_wants_writenotify(vma)) + /* Can vma->vm_page_prot have changed?? + * + * Answer: Yes, drivers may have changed it in their + * f_op->mmap method. + * + * Ensures that vmas marked as uncached stay that way. + */ vma->vm_page_prot = vm_get_page_prot(vm_flags & ~VM_SHARED); + if (pgprot_val(pprot) == pgprot_val(pgprot_noncached(pprot))) + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + } vma_link(mm, vma, prev, rb_link, rb_parent); file = vma->vm_file; @@ -1223,18 +1344,13 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: - perf_counter_mmap(vma); + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - /* - * makes pages present; downgrades, drops, reacquires mmap_sem - */ - long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); - if (nr_pages < 0) - return nr_pages; /* vma gone! */ - mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + if (!mlock_vma_pages_range(vma, addr, addr + len)) + mm->locked_vm += (len >> PAGE_SHIFT); } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); return addr; @@ -1448,6 +1564,14 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long (*get_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); + unsigned long error = arch_mmap_check(addr, len, flags); + if (error) + return error; + + /* Careful about overflows.. */ + if (len > TASK_SIZE) + return -ENOMEM; + get_area = current->mm->get_unmapped_area; if (file && file->f_op && file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; @@ -1554,7 +1678,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns return -ENOMEM; /* Stack limit test */ - if (size > rlim[RLIMIT_STACK].rlim_cur) + if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur)) return -ENOMEM; /* mlock limit tests */ @@ -1562,7 +1686,8 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns unsigned long locked; unsigned long limit; locked = mm->locked_vm + grow; - limit = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur); + limit >>= PAGE_SHIFT; if (locked > limit && !capable(CAP_IPC_LOCK)) return -ENOMEM; } @@ -1593,9 +1718,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns * PA-RISC uses this for its stack; IA64 for its Register Backing Store. * vma is the last one with address > vma->vm_end. Have to extend vma. */ -#ifndef CONFIG_IA64 -static -#endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1609,7 +1731,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) */ if (unlikely(anon_vma_prepare(vma))) return -ENOMEM; - anon_vma_lock(vma); + vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller @@ -1620,7 +1742,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (address < PAGE_ALIGN(address+4)) address = PAGE_ALIGN(address+4); else { - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return -ENOMEM; } error = 0; @@ -1633,10 +1755,12 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) grow = (address - vma->vm_end) >> PAGE_SHIFT; error = acct_stack_growth(vma, size, grow); - if (!error) + if (!error) { vma->vm_end = address; + perf_event_mmap(vma); + } } - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1661,7 +1785,7 @@ static int expand_downwards(struct vm_area_struct *vma, if (error) return error; - anon_vma_lock(vma); + vma_lock_anon_vma(vma); /* * vma->vm_start/vm_end cannot change under us because the caller @@ -1680,9 +1804,10 @@ static int expand_downwards(struct vm_area_struct *vma, if (!error) { vma->vm_start = address; vma->vm_pgoff -= grow; + perf_event_mmap(vma); } } - anon_vma_unlock(vma); + vma_unlock_anon_vma(vma); return error; } @@ -1709,8 +1834,7 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) if (!prev || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) { - if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) - return NULL; /* vma gone! */ + mlock_vma_pages_range(prev, addr, prev->vm_end); } return prev; } @@ -1738,8 +1862,7 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) if (expand_stack(vma, addr)) return NULL; if (vma->vm_flags & VM_LOCKED) { - if (mlock_vma_pages_range(vma, addr, start) < 0) - return NULL; /* vma gone! */ + mlock_vma_pages_range(vma, addr, start); } return vma; } @@ -1801,6 +1924,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); + vma->vm_prev = NULL; do { rb_erase(&vma->vm_rb, &mm->mm_rb); mm->map_count--; @@ -1808,6 +1932,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; + if (vma) + vma->vm_prev = prev; tail_vma->vm_next = NULL; if (mm->unmap_area == arch_unmap_area) addr = prev ? prev->vm_end : mm->mmap_base; @@ -1818,29 +1944,29 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } /* - * Split a vma into two pieces at address 'addr', a new vma is allocated - * either for the first part or the tail. + * __split_vma() bypasses sysctl_max_map_count checking. We use this on the + * munmap path where it doesn't make sense to fail. */ -int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, +static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma, unsigned long addr, int new_below) { struct mempolicy *pol; struct vm_area_struct *new; + int err = -ENOMEM; if (is_vm_hugetlb_page(vma) && (addr & ~(huge_page_mask(hstate_vma(vma))))) return -EINVAL; - if (mm->map_count >= sysctl_max_map_count) - return -ENOMEM; - new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!new) - return -ENOMEM; + goto out_err; /* most fields are the same, copy all, and then fixup */ *new = *vma; + INIT_LIST_HEAD(&new->anon_vma_chain); + if (new_below) new->vm_end = addr; else { @@ -1850,11 +1976,14 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, pol = mpol_dup(vma_policy(vma)); if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new); - return PTR_ERR(pol); + err = PTR_ERR(pol); + goto out_free_vma; } vma_set_policy(new, pol); + if (anon_vma_clone(new, vma)) + goto out_free_mpol; + if (new->vm_file) { get_file(new->vm_file); if (vma->vm_flags & VM_EXECUTABLE) @@ -1865,12 +1994,43 @@ int split_vma(struct mm_struct * mm, struct vm_area_struct * vma, new->vm_ops->open(new); if (new_below) - vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + + err = vma_adjust(vma, addr, vma->vm_end, vma->vm_pgoff + ((addr - new->vm_start) >> PAGE_SHIFT), new); else - vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); - return 0; + /* Success. */ + if (!err) + return 0; + + /* Clean everything up if vma_adjust failed. */ + if (new->vm_ops && new->vm_ops->close) + new->vm_ops->close(new); + if (new->vm_file) { + if (vma->vm_flags & VM_EXECUTABLE) + removed_exe_file_vma(mm); + fput(new->vm_file); + } + unlink_anon_vmas(new); + out_free_mpol: + mpol_put(pol); + out_free_vma: + kmem_cache_free(vm_area_cachep, new); + out_err: + return err; +} + +/* + * Split a vma into two pieces at address 'addr', a new vma is allocated + * either for the first part or the tail. + */ +int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, int new_below) +{ + if (mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + return __split_vma(mm, vma, addr, new_below); } /* Munmap is split into 2 main parts -- this part which finds @@ -1908,7 +2068,17 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) * places tmp vma above, and higher split_vma places tmp vma below. */ if (start > vma->vm_start) { - int error = split_vma(mm, vma, start, 0); + int error; + + /* + * Make sure that map_count on return from munmap() will + * not exceed its limit; but let map_count go just above + * its limit temporarily, to help free resources as expected. + */ + if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) + return -ENOMEM; + + error = __split_vma(mm, vma, start, 0); if (error) return error; prev = vma; @@ -1917,7 +2087,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) /* Does it split the last one? */ last = find_vma(mm, end); if (last && end > last->vm_start) { - int error = split_vma(mm, last, end, 1); + int error = __split_vma(mm, last, end, 1); if (error) return error; } @@ -1992,20 +2162,14 @@ unsigned long do_brk(unsigned long addr, unsigned long len) if (!len) return addr; - if ((addr + len) > TASK_SIZE || (addr + len) < addr) - return -EINVAL; - - if (is_hugepage_only_range(mm, addr, len)) - return -EINVAL; - error = security_file_mmap(NULL, 0, 0, 0, addr, 1); if (error) return error; flags = VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - error = arch_mmap_check(addr, len, flags); - if (error) + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (error & ~PAGE_MASK) return error; /* @@ -2015,7 +2179,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) unsigned long locked, lock_limit; locked = len >> PAGE_SHIFT; locked += mm->locked_vm; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit = rlimit(RLIMIT_MEMLOCK); lock_limit >>= PAGE_SHIFT; if (locked > lock_limit && !capable(CAP_IPC_LOCK)) return -EAGAIN; @@ -2063,6 +2227,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) return -ENOMEM; } + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -2071,6 +2236,7 @@ unsigned long do_brk(unsigned long addr, unsigned long len) vma->vm_page_prot = vm_get_page_prot(flags); vma_link(mm, vma, prev, rb_link, rb_parent); out: + perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { if (!mlock_vma_pages_range(vma, addr, addr + len)) @@ -2114,6 +2280,7 @@ void exit_mmap(struct mm_struct *mm) /* Use -1 here to ensure all VMAs in the mm are unmapped */ end = unmap_vmas(&tlb, vma, 0, -1, &nr_accounted, NULL); vm_unacct_memory(nr_accounted); + free_pgtables(tlb, vma, FIRST_USER_ADDRESS, 0); tlb_finish_mmu(tlb, 0, end); @@ -2198,10 +2365,11 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, if (new_vma) { *new_vma = *vma; pol = mpol_dup(vma_policy(vma)); - if (IS_ERR(pol)) { - kmem_cache_free(vm_area_cachep, new_vma); - return NULL; - } + if (IS_ERR(pol)) + goto out_free_vma; + INIT_LIST_HEAD(&new_vma->anon_vma_chain); + if (anon_vma_clone(new_vma, vma)) + goto out_free_mempol; vma_set_policy(new_vma, pol); new_vma->vm_start = addr; new_vma->vm_end = addr + len; @@ -2217,6 +2385,12 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, } } return new_vma; + + out_free_mempol: + mpol_put(pol); + out_free_vma: + kmem_cache_free(vm_area_cachep, new_vma); + return NULL; } /* @@ -2228,7 +2402,7 @@ int may_expand_vm(struct mm_struct *mm, unsigned long npages) unsigned long cur = mm->total_vm; /* pages */ unsigned long lim; - lim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + lim = rlimit(RLIMIT_AS) >> PAGE_SHIFT; if (cur + npages > lim) return 0; @@ -2270,7 +2444,7 @@ static void special_mapping_close(struct vm_area_struct *vma) { } -static struct vm_operations_struct special_mapping_vmops = { +static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, }; @@ -2288,12 +2462,14 @@ int install_special_mapping(struct mm_struct *mm, unsigned long addr, unsigned long len, unsigned long vm_flags, struct page **pages) { + int ret; struct vm_area_struct *vma; vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (unlikely(vma == NULL)) return -ENOMEM; + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_mm = mm; vma->vm_start = addr; vma->vm_end = addr + len; @@ -2304,39 +2480,46 @@ int install_special_mapping(struct mm_struct *mm, vma->vm_ops = &special_mapping_vmops; vma->vm_private_data = pages; - if (unlikely(insert_vm_struct(mm, vma))) { - kmem_cache_free(vm_area_cachep, vma); - return -ENOMEM; - } + ret = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1); + if (ret) + goto out; + + ret = insert_vm_struct(mm, vma); + if (ret) + goto out; mm->total_vm += len >> PAGE_SHIFT; - perf_counter_mmap(vma); + perf_event_mmap(vma); return 0; + +out: + kmem_cache_free(vm_area_cachep, vma); + return ret; } static DEFINE_MUTEX(mm_all_locks_mutex); static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma) { - if (!test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (!test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change from under us * because we hold the mm_all_locks_mutex. */ - spin_lock_nest_lock(&anon_vma->lock, &mm->mmap_sem); + spin_lock_nest_lock(&anon_vma->root->lock, &mm->mmap_sem); /* * We can safely modify head.next after taking the - * anon_vma->lock. If some other vma in this mm shares + * anon_vma->root->lock. If some other vma in this mm shares * the same anon_vma we won't take it again. * * No need of atomic instructions here, head.next * can't change from under us thanks to the - * anon_vma->lock. + * anon_vma->root->lock. */ if (__test_and_set_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); } } @@ -2394,6 +2577,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping) int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; + struct anon_vma_chain *avc; int ret = -EINTR; BUG_ON(down_read_trylock(&mm->mmap_sem)); @@ -2411,7 +2595,8 @@ int mm_take_all_locks(struct mm_struct *mm) if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) - vm_lock_anon_vma(mm, vma->anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_lock_anon_vma(mm, avc->anon_vma); } ret = 0; @@ -2425,7 +2610,7 @@ out_unlock: static void vm_unlock_anon_vma(struct anon_vma *anon_vma) { - if (test_bit(0, (unsigned long *) &anon_vma->head.next)) { + if (test_bit(0, (unsigned long *) &anon_vma->root->head.next)) { /* * The LSB of head.next can't change to 0 from under * us because we hold the mm_all_locks_mutex. @@ -2436,12 +2621,12 @@ static void vm_unlock_anon_vma(struct anon_vma *anon_vma) * * No need of atomic instructions here, head.next * can't change from under us until we release the - * anon_vma->lock. + * anon_vma->root->lock. */ if (!__test_and_clear_bit(0, (unsigned long *) - &anon_vma->head.next)) + &anon_vma->root->head.next)) BUG(); - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); } } @@ -2466,13 +2651,15 @@ static void vm_unlock_mapping(struct address_space *mapping) void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; + struct anon_vma_chain *avc; BUG_ON(down_read_trylock(&mm->mmap_sem)); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); for (vma = mm->mmap; vma; vma = vma->vm_next) { if (vma->anon_vma) - vm_unlock_anon_vma(vma->anon_vma); + list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) + vm_unlock_anon_vma(avc->anon_vma); if (vma->vm_file && vma->vm_file->f_mapping) vm_unlock_mapping(vma->vm_file->f_mapping); } diff --git a/mm/mmu_context.c b/mm/mmu_context.c new file mode 100644 index 0000000..9e82e93 --- /dev/null +++ b/mm/mmu_context.c @@ -0,0 +1,62 @@ +/* Copyright (C) 2009 Red Hat, Inc. + * + * See ../COPYING for licensing terms. + */ + +#include <linux/mm.h> +#include <linux/mmu_context.h> +#include <linux/module.h> +#include <linux/sched.h> + +#include <asm/mmu_context.h> + +/* + * use_mm + * Makes the calling kernel thread take on the specified + * mm context. + * Called by the retry thread execute retries within the + * iocb issuer's mm context, so that copy_from/to_user + * operations work seamlessly for aio. + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void use_mm(struct mm_struct *mm) +{ + struct mm_struct *active_mm; + struct task_struct *tsk = current; + + task_lock(tsk); + active_mm = tsk->active_mm; + if (active_mm != mm) { + atomic_inc(&mm->mm_count); + tsk->active_mm = mm; + } + tsk->mm = mm; + switch_mm(active_mm, mm, tsk); + task_unlock(tsk); + + if (active_mm != mm) + mmdrop(active_mm); +} +EXPORT_SYMBOL_GPL(use_mm); + +/* + * unuse_mm + * Reverses the effect of use_mm, i.e. releases the + * specified mm context which was earlier taken on + * by the calling kernel thread + * (Note: this routine is intended to be called only + * from a kernel thread context) + */ +void unuse_mm(struct mm_struct *mm) +{ + struct task_struct *tsk = current; + + task_lock(tsk); + sync_mm_rss(tsk, mm); + tsk->mm = NULL; + /* active_mm is still 'mm' */ + enter_lazy_tlb(mm, tsk); + task_unlock(tsk); +} +EXPORT_SYMBOL_GPL(unuse_mm); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 5f4ef02..438951d 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -16,6 +16,7 @@ #include <linux/err.h> #include <linux/rcupdate.h> #include <linux/sched.h> +#include <linux/slab.h> /* * This function can't run concurrently against mmu_notifier_register @@ -99,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, + pte_t pte) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->change_pte) + mn->ops->change_pte(mn, mm, address, pte); + /* + * Some drivers don't have change_pte, + * so we must call invalidate_page in that case. + */ + else if (mn->ops->invalidate_page) + mn->ops->invalidate_page(mn, mm, address); + } + rcu_read_unlock(); +} + void __mmu_notifier_invalidate_page(struct mm_struct *mm, unsigned long address) { diff --git a/mm/mmzone.c b/mm/mmzone.c index f5b7d17..e35bfb8 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ + +#ifdef CONFIG_SMP +/* Called when a more accurate view of NR_FREE_PAGES is needed */ +unsigned long zone_nr_free_pages(struct zone *zone) +{ + unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); + + /* + * While kswapd is awake, it is considered the zone is under some + * memory pressure. Under pressure, there is a risk that + * per-cpu-counter-drift will allow the min watermark to be breached + * potentially causing a live-lock. While kswapd is awake and + * free pages are low, get a better estimate for free pages + */ + if (nr_free_pages < zone->percpu_drift_mark && + !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) + return zone_page_state_snapshot(zone, NR_FREE_PAGES); + + return nr_free_pages; +} +#endif /* CONFIG_SMP */ diff --git a/mm/mprotect.c b/mm/mprotect.c index d80311b..4c51338 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -10,7 +10,6 @@ #include <linux/mm.h> #include <linux/hugetlb.h> -#include <linux/slab.h> #include <linux/shm.h> #include <linux/mman.h> #include <linux/fs.h> @@ -23,7 +22,7 @@ #include <linux/swapops.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> -#include <linux/perf_counter.h> +#include <linux/perf_event.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/cacheflush.h> @@ -212,6 +211,7 @@ success: mmu_notifier_invalidate_range_end(mm, start, end); vm_stat_account(mm, oldflags, vma->vm_file, -nrpages); vm_stat_account(mm, newflags, vma->vm_file, nrpages); + perf_event_mmap(vma); return 0; fail: @@ -300,7 +300,6 @@ SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, error = mprotect_fixup(vma, &prev, nstart, tmp, newflags); if (error) goto out; - perf_counter_mmap(vma); nstart = tmp; if (nstart < prev->vm_end) diff --git a/mm/mremap.c b/mm/mremap.c index a39b7b9..563fbdd 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,8 +9,8 @@ #include <linux/mm.h> #include <linux/hugetlb.h> -#include <linux/slab.h> #include <linux/shm.h> +#include <linux/ksm.h> #include <linux/mman.h> #include <linux/swap.h> #include <linux/capability.h> @@ -85,8 +85,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock vmtruncate out, - * since it might clean the dst vma before the src vma, + * moving file-based ptes, we must lock truncate_pagecache + * out, since it might clean the dst vma before the src vma, * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; @@ -101,7 +101,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, * pte locks because exclusive mmap_sem prevents deadlock. */ old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl); - new_pte = pte_offset_map_nested(new_pmd, new_addr); + new_pte = pte_offset_map(new_pmd, new_addr); new_ptl = pte_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -119,7 +119,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, arch_leave_lazy_mmu_mode(); if (new_ptl != old_ptl) spin_unlock(new_ptl); - pte_unmap_nested(new_pte - 1); + pte_unmap(new_pte - 1); pte_unmap_unlock(old_pte - 1, old_ptl); if (mapping) spin_unlock(&mapping->i_mmap_lock); @@ -174,6 +174,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, unsigned long excess = 0; unsigned long hiwater_vm; int split = 0; + int err; /* * We'd prefer to avoid failure later on in do_munmap: @@ -182,6 +183,18 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (mm->map_count >= sysctl_max_map_count - 3) return -ENOMEM; + /* + * Advise KSM to break any KSM pages in the area to be moved: + * it would be confusing if they were to turn up at the new + * location, where they happen to coincide with different KSM + * pages recently unmapped. But leave vma->vm_flags as it was, + * so KSM can come around to merge on vma and new_vma afterwards. + */ + err = ksm_madvise(vma, old_addr, old_addr + old_len, + MADV_UNMERGEABLE, &vm_flags); + if (err) + return err; + new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT); new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff); if (!new_vma) @@ -247,6 +260,137 @@ static unsigned long move_vma(struct vm_area_struct *vma, return new_addr; } +static struct vm_area_struct *vma_to_resize(unsigned long addr, + unsigned long old_len, unsigned long new_len, unsigned long *p) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = find_vma(mm, addr); + + if (!vma || vma->vm_start > addr) + goto Efault; + + if (is_vm_hugetlb_page(vma)) + goto Einval; + + /* We can't remap across vm area boundaries */ + if (old_len > vma->vm_end - addr) + goto Efault; + + if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { + if (new_len > old_len) + goto Efault; + } + + if (vma->vm_flags & VM_LOCKED) { + unsigned long locked, lock_limit; + locked = mm->locked_vm << PAGE_SHIFT; + lock_limit = rlimit(RLIMIT_MEMLOCK); + locked += new_len - old_len; + if (locked > lock_limit && !capable(CAP_IPC_LOCK)) + goto Eagain; + } + + if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) + goto Enomem; + + if (vma->vm_flags & VM_ACCOUNT) { + unsigned long charged = (new_len - old_len) >> PAGE_SHIFT; + if (security_vm_enough_memory(charged)) + goto Efault; + *p = charged; + } + + return vma; + +Efault: /* very odd choice for most of the cases, but... */ + return ERR_PTR(-EFAULT); +Einval: + return ERR_PTR(-EINVAL); +Enomem: + return ERR_PTR(-ENOMEM); +Eagain: + return ERR_PTR(-EAGAIN); +} + +static unsigned long mremap_to(unsigned long addr, + unsigned long old_len, unsigned long new_addr, + unsigned long new_len) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long ret = -EINVAL; + unsigned long charged = 0; + unsigned long map_flags; + + if (new_addr & ~PAGE_MASK) + goto out; + + if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) + goto out; + + /* Check if the location we're moving into overlaps the + * old location at all, and fail if it does. + */ + if ((new_addr <= addr) && (new_addr+new_len) > addr) + goto out; + + if ((addr <= new_addr) && (addr+old_len) > new_addr) + goto out; + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; + + ret = do_munmap(mm, new_addr, new_len); + if (ret) + goto out; + + if (old_len >= new_len) { + ret = do_munmap(mm, addr+new_len, old_len - new_len); + if (ret && old_len != new_len) + goto out; + old_len = new_len; + } + + vma = vma_to_resize(addr, old_len, new_len, &charged); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); + goto out; + } + + map_flags = MAP_FIXED; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (ret & ~PAGE_MASK) + goto out1; + + ret = move_vma(vma, addr, old_len, new_len, new_addr); + if (!(ret & ~PAGE_MASK)) + goto out; +out1: + vm_unacct_memory(charged); + +out: + return ret; +} + +static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) +{ + unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ + return 0; + if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + return 0; + if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, + 0, MAP_FIXED) & ~PAGE_MASK) + return 0; + return 1; +} + /* * Expand (or shrink) an existing mapping, potentially moving it at the * same time (controlled by the MREMAP_MAYMOVE flag and available VM space) @@ -280,32 +424,10 @@ unsigned long do_mremap(unsigned long addr, if (!new_len) goto out; - /* new_addr is only valid if MREMAP_FIXED is specified */ if (flags & MREMAP_FIXED) { - if (new_addr & ~PAGE_MASK) - goto out; - if (!(flags & MREMAP_MAYMOVE)) - goto out; - - if (new_len > TASK_SIZE || new_addr > TASK_SIZE - new_len) - goto out; - - /* Check if the location we're moving into overlaps the - * old location at all, and fail if it does. - */ - if ((new_addr <= addr) && (new_addr+new_len) > addr) - goto out; - - if ((addr <= new_addr) && (addr+old_len) > new_addr) - goto out; - - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; - - ret = do_munmap(mm, new_addr, new_len); - if (ret) - goto out; + if (flags & MREMAP_MAYMOVE) + ret = mremap_to(addr, old_len, new_addr, new_len); + goto out; } /* @@ -318,64 +440,30 @@ unsigned long do_mremap(unsigned long addr, if (ret && old_len != new_len) goto out; ret = addr; - if (!(flags & MREMAP_FIXED) || (new_addr == addr)) - goto out; - old_len = new_len; + goto out; } /* - * Ok, we need to grow.. or relocate. + * Ok, we need to grow.. */ - ret = -EFAULT; - vma = find_vma(mm, addr); - if (!vma || vma->vm_start > addr) - goto out; - if (is_vm_hugetlb_page(vma)) { - ret = -EINVAL; - goto out; - } - /* We can't remap across vm area boundaries */ - if (old_len > vma->vm_end - addr) - goto out; - if (vma->vm_flags & (VM_DONTEXPAND | VM_PFNMAP)) { - if (new_len > old_len) - goto out; - } - if (vma->vm_flags & VM_LOCKED) { - unsigned long locked, lock_limit; - locked = mm->locked_vm << PAGE_SHIFT; - lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; - locked += new_len - old_len; - ret = -EAGAIN; - if (locked > lock_limit && !capable(CAP_IPC_LOCK)) - goto out; - } - if (!may_expand_vm(mm, (new_len - old_len) >> PAGE_SHIFT)) { - ret = -ENOMEM; + vma = vma_to_resize(addr, old_len, new_len, &charged); + if (IS_ERR(vma)) { + ret = PTR_ERR(vma); goto out; } - if (vma->vm_flags & VM_ACCOUNT) { - charged = (new_len - old_len) >> PAGE_SHIFT; - if (security_vm_enough_memory(charged)) - goto out_nc; - } - /* old_len exactly to the end of the area.. - * And we're not relocating the area. */ - if (old_len == vma->vm_end - addr && - !((flags & MREMAP_FIXED) && (addr != new_addr)) && - (old_len != new_len || !(flags & MREMAP_MAYMOVE))) { - unsigned long max_addr = TASK_SIZE; - if (vma->vm_next) - max_addr = vma->vm_next->vm_start; + if (old_len == vma->vm_end - addr) { /* can we just expand the current mapping? */ - if (max_addr - addr >= new_len) { + if (vma_expandable(vma, new_len - old_len)) { int pages = (new_len - old_len) >> PAGE_SHIFT; - vma_adjust(vma, vma->vm_start, - addr + new_len, vma->vm_pgoff, NULL); + if (vma_adjust(vma, vma->vm_start, addr + new_len, + vma->vm_pgoff, NULL)) { + ret = -ENOMEM; + goto out; + } mm->total_vm += pages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); @@ -395,28 +483,27 @@ unsigned long do_mremap(unsigned long addr, */ ret = -ENOMEM; if (flags & MREMAP_MAYMOVE) { - if (!(flags & MREMAP_FIXED)) { - unsigned long map_flags = 0; - if (vma->vm_flags & VM_MAYSHARE) - map_flags |= MAP_SHARED; - - new_addr = get_unmapped_area(vma->vm_file, 0, new_len, - vma->vm_pgoff, map_flags); - if (new_addr & ~PAGE_MASK) { - ret = new_addr; - goto out; - } - - ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); - if (ret) - goto out; + unsigned long map_flags = 0; + if (vma->vm_flags & VM_MAYSHARE) + map_flags |= MAP_SHARED; + + new_addr = get_unmapped_area(vma->vm_file, 0, new_len, + vma->vm_pgoff + + ((addr - vma->vm_start) >> PAGE_SHIFT), + map_flags); + if (new_addr & ~PAGE_MASK) { + ret = new_addr; + goto out; } + + ret = security_file_mmap(NULL, 0, 0, 0, new_addr, 1); + if (ret) + goto out; ret = move_vma(vma, addr, old_len, new_len, new_addr); } out: if (ret & ~PAGE_MASK) vm_unacct_memory(charged); -out_nc: return ret; } @@ -82,7 +82,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - error = vfs_fsync(file, file->f_path.dentry, 0); + error = vfs_fsync(file, 0); fput(file); if (error || start >= end) goto out; @@ -10,7 +10,7 @@ * Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com> * Copyright (c) 2000-2001 D Jeff Dionne <jeff@uClinux.org> * Copyright (c) 2002 Greg Ungerer <gerg@snapgear.com> - * Copyright (c) 2007-2009 Paul Mundt <lethal@linux-sh.org> + * Copyright (c) 2007-2010 Paul Mundt <lethal@linux-sh.org> */ #include <linux/module.h> @@ -29,17 +29,14 @@ #include <linux/personality.h> #include <linux/security.h> #include <linux/syscalls.h> +#include <linux/audit.h> #include <asm/uaccess.h> #include <asm/tlb.h> #include <asm/tlbflush.h> +#include <asm/mmu_context.h> #include "internal.h" -static inline __attribute__((format(printf, 1, 2))) -void no_printk(const char *fmt, ...) -{ -} - #if 0 #define kenter(FMT, ...) \ printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__) @@ -56,12 +53,11 @@ void no_printk(const char *fmt, ...) no_printk(KERN_DEBUG FMT"\n", ##__VA_ARGS__) #endif -#include "internal.h" - void *high_memory; struct page *mem_map; unsigned long max_mapnr; unsigned long num_physpages; +unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ int sysctl_overcommit_ratio = 50; /* default is 50% */ @@ -69,9 +65,6 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT; int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS; int heap_stack_gap = 0; -/* amount of vm to protect from userspace access */ -unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR; - atomic_long_t mmap_pages_allocated; EXPORT_SYMBOL(mem_map); @@ -82,50 +75,10 @@ static struct kmem_cache *vm_region_jar; struct rb_root nommu_region_tree = RB_ROOT; DECLARE_RWSEM(nommu_region_sem); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { }; /* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - i_size_write(inode, offset); - - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; - i_size_write(inode, offset); - -out_truncate: - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out: - return -EFBIG; -} - -EXPORT_SYMBOL(vmtruncate); - -/* * Return the total memory allocated for this pointer, not * just what the caller asked for. * @@ -173,30 +126,29 @@ unsigned int kobjsize(const void *objp) } int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int flags, - struct page **pages, struct vm_area_struct **vmas) + unsigned long start, int nr_pages, unsigned int foll_flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; - int write = !!(flags & GUP_FLAGS_WRITE); - int force = !!(flags & GUP_FLAGS_FORCE); - int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); /* calculate required read or write permissions. - * - if 'force' is set, we only require the "MAY" flags. + * If FOLL_FORCE is set, we only require the "MAY" flags. */ - vm_flags = write ? (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); - vm_flags &= force ? (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); + vm_flags = (foll_flags & FOLL_WRITE) ? + (VM_WRITE | VM_MAYWRITE) : (VM_READ | VM_MAYREAD); + vm_flags &= (foll_flags & FOLL_FORCE) ? + (VM_MAYREAD | VM_MAYWRITE) : (VM_READ | VM_WRITE); - for (i = 0; i < len; i++) { + for (i = 0; i < nr_pages; i++) { vma = find_vma(mm, start); if (!vma) goto finish_or_fault; /* protect what we can, including chardevs */ - if (vma->vm_flags & (VM_IO | VM_PFNMAP) || - (!ignore && !(vm_flags & vma->vm_flags))) + if ((vma->vm_flags & (VM_IO | VM_PFNMAP)) || + !(vm_flags & vma->vm_flags)) goto finish_or_fault; if (pages) { @@ -206,7 +158,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } if (vmas) vmas[i] = vma; - start += PAGE_SIZE; + start = (start + PAGE_SIZE) & PAGE_MASK; } return i; @@ -215,7 +167,6 @@ finish_or_fault: return i ? : -EFAULT; } - /* * get a list of pages in an address range belonging to the specified process * and indicate the VMA that covers each page @@ -224,22 +175,41 @@ finish_or_fault: * - don't permit access to VMAs that don't support it, such as I/O mappings */ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + unsigned long start, int nr_pages, int write, int force, struct page **pages, struct vm_area_struct **vmas) { int flags = 0; if (write) - flags |= GUP_FLAGS_WRITE; + flags |= FOLL_WRITE; if (force) - flags |= GUP_FLAGS_FORCE; + flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, - start, len, flags, - pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); } EXPORT_SYMBOL(get_user_pages); +/** + * follow_pfn - look up PFN at a user virtual address + * @vma: memory mapping + * @address: user virtual address + * @pfn: location to store found PFN + * + * Only IO mappings and raw PFN mappings are allowed. + * + * Returns zero and the pfn at @pfn on success, -ve otherwise. + */ +int follow_pfn(struct vm_area_struct *vma, unsigned long address, + unsigned long *pfn) +{ + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) + return -EINVAL; + + *pfn = address >> PAGE_SHIFT; + return 0; +} +EXPORT_SYMBOL(follow_pfn); + DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; @@ -324,12 +294,60 @@ void *vmalloc(unsigned long size) } EXPORT_SYMBOL(vmalloc); +/* + * vzalloc - allocate virtually continguos memory with zero fill + * + * @size: allocation size + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into continguos kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + PAGE_KERNEL); +} +EXPORT_SYMBOL(vzalloc); + +/** + * vmalloc_node - allocate memory on a specific node + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ void *vmalloc_node(unsigned long size, int node) { return vmalloc(size); } EXPORT_SYMBOL(vmalloc_node); +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return vzalloc(size); +} +EXPORT_SYMBOL(vzalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -423,6 +441,31 @@ void __attribute__((weak)) vmalloc_sync_all(void) { } +/** + * alloc_vm_area - allocate a range of kernel address space + * @size: size of the area + * + * Returns: NULL on failure, vm_struct on success + * + * This function reserves a range of kernel address space, and + * allocates pagetables to map that range. No actual mappings + * are created. If the kernel address space is not shared + * between processes, it syncs the pagetable across all + * processes. + */ +struct vm_struct *alloc_vm_area(size_t size) +{ + BUG(); + return NULL; +} +EXPORT_SYMBOL_GPL(alloc_vm_area); + +void free_vm_area(struct vm_struct *area) +{ + BUG(); +} +EXPORT_SYMBOL_GPL(free_vm_area); + int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page) { @@ -458,6 +501,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. */ + flush_icache_range(mm->brk, brk); return mm->brk = brk; } @@ -577,11 +621,11 @@ static void free_page_series(unsigned long from, unsigned long to) static void __put_nommu_region(struct vm_region *region) __releases(nommu_region_sem) { - kenter("%p{%d}", region, atomic_read(®ion->vm_usage)); + kenter("%p{%d}", region, region->vm_usage); BUG_ON(!nommu_region_tree.rb_node); - if (atomic_dec_and_test(®ion->vm_usage)) { + if (--region->vm_usage == 0) { if (region->vm_top > region->vm_start) delete_nommu_region(region); up_write(&nommu_region_sem); @@ -611,6 +655,22 @@ static void put_nommu_region(struct vm_region *region) } /* + * update protection on a vma + */ +static void protect_vma(struct vm_area_struct *vma, unsigned long flags) +{ +#ifdef CONFIG_MPU + struct mm_struct *mm = vma->vm_mm; + long start = vma->vm_start & PAGE_MASK; + while (start < vma->vm_end) { + protect_page(mm, start, flags); + start += PAGE_SIZE; + } + update_protections(mm); +#endif +} + +/* * add a VMA into a process's mm_struct in the appropriate place in the list * and tree and add to the address space's page tree also if not an anonymous * page @@ -618,7 +678,7 @@ static void put_nommu_region(struct vm_region *region) */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma, **pp; + struct vm_area_struct *pvma, **pp, *next; struct address_space *mapping; struct rb_node **p, *parent; @@ -629,6 +689,8 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) mm->map_count++; vma->vm_mm = mm; + protect_vma(vma, vma->vm_flags); + /* add the VMA to the mapping */ if (vma->vm_file) { mapping = vma->vm_file->f_mapping; @@ -676,8 +738,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) break; } - vma->vm_next = *pp; + next = *pp; *pp = vma; + vma->vm_next = next; + if (next) + next->vm_prev = vma; } /* @@ -691,6 +756,8 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) kenter("%p", vma); + protect_vma(vma, 0); + mm->map_count--; if (mm->mmap_cache == vma) mm->mmap_cache = NULL; @@ -832,7 +899,7 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED || addr) { + if (flags & MAP_FIXED) { printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n", current->pid); @@ -903,6 +970,10 @@ static int validate_mmap_request(struct file *file, if (!file->f_op->read) capabilities &= ~BDI_CAP_MAP_COPY; + /* The file shall have been opened with read permission. */ + if (!(file->f_mode & FMODE_READ)) + return -EACCES; + if (flags & MAP_SHARED) { /* do checks for writing, appending and locking */ if ((prot & PROT_WRITE) && @@ -919,14 +990,6 @@ static int validate_mmap_request(struct file *file, if (!(capabilities & BDI_CAP_MAP_DIRECT)) return -ENODEV; - if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || - ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || - ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) - ) { - printk("MAP_SHARED not completely supported on !MMU\n"); - return -EINVAL; - } - /* we mustn't privatise shared mappings */ capabilities &= ~BDI_CAP_MAP_COPY; } @@ -942,6 +1005,20 @@ static int validate_mmap_request(struct file *file, capabilities &= ~BDI_CAP_MAP_DIRECT; } + if (capabilities & BDI_CAP_MAP_DIRECT) { + if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) || + ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) || + ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP)) + ) { + capabilities &= ~BDI_CAP_MAP_DIRECT; + if (flags & MAP_SHARED) { + printk(KERN_WARNING + "MAP_SHARED not completely supported on !MMU\n"); + return -EINVAL; + } + } + } + /* handle executable mappings and implied executable * mappings */ if (file->f_path.mnt->mnt_flags & MNT_NOEXEC) { @@ -997,22 +1074,20 @@ static unsigned long determine_vm_flags(struct file *file, unsigned long vm_flags; vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags); - vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; /* vm_flags |= mm->def_flags; */ if (!(capabilities & BDI_CAP_MAP_DIRECT)) { /* attempt to share read-only copies of mapped file chunks */ + vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (file && !(prot & PROT_WRITE)) vm_flags |= VM_MAYSHARE; - } - else { + } else { /* overlay a shareable mapping on the backing device or inode * if possible - used for chardevs, ramfs/tmpfs/shmfs and * romfs/cramfs */ + vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS); if (flags & MAP_SHARED) - vm_flags |= VM_MAYSHARE | VM_SHARED; - else if ((((vm_flags & capabilities) ^ vm_flags) & BDI_CAP_VMFLAGS) == 0) - vm_flags |= VM_MAYSHARE; + vm_flags |= VM_SHARED; } /* refuse to let anyone share private mappings with this process if @@ -1036,15 +1111,14 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; - /* getting an ENOSYS error indicates that direct mmap isn't - * possible (as opposed to tried but failed) so we'll fall - * through to making a private copy of the data and mapping - * that if we can */ + /* getting -ENOSYS indicates that direct mmap isn't possible (as + * opposed to tried but failed) so we can only give a suitable error as + * it's not possible to make a private copy if MAP_SHARED was given */ return -ENODEV; } @@ -1053,7 +1127,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) */ static int do_mmap_private(struct vm_area_struct *vma, struct vm_region *region, - unsigned long len) + unsigned long len, + unsigned long capabilities) { struct page *pages; unsigned long total, point, n, rlen; @@ -1064,13 +1139,13 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (vma->vm_file) { + if (capabilities & BDI_CAP_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1144,9 +1219,6 @@ static int do_mmap_private(struct vm_area_struct *vma, if (ret < rlen) memset(base + ret, 0, rlen - ret); - } else { - /* if it's an anonymous mapping, then just clear it */ - memset(base, 0, rlen); } return 0; @@ -1183,9 +1255,6 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - if (!(flags & MAP_FIXED)) - addr = round_hint_to_min(addr); - /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, @@ -1195,6 +1264,9 @@ unsigned long do_mmap_pgoff(struct file *file, return ret; } + /* we ignore the address hint */ + addr = 0; + /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); @@ -1208,11 +1280,11 @@ unsigned long do_mmap_pgoff(struct file *file, if (!vma) goto error_getting_vma; - atomic_set(®ion->vm_usage, 1); + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; - INIT_LIST_HEAD(&vma->anon_vma_node); + INIT_LIST_HEAD(&vma->anon_vma_chain); vma->vm_flags = vm_flags; vma->vm_pgoff = pgoff; @@ -1275,7 +1347,7 @@ unsigned long do_mmap_pgoff(struct file *file, } /* we've found a region we can share */ - atomic_inc(&pregion->vm_usage); + pregion->vm_usage++; vma->vm_region = pregion; start = pregion->vm_start; start += (pgoff - pregion->vm_pgoff) << PAGE_SHIFT; @@ -1292,7 +1364,7 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = NULL; vma->vm_start = 0; vma->vm_end = 0; - atomic_dec(&pregion->vm_usage); + pregion->vm_usage--; pregion = NULL; goto error_just_free; } @@ -1308,7 +1380,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (file && file->f_op->get_unmapped_area) { + if (capabilities & BDI_CAP_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR((void *) addr)) { @@ -1333,16 +1405,22 @@ unsigned long do_mmap_pgoff(struct file *file, vma->vm_region = region; - /* set up the mapping */ + /* set up the mapping + * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, region, len); + ret = do_mmap_private(vma, region, len, capabilities); if (ret < 0) - goto error_put_region; - + goto error_just_free; add_nommu_region(region); + /* clear anonymous mappings that don't ask for uninitialized data */ + if (!vma->vm_file && !(flags & MAP_UNINITIALIZED)) + memset((void *)region->vm_start, 0, + region->vm_end - region->vm_start); + /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; @@ -1351,33 +1429,26 @@ unsigned long do_mmap_pgoff(struct file *file, share: add_vma_to_mm(current->mm, vma); - up_write(&nommu_region_sem); + /* we flush the region from the icache only when the first executable + * mapping of it is made */ + if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { + flush_icache_range(region->vm_start, region->vm_end); + region->vm_icache_flushed = true; + } - if (prot & PROT_EXEC) - flush_icache_range(result, result + len); + up_write(&nommu_region_sem); kleave(" = %lx", result); return result; -error_put_region: - __put_nommu_region(region); - if (vma) { - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } - kmem_cache_free(vm_area_cachep, vma); - } - kleave(" = %d [pr]", ret); - return ret; - error_just_free: up_write(&nommu_region_sem); error: - fput(region->vm_file); + if (region->vm_file) + fput(region->vm_file); kmem_cache_free(vm_region_jar, region); - fput(vma->vm_file); + if (vma->vm_file) + fput(vma->vm_file); if (vma->vm_flags & VM_EXECUTABLE) removed_exe_file_vma(vma->vm_mm); kmem_cache_free(vm_area_cachep, vma); @@ -1407,6 +1478,56 @@ error_getting_region: } EXPORT_SYMBOL(do_mmap_pgoff); +SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, + unsigned long, prot, unsigned long, flags, + unsigned long, fd, unsigned long, pgoff) +{ + struct file *file = NULL; + unsigned long retval = -EBADF; + + audit_mmap_fd(fd, flags); + if (!(flags & MAP_ANONYMOUS)) { + file = fget(fd); + if (!file) + goto out; + } + + flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); + + down_write(¤t->mm->mmap_sem); + retval = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); + up_write(¤t->mm->mmap_sem); + + if (file) + fput(file); +out: + return retval; +} + +#ifdef __ARCH_WANT_SYS_OLD_MMAP +struct mmap_arg_struct { + unsigned long addr; + unsigned long len; + unsigned long prot; + unsigned long flags; + unsigned long fd; + unsigned long offset; +}; + +SYSCALL_DEFINE1(old_mmap, struct mmap_arg_struct __user *, arg) +{ + struct mmap_arg_struct a; + + if (copy_from_user(&a, arg, sizeof(a))) + return -EFAULT; + if (a.offset & ~PAGE_MASK) + return -EINVAL; + + return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd, + a.offset >> PAGE_SHIFT); +} +#endif /* __ARCH_WANT_SYS_OLD_MMAP */ + /* * split a vma into two pieces at address 'addr', a new vma is allocated either * for the first part or the tail. @@ -1420,10 +1541,9 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, kenter(""); - /* we're only permitted to split anonymous regions that have a single - * owner */ - if (vma->vm_file || - atomic_read(&vma->vm_region->vm_usage) != 1) + /* we're only permitted to split anonymous regions (these should have + * only a single usage on the region) */ + if (vma->vm_file) return -ENOMEM; if (mm->map_count >= sysctl_max_map_count) @@ -1497,7 +1617,7 @@ static int shrink_vma(struct mm_struct *mm, /* cut the backing region down to size */ region = vma->vm_region; - BUG_ON(atomic_read(®ion->vm_usage) != 1); + BUG_ON(region->vm_usage != 1); down_write(&nommu_region_sem); delete_nommu_region(region); @@ -1623,6 +1743,7 @@ void exit_mmap(struct mm_struct *mm) mm->mmap = vma->vm_next; delete_vma_from_mm(vma); delete_vma(mm, vma); + cond_resched(); } kleave(""); @@ -1741,27 +1862,6 @@ void unmap_mapping_range(struct address_space *mapping, EXPORT_SYMBOL(unmap_mapping_range); /* - * ask for an unmapped area at which to create a mapping on a file - */ -unsigned long get_unmapped_area(struct file *file, unsigned long addr, - unsigned long len, unsigned long pgoff, - unsigned long flags) -{ - unsigned long (*get_area)(struct file *, unsigned long, unsigned long, - unsigned long, unsigned long); - - get_area = current->mm->get_unmapped_area; - if (file && file->f_op && file->f_op->get_unmapped_area) - get_area = file->f_op->get_unmapped_area; - - if (!get_area) - return -ENOSYS; - - return get_area(file, addr, len, pgoff, flags); -} -EXPORT_SYMBOL(get_unmapped_area); - -/* * Check that a process has enough memory to allocate a new virtual * mapping. 0 means there is enough memory for the allocation to * succeed and -ENOMEM implies there is not. @@ -1900,9 +2000,11 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in /* only read or write mappings where it is permitted */ if (write && vma->vm_flags & VM_MAYWRITE) - len -= copy_to_user((void *) addr, buf, len); + copy_to_user_page(vma, NULL, addr, + (void *) addr, buf, len); else if (!write && vma->vm_flags & VM_MAYREAD) - len -= copy_from_user(buf, (void *) addr, len); + copy_from_user_page(vma, NULL, addr, + buf, (void *) addr, len); else len = 0; } else { @@ -1913,3 +2015,65 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in mmput(mm); return len; } + +/** + * nommu_shrink_inode_mappings - Shrink the shared mappings on an inode + * @inode: The inode to check + * @size: The current filesize of the inode + * @newsize: The proposed filesize of the inode + * + * Check the shared mappings on an inode on behalf of a shrinking truncate to + * make sure that that any outstanding VMAs aren't broken and then shrink the + * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't + * automatically grant mappings that are too large. + */ +int nommu_shrink_inode_mappings(struct inode *inode, size_t size, + size_t newsize) +{ + struct vm_area_struct *vma; + struct prio_tree_iter iter; + struct vm_region *region; + pgoff_t low, high; + size_t r_size, r_top; + + low = newsize >> PAGE_SHIFT; + high = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + + down_write(&nommu_region_sem); + + /* search for VMAs that fall within the dead zone */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + low, high) { + /* found one - only interested if it's shared out of the page + * cache */ + if (vma->vm_flags & VM_SHARED) { + up_write(&nommu_region_sem); + return -ETXTBSY; /* not quite true, but near enough */ + } + } + + /* reduce any regions that overlap the dead zone - if in existence, + * these will be pointed to by VMAs that don't overlap the dead zone + * + * we don't check for any regions that start beyond the EOF as there + * shouldn't be any + */ + vma_prio_tree_foreach(vma, &iter, &inode->i_mapping->i_mmap, + 0, ULONG_MAX) { + if (!(vma->vm_flags & VM_SHARED)) + continue; + + region = vma->vm_region; + r_size = region->vm_top - region->vm_start; + r_top = (region->vm_pgoff << PAGE_SHIFT) + r_size; + + if (r_top > newsize) { + region->vm_top -= r_top - newsize; + if (region->vm_end > region->vm_top) + region->vm_end = region->vm_top; + } + } + + up_write(&nommu_region_sem); + return 0; +} diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a..7dcca55 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -4,6 +4,8 @@ * Copyright (C) 1998,2000 Rik van Riel * Thanks go out to Claus Fischer for some serious inspiration and * for goading me into coding this file... + * Copyright (C) 2010 Google, Inc. + * Rewritten by David Rientjes * * The routines in this file are used to kill a process when * we're seriously out of memory. This gets called from __alloc_pages() @@ -18,6 +20,7 @@ #include <linux/oom.h> #include <linux/mm.h> #include <linux/err.h> +#include <linux/gfp.h> #include <linux/sched.h> #include <linux/swap.h> #include <linux/timex.h> @@ -26,176 +29,258 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/memcontrol.h> +#include <linux/mempolicy.h> #include <linux/security.h> int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; -int sysctl_oom_dump_tasks; +int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* #define DEBUG */ +#ifdef CONFIG_NUMA /** - * badness - calculate a numeric value for how bad this task has been - * @p: task struct of which task we should calculate - * @uptime: current uptime in seconds + * has_intersects_mems_allowed() - check task eligiblity for kill + * @tsk: task struct of which task to consider + * @mask: nodemask passed to page allocator for mempolicy ooms * - * The formula used is relatively simple and documented inline in the - * function. The main rationale is that we want to select a good task - * to kill when we run out of memory. - * - * Good in this context means that: - * 1) we lose the minimum amount of work done - * 2) we recover a large amount of memory - * 3) we don't kill anything innocent of eating tons of memory - * 4) we want to kill the minimum amount of processes (one) - * 5) we try to kill the process the user expects us to kill, this - * algorithm has been meticulously tuned to meet the principle - * of least surprise ... (be careful when you change it) + * Task eligibility is determined by whether or not a candidate task, @tsk, + * shares the same mempolicy nodes as current if it is bound by such a policy + * and whether or not it has the same set of allowed cpuset nodes. */ +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) +{ + struct task_struct *start = tsk; -unsigned long badness(struct task_struct *p, unsigned long uptime) + do { + if (mask) { + /* + * If this is a mempolicy constrained oom, tsk's + * cpuset is irrelevant. Only return true if its + * mempolicy intersects current, otherwise it may be + * needlessly killed. + */ + if (mempolicy_nodemask_intersects(tsk, mask)) + return true; + } else { + /* + * This is not a mempolicy constrained oom, so only + * check the mems of tsk's cpuset. + */ + if (cpuset_mems_allowed_intersects(current, tsk)) + return true; + } + } while_each_thread(start, tsk); + + return false; +} +#else +static bool has_intersects_mems_allowed(struct task_struct *tsk, + const nodemask_t *mask) { - unsigned long points, cpu_time, run_time; - struct mm_struct *mm; - struct task_struct *child; - int oom_adj; + return true; +} +#endif /* CONFIG_NUMA */ - task_lock(p); - mm = p->mm; - if (!mm) { - task_unlock(p); - return 0; - } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } +/* + * If this is a system OOM (not a memcg OOM) and the task selected to be + * killed is not already running at high (RT) priorities, speed up the + * recovery by boosting the dying task to the lowest FIFO priority. + * That helps with the recovery and avoids interfering with RT tasks. + */ +static void boost_dying_task_prio(struct task_struct *p, + struct mem_cgroup *mem) +{ + struct sched_param param = { .sched_priority = 1 }; - /* - * The memory size of the process is the basis for the badness. - */ - points = mm->total_vm; + if (mem) + return; - /* - * After this unlock we can no longer dereference local variable `mm' - */ - task_unlock(p); + if (!rt_task(p)) + sched_setscheduler_nocheck(p, SCHED_FIFO, ¶m); +} - /* - * swapoff can easily use up all memory, so kill those first. - */ - if (p->flags & PF_SWAPOFF) - return ULONG_MAX; +/* + * The process p may have detached its own ->mm while exiting or through + * use_mm(), but one or more of its subthreads may still have a valid + * pointer. Return p, or any of its subthreads with a valid ->mm, with + * task_lock() held. + */ +struct task_struct *find_lock_task_mm(struct task_struct *p) +{ + struct task_struct *t = p; + + do { + task_lock(t); + if (likely(t->mm)) + return t; + task_unlock(t); + } while_each_thread(p, t); + + return NULL; +} + +/* return true if the task is not adequate as candidate victim task. */ +static bool oom_unkillable_task(struct task_struct *p, + const struct mem_cgroup *mem, const nodemask_t *nodemask) +{ + if (is_global_init(p)) + return true; + if (p->flags & PF_KTHREAD) + return true; + + /* When mem_cgroup_out_of_memory() and p is not member of the group */ + if (mem && !task_in_mem_cgroup(p, mem)) + return true; + + /* p may not have freeable memory in nodemask */ + if (!has_intersects_mems_allowed(p, nodemask)) + return true; + + return false; +} + +/** + * oom_badness - heuristic function to determine which candidate task to kill + * @p: task struct of which task we should calculate + * @totalpages: total present RAM allowed for page allocation + * + * The heuristic for determining which task to kill is made to be as simple and + * predictable as possible. The goal is to return the highest value for the + * task consuming the most memory to avoid subsequent oom failures. + */ +unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem, + const nodemask_t *nodemask, unsigned long totalpages) +{ + int points; + + if (oom_unkillable_task(p, mem, nodemask)) + return 0; + + p = find_lock_task_mm(p); + if (!p) + return 0; /* - * Processes which fork a lot of child processes are likely - * a good choice. We add half the vmsize of the children if they - * have an own mm. This prevents forking servers to flood the - * machine with an endless amount of children. In case a single - * child is eating the vast majority of memory, adding only half - * to the parents will make the child our kill candidate of choice. + * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN + * so the entire heuristic doesn't need to be executed for something + * that cannot be killed. */ - list_for_each_entry(child, &p->children, sibling) { - task_lock(child); - if (child->mm != mm && child->mm) - points += child->mm->total_vm/2 + 1; - task_unlock(child); + if (atomic_read(&p->mm->oom_disable_count)) { + task_unlock(p); + return 0; } /* - * CPU time is in tens of seconds and run time is in thousands - * of seconds. There is no particular reason for this other than - * that it turned out to work very well in practice. + * When the PF_OOM_ORIGIN bit is set, it indicates the task should have + * priority for oom killing. */ - cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) - >> (SHIFT_HZ + 3); - - if (uptime >= p->start_time.tv_sec) - run_time = (uptime - p->start_time.tv_sec) >> 10; - else - run_time = 0; - - if (cpu_time) - points /= int_sqrt(cpu_time); - if (run_time) - points /= int_sqrt(int_sqrt(run_time)); + if (p->flags & PF_OOM_ORIGIN) { + task_unlock(p); + return 1000; + } /* - * Niced processes are most likely less important, so double - * their badness points. + * The memory controller may have a limit of 0 bytes, so avoid a divide + * by zero, if necessary. */ - if (task_nice(p) > 0) - points *= 2; + if (!totalpages) + totalpages = 1; /* - * Superuser processes are usually more important, so we make it - * less likely that we kill those. + * The baseline for the badness score is the proportion of RAM that each + * task's rss and swap space use. */ - if (has_capability_noaudit(p, CAP_SYS_ADMIN) || - has_capability_noaudit(p, CAP_SYS_RESOURCE)) - points /= 4; + points = (get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS)) * 1000 / + totalpages; + task_unlock(p); /* - * We don't want to kill a process with direct hardware access. - * Not only could that mess up the hardware, but usually users - * tend to only have this flag set on applications they think - * of as important. + * Root processes get 3% bonus, just like the __vm_enough_memory() + * implementation used by LSMs. */ - if (has_capability_noaudit(p, CAP_SYS_RAWIO)) - points /= 4; + if (has_capability_noaudit(p, CAP_SYS_ADMIN)) + points -= 30; /* - * If p's nodes don't overlap ours, it may still help to kill p - * because p may have allocated or otherwise mapped memory on - * this node before. However it will be less likely. + * /proc/pid/oom_score_adj ranges from -1000 to +1000 such that it may + * either completely disable oom killing or always prefer a certain + * task. */ - if (!cpuset_mems_allowed_intersects(current, p)) - points /= 8; + points += p->signal->oom_score_adj; /* - * Adjust the score by oom_adj. + * Never return 0 for an eligible task that may be killed since it's + * possible that no single user task uses more than 0.1% of memory and + * no single admin tasks uses more than 3.0%. */ - if (oom_adj) { - if (oom_adj > 0) { - if (!points) - points = 1; - points <<= oom_adj; - } else - points >>= -(oom_adj); - } - -#ifdef DEBUG - printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", - p->pid, p->comm, points); -#endif - return points; + if (points <= 0) + return 1; + return (points < 1000) ? points : 1000; } /* * Determine the type of allocation constraint. */ -static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, - gfp_t gfp_mask) -{ #ifdef CONFIG_NUMA +static enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) +{ struct zone *zone; struct zoneref *z; enum zone_type high_zoneidx = gfp_zone(gfp_mask); - nodemask_t nodes = node_states[N_HIGH_MEMORY]; + bool cpuset_limited = false; + int nid; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - if (cpuset_zone_allowed_softwall(zone, gfp_mask)) - node_clear(zone_to_nid(zone), nodes); - else - return CONSTRAINT_CPUSET; + /* Default to all available memory */ + *totalpages = totalram_pages + total_swap_pages; - if (!nodes_empty(nodes)) + if (!zonelist) + return CONSTRAINT_NONE; + /* + * Reach here only when __GFP_NOFAIL is used. So, we should avoid + * to kill current.We have to random task kill in this case. + * Hopefully, CONSTRAINT_THISNODE...but no way to handle it, now. + */ + if (gfp_mask & __GFP_THISNODE) + return CONSTRAINT_NONE; + + /* + * This is not a __GFP_THISNODE allocation, so a truncated nodemask in + * the page allocator means a mempolicy is in effect. Cpuset policy + * is enforced in get_page_from_freelist(). + */ + if (nodemask && !nodes_subset(node_states[N_HIGH_MEMORY], *nodemask)) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, *nodemask) + *totalpages += node_spanned_pages(nid); return CONSTRAINT_MEMORY_POLICY; -#endif + } + /* Check this allocation failure is caused by cpuset's wall function */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + high_zoneidx, nodemask) + if (!cpuset_zone_allowed_softwall(zone, gfp_mask)) + cpuset_limited = true; + + if (cpuset_limited) { + *totalpages = total_swap_pages; + for_each_node_mask(nid, cpuset_current_mems_allowed) + *totalpages += node_spanned_pages(nid); + return CONSTRAINT_CPUSET; + } return CONSTRAINT_NONE; } +#else +static enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask, nodemask_t *nodemask, + unsigned long *totalpages) +{ + *totalpages = totalram_pages + total_swap_pages; + return CONSTRAINT_NONE; +} +#endif /* * Simple selection loop. We chose the process with the highest @@ -203,28 +288,18 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints, - struct mem_cgroup *mem) +static struct task_struct *select_bad_process(unsigned int *ppoints, + unsigned long totalpages, struct mem_cgroup *mem, + const nodemask_t *nodemask) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; - struct timespec uptime; *ppoints = 0; - do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) { - unsigned long points; + for_each_process(p) { + unsigned int points; - /* - * skip kernel threads and tasks which have already released - * their mm. - */ - if (!p->mm) - continue; - /* skip the init task */ - if (is_global_init(p)) - continue; - if (mem && !task_in_mem_cgroup(p, mem)) + if (oom_unkillable_task(p, mem, nodemask)) continue; /* @@ -249,187 +324,228 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, * the process of exiting and releasing its resources. * Otherwise we could get an easy OOM deadlock. */ - if (p->flags & PF_EXITING) { + if (thread_group_empty(p) && (p->flags & PF_EXITING) && p->mm) { if (p != current) return ERR_PTR(-1UL); chosen = p; - *ppoints = ULONG_MAX; + *ppoints = 1000; } - points = badness(p, uptime.tv_sec); + points = oom_badness(p, mem, nodemask, totalpages); if (points > *ppoints) { chosen = p; *ppoints = points; } - } while_each_thread(g, p); + } return chosen; } /** * dump_tasks - dump current memory state of all system tasks - * @mem: target memory controller + * @mem: current's memory controller, if constrained + * @nodemask: nodemask passed to page allocator for mempolicy ooms * - * Dumps the current memory state of all system tasks, excluding kernel threads. + * Dumps the current memory state of all eligible tasks. Tasks not in the same + * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes + * are not shown. * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj - * score, and name. - * - * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are - * shown. + * value, oom_score_adj value, and name. * * Call with tasklist_lock read-locked. */ -static void dump_tasks(const struct mem_cgroup *mem) +static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask) { - struct task_struct *g, *p; - - printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " - "name\n"); - do_each_thread(g, p) { - struct mm_struct *mm; + struct task_struct *p; + struct task_struct *task; - if (mem && !task_in_mem_cgroup(p, mem)) - continue; - if (!thread_group_leader(p)) + pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); + for_each_process(p) { + if (oom_unkillable_task(p, mem, nodemask)) continue; - task_lock(p); - mm = p->mm; - if (!mm) { + task = find_lock_task_mm(p); + if (!task) { /* - * total_vm and rss sizes do not exist for tasks with no - * mm so there's no need to report them; they can't be - * oom killed anyway. + * This is a kthread or all of p's threads have already + * detached their mm's. There's no need to report + * them; they can't be oom killed anyway. */ - task_unlock(p); continue; } - printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); - task_unlock(p); - } while_each_thread(g, p); -} -/* - * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO - * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO - * set. - */ -static void __oom_kill_task(struct task_struct *p, int verbose) -{ - if (is_global_init(p)) { - WARN_ON(1); - printk(KERN_WARNING "tried to kill init!\n"); - return; + pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", + task->pid, task_uid(task), task->tgid, + task->mm->total_vm, get_mm_rss(task->mm), + task_cpu(task), task->signal->oom_adj, + task->signal->oom_score_adj, task->comm); + task_unlock(task); } +} - if (!p->mm) - return; - - if (verbose) - printk(KERN_ERR "Killed process %d (%s)\n", - task_pid_nr(p), p->comm); - - /* - * We give our sacrificial lamb high priority and access to - * all the memory it needs. That way it should be able to - * exit() and clear out its resources quickly... - */ - p->rt.time_slice = HZ; - set_tsk_thread_flag(p, TIF_MEMDIE); - - force_sig(SIGKILL, p); +static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, + struct mem_cgroup *mem, const nodemask_t *nodemask) +{ + task_lock(current); + pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " + "oom_adj=%d, oom_score_adj=%d\n", + current->comm, gfp_mask, order, current->signal->oom_adj, + current->signal->oom_score_adj); + cpuset_print_task_mems_allowed(current); + task_unlock(current); + dump_stack(); + mem_cgroup_print_oom_info(mem, p); + show_mem(); + if (sysctl_oom_dump_tasks) + dump_tasks(mem, nodemask); } -static int oom_kill_task(struct task_struct *p) +#define K(x) ((x) << (PAGE_SHIFT-10)) +static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem) { + struct task_struct *q; struct mm_struct *mm; - struct task_struct *g, *q; - task_lock(p); - mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + p = find_lock_task_mm(p); + if (!p) return 1; - } + + /* mm cannot be safely dereferenced after task_unlock(p) */ + mm = p->mm; + + pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n", + task_pid_nr(p), p->comm, K(p->mm->total_vm), + K(get_mm_counter(p->mm, MM_ANONPAGES)), + K(get_mm_counter(p->mm, MM_FILEPAGES))); task_unlock(p); - __oom_kill_task(p, 1); /* - * kill all processes that share the ->mm (i.e. all threads), - * but are in a different thread group. Don't let them have access - * to memory reserves though, otherwise we might deplete all memory. + * Kill all processes sharing p->mm in other thread groups, if any. + * They don't get access to memory reserves or a higher scheduler + * priority, though, to avoid depletion of all memory or task + * starvation. This prevents mm->mmap_sem livelock when an oom killed + * task cannot exit because it requires the semaphore and its contended + * by another thread trying to allocate memory itself. That thread will + * now get access to memory reserves since it has a pending fatal + * signal. */ - do_each_thread(g, q) { - if (q->mm == mm && !same_thread_group(q, p)) + for_each_process(q) + if (q->mm == mm && !same_thread_group(q, p)) { + task_lock(q); /* Protect ->comm from prctl() */ + pr_err("Kill process %d (%s) sharing same memory\n", + task_pid_nr(q), q->comm); + task_unlock(q); force_sig(SIGKILL, q); - } while_each_thread(g, q); + } + + set_tsk_thread_flag(p, TIF_MEMDIE); + force_sig(SIGKILL, p); + + /* + * We give our sacrificial lamb high priority and access to + * all the memory it needs. That way it should be able to + * exit() and clear out its resources quickly... + */ + boost_dying_task_prio(p, mem); return 0; } +#undef K static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - unsigned long points, struct mem_cgroup *mem, + unsigned int points, unsigned long totalpages, + struct mem_cgroup *mem, nodemask_t *nodemask, const char *message) { - struct task_struct *c; - - if (printk_ratelimit()) { - task_lock(current); - printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); - cpuset_print_task_mems_allowed(current); - task_unlock(current); - dump_stack(); - mem_cgroup_print_oom_info(mem, current); - show_mem(); - if (sysctl_oom_dump_tasks) - dump_tasks(mem); - } + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t = p; + unsigned int victim_points = 0; + + if (printk_ratelimit()) + dump_header(p, gfp_mask, order, mem, nodemask); /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { - __oom_kill_task(p, 0); + if (p->flags & PF_EXITING) { + set_tsk_thread_flag(p, TIF_MEMDIE); + boost_dying_task_prio(p, mem); return 0; } - printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", - message, task_pid_nr(p), p->comm, points); + task_lock(p); + pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + task_unlock(p); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + do { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; - /* Try to kill a child first */ - list_for_each_entry(c, &p->children, sibling) { - if (c->mm == p->mm) - continue; - if (!oom_kill_task(c)) - return 0; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, mem, nodemask, + totalpages); + if (child_points > victim_points) { + victim = child; + victim_points = child_points; + } + } + } while_each_thread(p, t); + + return oom_kill_task(victim, mem); +} + +/* + * Determines whether the kernel must panic because of the panic_on_oom sysctl. + */ +static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, + int order, const nodemask_t *nodemask) +{ + if (likely(!sysctl_panic_on_oom)) + return; + if (sysctl_panic_on_oom != 2) { + /* + * panic_on_oom == 1 only affects CONSTRAINT_NONE, the kernel + * does not panic for cpuset, mempolicy, or memcg allocation + * failures. + */ + if (constraint != CONSTRAINT_NONE) + return; } - return oom_kill_task(p); + read_lock(&tasklist_lock); + dump_header(NULL, gfp_mask, order, NULL, nodemask); + read_unlock(&tasklist_lock); + panic("Out of memory: %s panic_on_oom is enabled\n", + sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } #ifdef CONFIG_CGROUP_MEM_RES_CTLR void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) { - unsigned long points = 0; + unsigned long limit; + unsigned int points = 0; struct task_struct *p; + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL); + limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT; read_lock(&tasklist_lock); retry: - p = select_bad_process(&points, mem); - if (PTR_ERR(p) == -1UL) + p = select_bad_process(&points, limit, mem, NULL); + if (!p || PTR_ERR(p) == -1UL) goto out; - if (!p) - p = current; - - if (oom_kill_process(p, gfp_mask, 0, points, mem, + if (oom_kill_process(p, gfp_mask, 0, points, limit, mem, NULL, "Memory cgroup out of memory")) goto retry; out: @@ -456,7 +572,7 @@ EXPORT_SYMBOL_GPL(unregister_oom_notifier); * if a parallel OOM killing is already taking place that includes a zone in * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. */ -int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) +int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) { struct zoneref *z; struct zone *zone; @@ -473,7 +589,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { /* * Lock each zone in the zonelist under zone_scan_lock so a - * parallel invocation of try_set_zone_oom() doesn't succeed + * parallel invocation of try_set_zonelist_oom() doesn't succeed * when it shouldn't. */ zone_set_flag(zone, ZONE_OOM_LOCKED); @@ -502,72 +618,40 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) } /* - * Must be called with tasklist_lock held for read. + * Try to acquire the oom killer lock for all system zones. Returns zero if a + * parallel oom killing is taking place, otherwise locks all zones and returns + * non-zero. */ -static void __out_of_memory(gfp_t gfp_mask, int order) +static int try_set_system_oom(void) { - struct task_struct *p; - unsigned long points; - - if (sysctl_oom_kill_allocating_task) - if (!oom_kill_process(current, gfp_mask, order, 0, NULL, - "Out of memory (oom_kill_allocating_task)")) - return; -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); - - if (PTR_ERR(p) == -1UL) - return; - - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - panic("Out of memory and no killable processes...\n"); - } + struct zone *zone; + int ret = 1; - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; + spin_lock(&zone_scan_lock); + for_each_populated_zone(zone) + if (zone_is_oom_locked(zone)) { + ret = 0; + goto out; + } + for_each_populated_zone(zone) + zone_set_flag(zone, ZONE_OOM_LOCKED); +out: + spin_unlock(&zone_scan_lock); + return ret; } /* - * pagefault handler calls into here because it is out of memory but - * doesn't know exactly how or why. + * Clears ZONE_OOM_LOCKED for all system zones so that failed allocation + * attempts or page faults may now recall the oom killer, if necessary. */ -void pagefault_out_of_memory(void) +static void clear_system_oom(void) { - unsigned long freed = 0; - - blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) - /* Got some memory back in the last second. */ - return; - - /* - * If this is from memcg, oom-killer is already invoked. - * and not worth to go system-wide-oom. - */ - if (mem_cgroup_oom_called(current)) - goto rest_and_return; - - if (sysctl_panic_on_oom) - panic("out of memory from page fault. panic_on_oom is selected.\n"); - - read_lock(&tasklist_lock); - __out_of_memory(0, 0); /* unknown gfp_mask and order */ - read_unlock(&tasklist_lock); + struct zone *zone; - /* - * Give "p" a good chance of killing itself before we - * retry to allocate memory. - */ -rest_and_return: - if (!test_thread_flag(TIF_MEMDIE)) - schedule_timeout_uninterruptible(1); + spin_lock(&zone_scan_lock); + for_each_populated_zone(zone) + zone_clear_flag(zone, ZONE_OOM_LOCKED); + spin_unlock(&zone_scan_lock); } /** @@ -575,53 +659,103 @@ rest_and_return: * @zonelist: zonelist pointer * @gfp_mask: memory allocation flags * @order: amount of memory being requested as a power of 2 + * @nodemask: nodemask passed to page allocator * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) * OR try to be smart about which process to kill. Note that we * don't have to be perfect here, we just have to be good. */ -void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) +void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, + int order, nodemask_t *nodemask) { + const nodemask_t *mpol_mask; + struct task_struct *p; + unsigned long totalpages; unsigned long freed = 0; - enum oom_constraint constraint; + unsigned int points; + enum oom_constraint constraint = CONSTRAINT_NONE; + int killed = 0; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ return; - if (sysctl_panic_on_oom == 2) - panic("out of memory. Compulsory panic_on_oom is selected.\n"); + /* + * If current has a pending SIGKILL, then automatically select it. The + * goal is to allow it to allocate so that it may quickly exit and free + * its memory. + */ + if (fatal_signal_pending(current)) { + set_thread_flag(TIF_MEMDIE); + boost_dying_task_prio(current, NULL); + return; + } /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - constraint = constrained_alloc(zonelist, gfp_mask); + constraint = constrained_alloc(zonelist, gfp_mask, nodemask, + &totalpages); + mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; + check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); + read_lock(&tasklist_lock); + if (sysctl_oom_kill_allocating_task && + !oom_unkillable_task(current, NULL, nodemask) && + current->mm && !atomic_read(¤t->mm->oom_disable_count)) { + /* + * oom_kill_process() needs tasklist_lock held. If it returns + * non-zero, current could not be killed so we must fallback to + * the tasklist scan. + */ + if (!oom_kill_process(current, gfp_mask, order, 0, totalpages, + NULL, nodemask, + "Out of memory (oom_kill_allocating_task)")) + goto out; + } + +retry: + p = select_bad_process(&points, totalpages, NULL, mpol_mask); + if (PTR_ERR(p) == -1UL) + goto out; - switch (constraint) { - case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, 0, NULL, - "No available memory (MPOL_BIND)"); - break; - - case CONSTRAINT_NONE: - if (sysctl_panic_on_oom) - panic("out of memory. panic_on_oom is selected\n"); - /* Fall-through */ - case CONSTRAINT_CPUSET: - __out_of_memory(gfp_mask, order); - break; + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + dump_header(NULL, gfp_mask, order, NULL, mpol_mask); + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); } + if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL, + nodemask, "Out of memory")) + goto retry; + killed = 1; +out: read_unlock(&tasklist_lock); /* * Give "p" a good chance of killing itself before we * retry to allocate memory unless "p" is current */ + if (killed && !test_thread_flag(TIF_MEMDIE)) + schedule_timeout_uninterruptible(1); +} + +/* + * The pagefault handler calls here because it is out of memory, so kill a + * memory-hogging task. If a populated zone has ZONE_OOM_LOCKED set, a parallel + * oom killing is already in progress so do nothing. If a task is found with + * TIF_MEMDIE set, it has been killed so do nothing and allow it to exit. + */ +void pagefault_out_of_memory(void) +{ + if (try_set_system_oom()) { + out_of_memory(NULL, 0, 0, NULL); + clear_system_oom(); + } if (!test_thread_flag(TIF_MEMDIE)) schedule_timeout_uninterruptible(1); } diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7b0dcea..b4edfe7 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -34,15 +34,7 @@ #include <linux/syscalls.h> #include <linux/buffer_head.h> #include <linux/pagevec.h> - -/* - * The maximum number of pages to writeout in a single bdflush/kupdate - * operation. We do this so we don't hold I_SYNC against an inode for - * enormous amounts of time, which would block a userspace task which has - * been forced to throttle against that inode. Also, the code reevaluates - * the dirty each time it has written this many pages. - */ -#define MAX_WRITEBACK_PAGES 1024 +#include <trace/events/writeback.h> /* * After a CPU has dirtied this many pages, balance_dirty_pages_ratelimited @@ -53,18 +45,21 @@ static long ratelimit_pages = 32; /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * It should be somewhat larger than dirtied pages to ensure that reasonably * large amounts of I/O are submitted. */ -static inline long sync_writeback_pages(void) +static inline long sync_writeback_pages(unsigned long dirtied) { - return ratelimit_pages + ratelimit_pages / 2; + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; } /* The following parameters are exported via /proc/sys/vm */ /* - * Start background writeback (via pdflush) at this percentage + * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; @@ -117,8 +112,6 @@ EXPORT_SYMBOL(laptop_mode); /* End of sysctl-exported parameters */ -static void background_writeout(unsigned long _min_pages); - /* * Scale the writeback cache size proportional to the relative writeout speeds. * @@ -166,37 +159,37 @@ static void update_completion_period(void) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { update_completion_period(); vm_dirty_bytes = 0; @@ -206,13 +199,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, int dirty_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { update_completion_period(); vm_dirty_ratio = 0; @@ -260,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi, } } -/* - * Clip the earned share of dirty pages to that which is actually available. - * This avoids exceeding the total dirty_limit when the floating averages - * fluctuate too quickly. - */ -static void clip_bdi_dirty_limit(struct backing_dev_info *bdi, - unsigned long dirty, unsigned long *pbdi_dirty) -{ - unsigned long avail_dirty; - - avail_dirty = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_WRITEBACK) + - global_page_state(NR_UNSTABLE_NFS) + - global_page_state(NR_WRITEBACK_TEMP); - - if (avail_dirty < dirty) - avail_dirty = dirty - avail_dirty; - else - avail_dirty = 0; - - avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) + - bdi_stat(bdi, BDI_WRITEBACK); - - *pbdi_dirty = min(*pbdi_dirty, avail_dirty); -} - static inline void task_dirties_fraction(struct task_struct *tsk, long *numerator, long *denominator) { @@ -294,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk, } /* - * scale the dirty limit + * task_dirty_limit - scale down dirty throttling threshold for one task * * task specific dirty limit: * * dirty -= (dirty/8) * p_{t} + * + * To protect light/slow dirtying tasks from heavier/fast ones, we start + * throttling individual tasks before reaching the bdi dirty limit. + * Relatively low thresholds will be allocated to heavy dirtiers. So when + * dirty pages grow large, heavy dirtiers will be throttled first, which will + * effectively curb the growth of dirty pages. Light dirtiers with high enough + * dirty threshold may never get throttled. */ -static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) +static unsigned long task_dirty_limit(struct task_struct *tsk, + unsigned long bdi_dirty) { long numerator, denominator; - unsigned long dirty = *pdirty; + unsigned long dirty = bdi_dirty; u64 inv = dirty >> 3; task_dirties_fraction(tsk, &numerator, &denominator); @@ -311,24 +286,20 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty) do_div(inv, denominator); dirty -= inv; - if (dirty < *pdirty/2) - dirty = *pdirty/2; - *pdirty = dirty; + return max(dirty, bdi_dirty/2); } /* * */ -static DEFINE_SPINLOCK(bdi_lock); static unsigned int bdi_min_ratio; int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { int ret = 0; - unsigned long flags; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { @@ -340,27 +311,26 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) ret = -EINVAL; } } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock_bh(&bdi_lock); return ret; } int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) { - unsigned long flags; int ret = 0; if (max_ratio > 100) return -EINVAL; - spin_lock_irqsave(&bdi_lock, flags); + spin_lock_bh(&bdi_lock); if (bdi->min_ratio > max_ratio) { ret = -EINVAL; } else { bdi->max_ratio = max_ratio; bdi->max_prop_frac = (PROP_FRAC_BASE * max_ratio) / 100; } - spin_unlock_irqrestore(&bdi_lock, flags); + spin_unlock_bh(&bdi_lock); return ret; } @@ -394,7 +364,8 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -418,7 +389,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -426,9 +397,16 @@ unsigned long determine_dirtyable_memory(void) return x + 1; /* Ensure that we never return 0 */ } -void -get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, - unsigned long *pbdi_dirty, struct backing_dev_info *bdi) +/* + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * + * Calculate the dirty thresholds based on sysctl parameters + * - vm.dirty_background_ratio or vm.dirty_background_bytes + * - vm.dirty_ratio or vm.dirty_bytes + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * runtime tasks. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; @@ -437,14 +415,8 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else { - int dirty_ratio; - - dirty_ratio = vm_dirty_ratio; - if (dirty_ratio < 5) - dirty_ratio = 5; - dirty = (dirty_ratio * available_memory) / 100; - } + else + dirty = (vm_dirty_ratio * available_memory) / 100; if (dirty_background_bytes) background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); @@ -460,37 +432,48 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, } *pbackground = background; *pdirty = dirty; +} + +/* + * bdi_dirty_limit - @bdi's share of dirty throttling threshold + * + * Allocate high/low dirty limits to fast/slow devices, in order to prevent + * - starving fast devices + * - piling up dirty pages (that will take long time to sync) on slow devices + * + * The bdi's share of dirty limit will be adapting to its throughput and + * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. + */ +unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty) +{ + u64 bdi_dirty; + long numerator, denominator; + + /* + * Calculate this BDI's share of the dirty ratio. + */ + bdi_writeout_fraction(bdi, &numerator, &denominator); - if (bdi) { - u64 bdi_dirty; - long numerator, denominator; + bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; + bdi_dirty *= numerator; + do_div(bdi_dirty, denominator); - /* - * Calculate this BDI's share of the dirty ratio. - */ - bdi_writeout_fraction(bdi, &numerator, &denominator); - - bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100; - bdi_dirty *= numerator; - do_div(bdi_dirty, denominator); - bdi_dirty += (dirty * bdi->min_ratio) / 100; - if (bdi_dirty > (dirty * bdi->max_ratio) / 100) - bdi_dirty = dirty * bdi->max_ratio / 100; - - *pbdi_dirty = bdi_dirty; - clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty); - task_dirty_limit(current, pbdi_dirty); - } + bdi_dirty += (dirty * bdi->min_ratio) / 100; + if (bdi_dirty > (dirty * bdi->max_ratio) / 100) + bdi_dirty = dirty * bdi->max_ratio / 100; + + return bdi_dirty; } /* * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; @@ -498,56 +481,35 @@ static void balance_dirty_pages(struct address_space *mapping) unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); - + unsigned long pause = 1; + bool dirty_exceeded = false; struct backing_dev_info *bdi = mapping->backing_dev_info; for (;;) { struct writeback_control wbc = { - .bdi = bdi, .sync_mode = WB_SYNC_NONE, .older_than_this = NULL, .nr_to_write = write_chunk, .range_cyclic = 1, }; - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - nr_reclaimable = global_page_state(NR_FILE_DIRTY) + global_page_state(NR_UNSTABLE_NFS); nr_writeback = global_page_state(NR_WRITEBACK); - bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); - bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); - - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) - break; + global_dirty_limits(&background_thresh, &dirty_thresh); /* * Throttle it only when the background writeback cannot * catch-up. This avoids (excessively) small writeouts * when the bdi limits are ramping up. */ - if (nr_reclaimable + nr_writeback < + if (nr_reclaimable + nr_writeback <= (background_thresh + dirty_thresh) / 2) break; - if (!bdi->dirty_exceeded) - bdi->dirty_exceeded = 1; - - /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - if (bdi_nr_reclaimable) { - writeback_inodes(&wbc); - pages_written += write_chunk - wbc.nr_to_write; - get_dirty_limits(&background_thresh, &dirty_thresh, - &bdi_thresh, bdi); - } + bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh); + bdi_thresh = task_dirty_limit(current, bdi_thresh); /* * In order to avoid the stacked BDI deadlock we need @@ -562,25 +524,62 @@ static void balance_dirty_pages(struct address_space *mapping) if (bdi_thresh < 2*bdi_stat_error(bdi)) { bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE); bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK); - } else if (bdi_nr_reclaimable) { + } else { bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE); bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK); } - if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh) + /* + * The bdi thresh is somehow "soft" limit derived from the + * global "hard" limit. The former helps to prevent heavy IO + * bdi or process from holding back light ones; The latter is + * the last resort safeguard. + */ + dirty_exceeded = + (bdi_nr_reclaimable + bdi_nr_writeback > bdi_thresh) + || (nr_reclaimable + nr_writeback > dirty_thresh); + + if (!dirty_exceeded) break; - if (pages_written >= write_chunk) - break; /* We've done our duty */ - congestion_wait(WRITE, HZ/10); + if (!bdi->dirty_exceeded) + bdi->dirty_exceeded = 1; + + /* Note: nr_reclaimable denotes nr_dirty + nr_unstable. + * Unstable writes are a feature of certain networked + * filesystems (i.e. NFS) in which data may have been + * written to the server's write cache, but has not yet + * been flushed to permanent storage. + * Only move pages to writeback if this bdi is over its + * threshold otherwise wait until the disk writes catch + * up. + */ + trace_wbc_balance_dirty_start(&wbc, bdi); + if (bdi_nr_reclaimable > bdi_thresh) { + writeback_inodes_wb(&bdi->wb, &wbc); + pages_written += write_chunk - wbc.nr_to_write; + trace_wbc_balance_dirty_written(&wbc, bdi); + if (pages_written >= write_chunk) + break; /* We've done our duty */ + } + trace_wbc_balance_dirty_wait(&wbc, bdi); + __set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(pause); + + /* + * Increase the delay for each loop, up to our previous + * default of taking a 100ms nap. + */ + pause <<= 1; + if (pause > HZ / 10) + pause = HZ / 10; } - if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh && - bdi->dirty_exceeded) + if (!dirty_exceeded && bdi->dirty_exceeded) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ + return; /* * In laptop mode, we wait until hitting the higher threshold before @@ -591,10 +590,8 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && (global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS) - > background_thresh))) - pdflush_operation(background_writeout, 0); + (!laptop_mode && (nr_reclaimable > background_thresh))) + bdi_start_background_writeback(bdi); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -607,6 +604,8 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) } } +static DEFINE_PER_CPU(unsigned long, bdp_ratelimits) = 0; + /** * balance_dirty_pages_ratelimited_nr - balance dirty memory state * @mapping: address_space which was dirtied @@ -624,7 +623,6 @@ void set_page_dirty_balance(struct page *page, int page_mkwrite) void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, unsigned long nr_pages_dirtied) { - static DEFINE_PER_CPU(unsigned long, ratelimits) = 0; unsigned long ratelimit; unsigned long *p; @@ -637,12 +635,13 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, * tasks in balance_dirty_pages(). Period. */ preempt_disable(); - p = &__get_cpu_var(ratelimits); + p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); @@ -655,7 +654,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) unsigned long dirty_thresh; for ( ; ; ) { - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); + global_dirty_limits(&background_thresh, &dirty_thresh); /* * Boost the allowable dirty threshold a bit for page @@ -666,7 +665,7 @@ void throttle_vm_writeout(gfp_t gfp_mask) if (global_page_state(NR_UNSTABLE_NFS) + global_page_state(NR_WRITEBACK) <= dirty_thresh) break; - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The caller might hold locks which can prevent IO completion @@ -679,152 +678,29 @@ void throttle_vm_writeout(gfp_t gfp_mask) } /* - * writeback at least _min_pages, and keep writing until the amount of dirty - * memory is less than the background threshold, or until we're all clean. - */ -static void background_writeout(unsigned long _min_pages) -{ - long min_pages = _min_pages; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = NULL, - .nr_to_write = 0, - .nonblocking = 1, - .range_cyclic = 1, - }; - - for ( ; ; ) { - unsigned long background_thresh; - unsigned long dirty_thresh; - - get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL); - if (global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) < background_thresh - && min_pages <= 0) - break; - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - wbc.pages_skipped = 0; - writeback_inodes(&wbc); - min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) { - /* Wrote less than expected */ - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; - } - } -} - -/* - * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back - * the whole world. Returns 0 if a pdflush thread was dispatched. Returns - * -1 if all pdflush threads were busy. - */ -int wakeup_pdflush(long nr_pages) -{ - if (nr_pages == 0) - nr_pages = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS); - return pdflush_operation(background_writeout, nr_pages); -} - -static void wb_timer_fn(unsigned long unused); -static void laptop_timer_fn(unsigned long unused); - -static DEFINE_TIMER(wb_timer, wb_timer_fn, 0, 0); -static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); - -/* - * Periodic writeback of "old" data. - * - * Define "old": the first time one of an inode's pages is dirtied, we mark the - * dirtying-time in the inode's address_space. So this periodic writeback code - * just walks the superblock inode list, writing back any inodes which are - * older than a specific point in time. - * - * Try to run once per dirty_writeback_interval. But if a writeback event - * takes longer than a dirty_writeback_interval interval, then leave a - * one-second gap. - * - * older_than_this takes precedence over nr_to_write. So we'll only write back - * all dirty pages if they are all attached to "old" mappings. - */ -static void wb_kupdate(unsigned long arg) -{ - unsigned long oldest_jif; - unsigned long start_jif; - unsigned long next_jif; - long nr_to_write; - struct writeback_control wbc = { - .bdi = NULL, - .sync_mode = WB_SYNC_NONE, - .older_than_this = &oldest_jif, - .nr_to_write = 0, - .nonblocking = 1, - .for_kupdate = 1, - .range_cyclic = 1, - }; - - sync_supers(); - - oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval * 10); - start_jif = jiffies; - next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10); - nr_to_write = global_page_state(NR_FILE_DIRTY) + - global_page_state(NR_UNSTABLE_NFS) + - (inodes_stat.nr_inodes - inodes_stat.nr_unused); - while (nr_to_write > 0) { - wbc.more_io = 0; - wbc.encountered_congestion = 0; - wbc.nr_to_write = MAX_WRITEBACK_PAGES; - writeback_inodes(&wbc); - if (wbc.nr_to_write > 0) { - if (wbc.encountered_congestion || wbc.more_io) - congestion_wait(WRITE, HZ/10); - else - break; /* All the old data is written */ - } - nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write; - } - if (time_before(next_jif, jiffies + HZ)) - next_jif = jiffies + HZ; - if (dirty_writeback_interval) - mod_timer(&wb_timer, next_jif); -} - -/* * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); - if (dirty_writeback_interval) - mod_timer(&wb_timer, jiffies + - msecs_to_jiffies(dirty_writeback_interval * 10)); - else - del_timer(&wb_timer); + proc_dointvec(table, write, buffer, length, ppos); + bdi_arm_supers_timer(); return 0; } -static void wb_timer_fn(unsigned long unused) -{ - if (pdflush_operation(wb_kupdate, 0) < 0) - mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */ -} - -static void laptop_flush(unsigned long unused) +#ifdef CONFIG_BLOCK +void laptop_mode_timer_fn(unsigned long data) { - sys_sync(); -} + struct request_queue *q = (struct request_queue *)data; + int nr_pages = global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS); -static void laptop_timer_fn(unsigned long unused) -{ - pdflush_operation(laptop_flush, 0); + /* + * We want to write everything out, not just down to the dirty + * threshold + */ + if (bdi_has_dirty_io(&q->backing_dev_info)) + bdi_start_writeback(&q->backing_dev_info, nr_pages); } /* @@ -832,9 +708,9 @@ static void laptop_timer_fn(unsigned long unused) * of all dirty data a few seconds from now. If the flush is already scheduled * then push it back - the user is still using the disk. */ -void laptop_io_completion(void) +void laptop_io_completion(struct backing_dev_info *info) { - mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode); + mod_timer(&info->laptop_mode_wb_timer, jiffies + laptop_mode); } /* @@ -844,8 +720,16 @@ void laptop_io_completion(void) */ void laptop_sync_completion(void) { - del_timer(&laptop_mode_wb_timer); + struct backing_dev_info *bdi; + + rcu_read_lock(); + + list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) + del_timer(&bdi->laptop_mode_wb_timer); + + rcu_read_unlock(); } +#endif /* * If ratelimit_pages is too high then we can get into dirty-data overload @@ -907,8 +791,6 @@ void __init page_writeback_init(void) { int shift; - mod_timer(&wb_timer, - jiffies + msecs_to_jiffies(dirty_writeback_interval * 10)); writeback_set_ratelimit(); register_cpu_notifier(&ratelimit_nb); @@ -918,6 +800,42 @@ void __init page_writeback_init(void) } /** + * tag_pages_for_writeback - tag pages to be written by write_cache_pages + * @mapping: address space structure to write + * @start: starting page index + * @end: ending page index (inclusive) + * + * This function scans the page range from @start to @end (inclusive) and tags + * all pages that have DIRTY tag set with a special TOWRITE tag. The idea is + * that write_cache_pages (or whoever calls this function) will then use + * TOWRITE tag to identify pages eligible for writeback. This mechanism is + * used to avoid livelocking of writeback by a process steadily creating new + * dirty pages in the file (thus it is important for this function to be quick + * so that it can tag pages faster than a dirtying process can create them). + */ +/* + * We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency. + */ +void tag_pages_for_writeback(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ +#define WRITEBACK_TAG_BATCH 4096 + unsigned long tagged; + + do { + spin_lock_irq(&mapping->tree_lock); + tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, + &start, end, WRITEBACK_TAG_BATCH, + PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); + cond_resched(); + /* We check 'start' to handle wrapping when end == ~0UL */ + } while (tagged >= WRITEBACK_TAG_BATCH && start); +} +EXPORT_SYMBOL(tag_pages_for_writeback); + +/** * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. * @mapping: address space structure to write * @wbc: subtract the number of written pages from *@wbc->nr_to_write @@ -931,12 +849,18 @@ void __init page_writeback_init(void) * the call was made get new I/O started against them. If wbc->sync_mode is * WB_SYNC_ALL then we were called for data integrity and we must wait for * existing IO to complete. + * + * To avoid livelocks (when other process dirties new pages), we first tag + * pages which should be written back with TOWRITE tag and only then start + * writing them. For data-integrity sync we have to be careful so that we do + * not miss some pages (e.g., because some other process has cleared TOWRITE + * tag we set). The rule we follow is that TOWRITE tag can be cleared only + * by the process clearing the DIRTY tag (and submitting the page for IO). */ int write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, writepage_t writepage, void *data) { - struct backing_dev_info *bdi = mapping->backing_dev_info; int ret = 0; int done = 0; struct pagevec pvec; @@ -947,12 +871,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t done_index; int cycled; int range_whole = 0; - long nr_to_write = wbc->nr_to_write; - - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; - return 0; - } + int tag; pagevec_init(&pvec, 0); if (wbc->range_cyclic) { @@ -970,13 +889,18 @@ int write_cache_pages(struct address_space *mapping, range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ } + if (wbc->sync_mode == WB_SYNC_ALL) + tag = PAGECACHE_TAG_TOWRITE; + else + tag = PAGECACHE_TAG_DIRTY; retry: + if (wbc->sync_mode == WB_SYNC_ALL) + tag_pages_for_writeback(mapping, index, end); done_index = index; while (!done && (index <= end)) { int i; - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, tag, min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; @@ -1034,6 +958,7 @@ continue_unlock: if (!clear_page_dirty_for_io(page)) goto continue_unlock; + trace_wbc_writepage(wbc, mapping->backing_dev_info); ret = (*writepage)(page, wbc, data); if (unlikely(ret)) { if (ret == AOP_WRITEPAGE_ACTIVATE) { @@ -1052,29 +977,16 @@ continue_unlock: done = 1; break; } - } - - if (nr_to_write > 0) { - nr_to_write--; - if (nr_to_write == 0 && - wbc->sync_mode == WB_SYNC_NONE) { - /* - * We stop writing back only if we are - * not doing integrity sync. In case of - * integrity sync we have to keep going - * because someone may be concurrently - * dirtying pages, and we might have - * synced a lot of newly appeared dirty - * pages, but have not synced all of the - * old dirty pages. - */ - done = 1; - break; - } } - if (wbc->nonblocking && bdi_write_congested(bdi)) { - wbc->encountered_congestion = 1; + /* + * We stop writing back only if we are not doing + * integrity sync. In case of integrity sync we have to + * keep going until we have written all the pages + * we tagged for writeback prior to entering this loop. + */ + if (--wbc->nr_to_write <= 0 && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1093,11 +1005,8 @@ continue_unlock: end = writeback_index - 1; goto retry; } - if (!wbc->no_nrwrite_index_update) { - if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) - mapping->writeback_index = done_index; - wbc->nr_to_write = nr_to_write; - } + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = done_index; return ret; } @@ -1142,12 +1051,10 @@ int do_writepages(struct address_space *mapping, struct writeback_control *wbc) if (wbc->nr_to_write <= 0) return 0; - wbc->for_writepages = 1; if (mapping->a_ops->writepages) ret = mapping->a_ops->writepages(mapping, wbc); else ret = generic_writepages(mapping, wbc); - wbc->for_writepages = 0; return ret; } @@ -1208,11 +1115,25 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) { if (mapping_cap_account_dirty(mapping)) { __inc_zone_page_state(page, NR_FILE_DIRTY); + __inc_zone_page_state(page, NR_DIRTIED); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } } +EXPORT_SYMBOL(account_page_dirtied); + +/* + * Helper function for set_page_writeback family. + * NOTE: Unlike account_page_dirtied this does not rely on being atomic + * wrt interrupts. + */ +void account_page_writeback(struct page *page) +{ + inc_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); +} +EXPORT_SYMBOL(account_page_writeback); /* * For address_spaces which do not use buffers. Just tag the page as dirty in @@ -1271,6 +1192,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) EXPORT_SYMBOL(redirty_page_for_writepage); /* + * Dirty a page. + * + * For pages with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ @@ -1437,12 +1365,15 @@ int test_set_page_writeback(struct page *page) radix_tree_tag_clear(&mapping->page_tree, page_index(page), PAGECACHE_TAG_DIRTY); + radix_tree_tag_clear(&mapping->page_tree, + page_index(page), + PAGECACHE_TAG_TOWRITE); spin_unlock_irqrestore(&mapping->tree_lock, flags); } else { ret = TestSetPageWriteback(page); } if (!ret) - inc_zone_page_state(page, NR_WRITEBACK); + account_page_writeback(page); return ret; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30d5093..ff7e158 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -21,6 +21,7 @@ #include <linux/pagemap.h> #include <linux/jiffies.h> #include <linux/bootmem.h> +#include <linux/memblock.h> #include <linux/compiler.h> #include <linux/kernel.h> #include <linux/kmemcheck.h> @@ -48,11 +49,31 @@ #include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <linux/kmemleak.h> +#include <linux/memory.h> +#include <linux/compaction.h> +#include <trace/events/kmem.h> +#include <linux/ftrace_event.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" +#ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID +DEFINE_PER_CPU(int, numa_node); +EXPORT_PER_CPU_SYMBOL(numa_node); +#endif + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * N.B., Do NOT reference the '_numa_mem_' per cpu variable directly. + * It will not be defined when CONFIG_HAVE_MEMORYLESS_NODES is not defined. + * Use the accessor functions set_numa_mem(), numa_mem_id() and cpu_to_mem() + * defined in <linux/topology.h>. + */ +DEFINE_PER_CPU(int, _numa_mem_); /* Kernel "local memory" node */ +EXPORT_PER_CPU_SYMBOL(_numa_mem_); +#endif + /* * Array of node states. */ @@ -71,10 +92,39 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; +#ifdef CONFIG_PM_SLEEP +/* + * The following functions are used by the suspend/hibernate code to temporarily + * change gfp_allowed_mask in order to avoid using I/O during memory allocations + * while devices are suspended. To avoid races with the suspend/hibernate code, + * they should always be called with pm_mutex held (gfp_allowed_mask also should + * only be modified with pm_mutex held, unless the suspend/hibernate code is + * guaranteed not to run in parallel with that modification). + */ + +static gfp_t saved_gfp_mask; + +void pm_restore_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&pm_mutex)); + if (saved_gfp_mask) { + gfp_allowed_mask = saved_gfp_mask; + saved_gfp_mask = 0; + } +} + +void pm_restrict_gfp_mask(void) +{ + WARN_ON(!mutex_is_locked(&pm_mutex)); + WARN_ON(saved_gfp_mask); + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~GFP_IOFS; +} +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; #endif @@ -123,8 +173,8 @@ static char * const zone_names[MAX_NR_ZONES] = { int min_free_kbytes = 1024; -unsigned long __meminitdata nr_kernel_pages; -unsigned long __meminitdata nr_all_pages; +static unsigned long __meminitdata nr_kernel_pages; +static unsigned long __meminitdata nr_all_pages; static unsigned long __meminitdata dma_reserve; #ifdef CONFIG_ARCH_POPULATES_NODE_MAP @@ -234,6 +284,12 @@ static void bad_page(struct page *page) static unsigned long nr_shown; static unsigned long nr_unshown; + /* Don't complain about poisoned pages */ + if (PageHWPoison(page)) { + __ClearPageBuddy(page); + return; + } + /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -256,10 +312,7 @@ static void bad_page(struct page *page) printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", current->comm, page_to_pfn(page)); - printk(KERN_ALERT - "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", - page, (void *)page->flags, page_count(page), - page_mapcount(page), page->mapping, page->index); + dump_page(page); dump_stack(); out: @@ -445,6 +498,8 @@ static inline void __free_one_page(struct page *page, int migratetype) { unsigned long page_idx; + unsigned long combined_idx; + struct page *buddy; if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) @@ -458,9 +513,6 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - unsigned long combined_idx; - struct page *buddy; - buddy = __page_find_buddy(page, page_idx, order); if (!page_is_buddy(page, buddy, order)) break; @@ -475,12 +527,32 @@ static inline void __free_one_page(struct page *page, order++; } set_page_order(page, order); - list_add(&page->lru, - &zone->free_area[order].free_list[migratetype]); + + /* + * If this is not the largest possible page, check if the buddy + * of the next-highest order is free. If it is, it's possible + * that pages are being freed that will coalesce soon. In case, + * that is happening, add the free page to the tail of the list + * so it's less likely to be used soon and more likely to be merged + * as a higher order page + */ + if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { + struct page *higher_page, *higher_buddy; + combined_idx = __find_combined_index(page_idx, order); + higher_page = page + combined_idx - page_idx; + higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + if (page_is_buddy(higher_page, higher_buddy, order + 1)) { + list_add_tail(&page->lru, + &zone->free_area[order].free_list[migratetype]); + goto out; + } + } + + list_add(&page->lru, &zone->free_area[order].free_list[migratetype]); +out: zone->free_area[order].nr_free++; } -#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT /* * free_page_mlock() -- clean up attempts to free and mlocked() page. * Page should not be on lru, so no need to fix that up. @@ -491,9 +563,6 @@ static inline void free_page_mlock(struct page *page) __dec_zone_page_state(page, NR_MLOCK); __count_vm_event(UNEVICTABLE_MLOCKFREED); } -#else -static void free_page_mlock(struct page *page) { } -#endif static inline int free_pages_check(struct page *page) { @@ -510,7 +579,7 @@ static inline int free_pages_check(struct page *page) } /* - * Frees a list of pages. + * Frees a number of pages from the PCP lists * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -520,23 +589,45 @@ static inline int free_pages_check(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static void free_pages_bulk(struct zone *zone, int count, - struct list_head *list, int order) +static void free_pcppages_bulk(struct zone *zone, int count, + struct per_cpu_pages *pcp) { + int migratetype = 0; + int batch_free = 0; + int to_free = count; + spin_lock(&zone->lock); - zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); - while (count--) { + while (to_free) { struct page *page; + struct list_head *list; - VM_BUG_ON(list_empty(list)); - page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_one_page list manipulates */ - list_del(&page->lru); - __free_one_page(page, zone, order, page_private(page)); + /* + * Remove pages from lists in a round-robin fashion. A + * batch_free count is maintained that is incremented when an + * empty list is encountered. This is so more pages are freed + * off fuller lists instead of spinning excessively around empty + * lists + */ + do { + batch_free++; + if (++migratetype == MIGRATE_PCPTYPES) + migratetype = 0; + list = &pcp->lists[migratetype]; + } while (list_empty(list)); + + do { + page = list_entry(list->prev, struct page, lru); + /* must delete as __free_one_page list manipulates */ + list_del(&page->lru); + /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ + __free_one_page(page, zone, 0, page_private(page)); + trace_mm_page_pcpu_drain(page, 0, page_private(page)); + } while (--to_free && --batch_free && !list_empty(list)); } + __mod_zone_page_state(zone, NR_FREE_PAGES, count); spin_unlock(&zone->lock); } @@ -544,27 +635,31 @@ static void free_one_page(struct zone *zone, struct page *page, int order, int migratetype) { spin_lock(&zone->lock); - zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); + zone->all_unreclaimable = 0; zone->pages_scanned = 0; - __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); spin_unlock(&zone->lock); } -static void __free_pages_ok(struct page *page, unsigned int order) +static bool free_pages_prepare(struct page *page, unsigned int order) { - unsigned long flags; int i; int bad = 0; - int wasMlocked = TestClearPageMlocked(page); + trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0 ; i < (1 << order) ; ++i) - bad += free_pages_check(page + i); + for (i = 0; i < (1 << order); i++) { + struct page *pg = page + i; + + if (PageAnon(pg)) + pg->mapping = NULL; + bad += free_pages_check(pg); + } if (bad) - return; + return false; if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order); @@ -574,6 +669,17 @@ static void __free_pages_ok(struct page *page, unsigned int order) arch_free_page(page, order); kernel_map_pages(page, 1 << order, 0); + return true; +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; + int wasMlocked = __TestClearPageMlocked(page); + + if (!free_pages_prepare(page, order)) + return; + local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); @@ -646,7 +752,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static inline int check_new_page(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -655,6 +761,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) bad_page(page); return 1; } + return 0; +} + +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + if (unlikely(check_new_page(p))) + return 1; + } set_page_private(page, 0); set_page_refcounted(page); @@ -783,6 +901,17 @@ static int move_freepages_block(struct zone *zone, struct page *page, return move_freepages(zone, start_page, end_page, migratetype); } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) @@ -817,13 +946,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * agressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE) { + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { unsigned long pages; pages = move_freepages_block(zone, page, start_migratetype); /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1))) + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_migratetype); @@ -834,11 +965,16 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - if (current_order == pageblock_order) - set_pageblock_migratetype(page, + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) + change_pageblock_range(page, current_order, start_migratetype); expand(zone, page, order, current_order, area, migratetype); + + trace_mm_page_alloc_extfrag(page, order, current_order, + start_migratetype, migratetype); + return page; } } @@ -872,6 +1008,7 @@ retry_reserve: } } + trace_mm_page_alloc_zone_locked(page, order, migratetype); return page; } @@ -882,7 +1019,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, int cold) { int i; @@ -901,7 +1038,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - list_add(&page->lru, list); + if (likely(cold == 0)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); set_page_private(page, migratetype); list = &page->lru; } @@ -929,7 +1069,7 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->batch; else to_drain = pcp->count; - free_pages_bulk(zone, to_drain, &pcp->list, 0); + free_pcppages_bulk(zone, to_drain, pcp); pcp->count -= to_drain; local_irq_restore(flags); } @@ -951,11 +1091,11 @@ static void drain_pages(unsigned int cpu) struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - pset = zone_pcp(zone, cpu); + local_irq_save(flags); + pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - local_irq_save(flags); - free_pages_bulk(zone, pcp->count, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->count, pcp); pcp->count = 0; local_irq_restore(flags); } @@ -1015,56 +1155,54 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page + * cold == 1 ? free a cold page : free a hot page */ -static void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; - int wasMlocked = TestClearPageMlocked(page); - - kmemcheck_free_shadow(page, 0); + int migratetype; + int wasMlocked = __TestClearPageMlocked(page); - if (PageAnon(page)) - page->mapping = NULL; - if (free_pages_check(page)) + if (!free_pages_prepare(page, 0)) return; - if (!PageHighMem(page)) { - debug_check_no_locks_freed(page_address(page), PAGE_SIZE); - debug_check_no_obj_freed(page_address(page), PAGE_SIZE); - } - arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - - pcp = &zone_pcp(zone, get_cpu())->pcp; - set_page_private(page, get_pageblock_migratetype(page)); + migratetype = get_pageblock_migratetype(page); + set_page_private(page, migratetype); local_irq_save(flags); if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_event(PGFREE); + /* + * We only track unmovable, reclaimable and movable on pcp lists. + * Free ISOLATE pages back to the allocator because they are being + * offlined but treat RESERVE as movable pages so we can get those + * areas back if necessary. Otherwise, we may have to free + * excessively into the page allocator + */ + if (migratetype >= MIGRATE_PCPTYPES) { + if (unlikely(migratetype == MIGRATE_ISOLATE)) { + free_one_page(zone, page, 0, migratetype); + goto out; + } + migratetype = MIGRATE_MOVABLE; + } + + pcp = &this_cpu_ptr(zone->pageset)->pcp; if (cold) - list_add_tail(&page->lru, &pcp->list); + list_add_tail(&page->lru, &pcp->lists[migratetype]); else - list_add(&page->lru, &pcp->list); + list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + free_pcppages_bulk(zone, pcp->batch, pcp); pcp->count -= pcp->batch; } - local_irq_restore(flags); - put_cpu(); -} -void free_hot_page(struct page *page) -{ - free_hot_cold_page(page, 0); -} - -void free_cold_page(struct page *page) -{ - free_hot_cold_page(page, 1); +out: + local_irq_restore(flags); } /* @@ -1096,6 +1234,51 @@ void split_page(struct page *page, unsigned int order) } /* + * Similar to split_page except the page is already free. As this is only + * being used for migration, the migratetype of the block also changes. + * As this is called with interrupts disabled, the caller is responsible + * for calling arch_alloc_page() and kernel_map_page() after interrupts + * are enabled. + * + * Note: this is probably too low level an operation for use in drivers. + * Please consult with lkml before using this in your driver. + */ +int split_free_page(struct page *page) +{ + unsigned int order; + unsigned long watermark; + struct zone *zone; + + BUG_ON(!PageBuddy(page)); + + zone = page_zone(page); + order = page_order(page); + + /* Obey watermarks as if the page was being allocated */ + watermark = low_wmark_pages(zone) + (1 << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return 0; + + /* Remove page from free list */ + list_del(&page->lru); + zone->free_area[order].nr_free--; + rmv_page_order(page); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1UL << order)); + + /* Split into individual pages */ + set_page_refcounted(page); + split_page(page, order); + + if (order >= pageblock_order - 1) { + struct page *endpage = page + (1 << order) - 1; + for (; page < endpage; page += pageblock_nr_pages) + set_pageblock_migratetype(page, MIGRATE_MOVABLE); + } + + return 1 << order; +} + +/* * Really, prep_compound_page() should be called from __rmqueue_bulk(). But * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. @@ -1108,39 +1291,27 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); - int cpu; again: - cpu = get_cpu(); if (likely(order == 0)) { struct per_cpu_pages *pcp; + struct list_head *list; - pcp = &zone_pcp(zone, cpu)->pcp; local_irq_save(flags); - if (!pcp->count) { - pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); - if (unlikely(!pcp->count)) + pcp = &this_cpu_ptr(zone->pageset)->pcp; + list = &pcp->lists[migratetype]; + if (list_empty(list)) { + pcp->count += rmqueue_bulk(zone, 0, + pcp->batch, list, + migratetype, cold); + if (unlikely(list_empty(list))) goto failed; } - /* Find a page of the appropriate migrate type */ - if (cold) { - list_for_each_entry_reverse(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } else { - list_for_each_entry(page, &pcp->list, lru) - if (page_private(page) == migratetype) - break; - } - - /* Allocate more to the pcp list if necessary */ - if (unlikely(&page->lru == &pcp->list)) { - pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); - page = list_entry(pcp->list.next, struct page, lru); - } + if (cold) + page = list_entry(list->prev, struct page, lru); + else + page = list_entry(list->next, struct page, lru); list_del(&page->lru); pcp->count--; @@ -1153,23 +1324,22 @@ again: * properly detect and handle allocation failures. * * We most definitely don't want callers attempting to - * allocate greater than single-page units with + * allocate greater than order-1 page units with * __GFP_NOFAIL. */ - WARN_ON_ONCE(order > 0); + WARN_ON_ONCE(order > 1); } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); - __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); } __count_zone_vm_events(PGALLOC, zone, 1 << order); zone_statistics(preferred_zone, zone); local_irq_restore(flags); - put_cpu(); VM_BUG_ON(bad_range(zone, page)); if (prep_new_page(page, order, gfp_flags)) @@ -1178,7 +1348,6 @@ again: failed: local_irq_restore(flags); - put_cpu(); return NULL; } @@ -1299,7 +1468,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1; + long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; if (alloc_flags & ALLOC_HIGH) @@ -1576,7 +1745,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct page *page; /* Acquire the OOM killer lock for the zones in zonelist */ - if (!try_set_zone_oom(zonelist, gfp_mask)) { + if (!try_set_zonelist_oom(zonelist, gfp_mask)) { schedule_timeout_uninterruptible(1); return NULL; } @@ -1593,18 +1762,87 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (page) goto out; - /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) - goto out; - + if (!(gfp_mask & __GFP_NOFAIL)) { + /* The OOM killer will not help higher order allocs */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto out; + /* The OOM killer does not needlessly kill tasks for lowmem */ + if (high_zoneidx < ZONE_NORMAL) + goto out; + /* + * GFP_THISNODE contains __GFP_NORETRY and we never hit this. + * Sanity check for bare calls of __GFP_THISNODE, not real OOM. + * The caller should handle page allocation failure by itself if + * it specifies __GFP_THISNODE. + * Note: Hugepage uses it but will hit PAGE_ALLOC_COSTLY_ORDER. + */ + if (gfp_mask & __GFP_THISNODE) + goto out; + } /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order); + out_of_memory(zonelist, gfp_mask, order, nodemask); out: clear_zonelist_oom(zonelist, gfp_mask); return page; } +#ifdef CONFIG_COMPACTION +/* Try memory compaction for high-order allocations before reclaim */ +static struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + struct page *page; + + if (!order || compaction_deferred(preferred_zone)) + return NULL; + + *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, + nodemask); + if (*did_some_progress != COMPACT_SKIPPED) { + + /* Page migration frees to the PCP lists but we want merging */ + drain_pages(get_cpu()); + put_cpu(); + + page = get_page_from_freelist(gfp_mask, nodemask, + order, zonelist, high_zoneidx, + alloc_flags, preferred_zone, + migratetype); + if (page) { + preferred_zone->compact_considered = 0; + preferred_zone->compact_defer_shift = 0; + count_vm_event(COMPACTSUCCESS); + return page; + } + + /* + * It's bad if compaction run occurs and fails. + * The most likely reason is that pages exist, + * but not enough to satisfy watermarks. + */ + count_vm_event(COMPACTFAIL); + defer_compaction(preferred_zone); + + cond_resched(); + } + + return NULL; +} +#else +static inline struct page * +__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, + struct zonelist *zonelist, enum zone_type high_zoneidx, + nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, + int migratetype, unsigned long *did_some_progress) +{ + return NULL; +} +#endif /* CONFIG_COMPACTION */ + /* The really slow allocator path where we enter direct reclaim */ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, @@ -1615,15 +1853,12 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, struct page *page = NULL; struct reclaim_state reclaim_state; struct task_struct *p = current; + bool drained = false; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - - /* - * The task's cpuset might have expanded its set of allowable nodes - */ p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; @@ -1637,14 +1872,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, cond_resched(); - if (order != 0) - drain_all_pages(); + if (unlikely(!(*did_some_progress))) + return NULL; - if (likely(*did_some_progress)) - page = get_page_from_freelist(gfp_mask, nodemask, order, +retry: + page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags, preferred_zone, migratetype); + + /* + * If an allocation failed after direct reclaim, it could be because + * pages are pinned on the per-cpu lists. Drain them and try again + */ + if (!page && !drained) { + drain_all_pages(); + drained = true; + goto retry; + } + return page; } @@ -1666,7 +1912,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1691,7 +1937,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) const gfp_t wait = gfp_mask & __GFP_WAIT; /* __GFP_HIGH is assumed to be the same as ALLOC_HIGH to save a branch. */ - BUILD_BUG_ON(__GFP_HIGH != ALLOC_HIGH); + BUILD_BUG_ON(__GFP_HIGH != (__force gfp_t) ALLOC_HIGH); /* * The caller may dip into page reserves a bit more if the caller @@ -1699,7 +1945,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags |= (gfp_mask & __GFP_HIGH); + alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { alloc_flags |= ALLOC_HARDER; @@ -1708,7 +1954,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(p))) + } else if (unlikely(rt_task(p)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { @@ -1740,8 +1986,10 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * be using allocators in order of preference for an area that is * too large. */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); return NULL; + } /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and @@ -1754,6 +2002,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; +restart: wake_all_kswapd(order, zonelist, high_zoneidx); /* @@ -1763,7 +2012,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ alloc_flags = gfp_to_alloc_flags(gfp_mask); -restart: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -1789,6 +2037,19 @@ rebalance: if (p->flags & PF_MEMALLOC) goto nopage; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; + + /* Try direct compaction */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -1813,15 +2074,23 @@ rebalance: if (page) goto got_pg; - /* - * The OOM killer does not trigger for high-order - * ~__GFP_NOFAIL allocations so if no progress is being - * made, there are no other options and retrying is - * unlikely to help. - */ - if (order > PAGE_ALLOC_COSTLY_ORDER && - !(gfp_mask & __GFP_NOFAIL)) - goto nopage; + if (!(gfp_mask & __GFP_NOFAIL)) { + /* + * The oom killer is not called for high-order + * allocations that may fail, so if no progress + * is being made, there are no other options and + * retrying is unlikely to help. + */ + if (order > PAGE_ALLOC_COSTLY_ORDER) + goto nopage; + /* + * The oom killer is not called for lowmem + * allocations to prevent needlessly killing + * innocent tasks. + */ + if (high_zoneidx < ZONE_NORMAL) + goto nopage; + } goto restart; } @@ -1831,7 +2100,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -1880,10 +2149,13 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; + get_mems_allowed(); /* The preferred zone is used for statistics later */ first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); - if (!preferred_zone) + if (!preferred_zone) { + put_mems_allowed(); return NULL; + } /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, @@ -1893,7 +2165,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + put_mems_allowed(); + trace_mm_page_alloc(page, order, gfp_mask, migratetype); return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -1903,46 +2177,42 @@ EXPORT_SYMBOL(__alloc_pages_nodemask); */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { - struct page * page; + struct page *page; + + /* + * __get_free_pages() returns a 32-bit address, which cannot represent + * a highmem page + */ + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + page = alloc_pages(gfp_mask, order); if (!page) return 0; return (unsigned long) page_address(page); } - EXPORT_SYMBOL(__get_free_pages); unsigned long get_zeroed_page(gfp_t gfp_mask) { - struct page * page; - - /* - * get_zeroed_page() returns a 32-bit address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask | __GFP_ZERO, 0); - if (page) - return (unsigned long) page_address(page); - return 0; + return __get_free_pages(gfp_mask | __GFP_ZERO, 0); } - EXPORT_SYMBOL(get_zeroed_page); void __pagevec_free(struct pagevec *pvec) { int i = pagevec_count(pvec); - while (--i >= 0) + while (--i >= 0) { + trace_mm_pagevec_free(pvec->pages[i], pvec->cold); free_hot_cold_page(pvec->pages[i], pvec->cold); + } } void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_page(page); + free_hot_cold_page(page, 0); else __free_pages_ok(page, order); } @@ -1983,7 +2253,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); - split_page(virt_to_page(addr), order); + split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; @@ -2107,7 +2377,7 @@ void show_free_areas(void) for_each_online_cpu(cpu) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, cpu); + pageset = per_cpu_ptr(zone->pageset, cpu); printk("CPU %4d: hi:%5d, btch:%4d usd:%4d\n", cpu, pageset->pcp.high, @@ -2115,23 +2385,27 @@ void show_free_areas(void) } } - printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" - " inactive_file:%lu" + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu" " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", + " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_ISOLATED_ANON), + global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_FILE), + global_page_state(NR_ISOLATED_FILE), global_page_state(NR_UNEVICTABLE), global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), global_page_state(NR_FREE_PAGES), - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_page_state(NR_SLAB_RECLAIMABLE), + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), + global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); @@ -2149,12 +2423,26 @@ void show_free_areas(void) " active_file:%lukB" " inactive_file:%lukB" " unevictable:%lukB" + " isolated(anon):%lukB" + " isolated(file):%lukB" " present:%lukB" + " mlocked:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " mapped:%lukB" + " shmem:%lukB" + " slab_reclaimable:%lukB" + " slab_unreclaimable:%lukB" + " kernel_stack:%lukB" + " pagetables:%lukB" + " unstable:%lukB" + " bounce:%lukB" + " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", zone->name, - K(zone_page_state(zone, NR_FREE_PAGES)), + K(zone_nr_free_pages(zone)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), @@ -2163,9 +2451,24 @@ void show_free_areas(void) K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), K(zone_page_state(zone, NR_UNEVICTABLE)), + K(zone_page_state(zone, NR_ISOLATED_ANON)), + K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_FILE_DIRTY)), + K(zone_page_state(zone, NR_WRITEBACK)), + K(zone_page_state(zone, NR_FILE_MAPPED)), + K(zone_page_state(zone, NR_SHMEM)), + K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), + K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), + zone_page_state(zone, NR_KERNEL_STACK) * + THREAD_SIZE / 1024, + K(zone_page_state(zone, NR_PAGETABLE)), + K(zone_page_state(zone, NR_UNSTABLE_NFS)), + K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, - (zone_is_all_unreclaimable(zone) ? "yes" : "no") + (zone->all_unreclaimable ? "yes" : "no") ); printk("lowmem_reserve[]:"); for (i = 0; i < MAX_NR_ZONES; i++) @@ -2292,18 +2595,19 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, + void __user *buffer, size_t *length, loff_t *ppos) { char saved_string[NUMA_ZONELIST_ORDER_LEN]; int ret; + static DEFINE_MUTEX(zl_order_mutex); + mutex_lock(&zl_order_mutex); if (write) - strncpy(saved_string, (char*)table->data, - NUMA_ZONELIST_ORDER_LEN); - ret = proc_dostring(table, write, file, buffer, length, ppos); + strcpy(saved_string, (char*)table->data); + ret = proc_dostring(table, write, buffer, length, ppos); if (ret) - return ret; + goto out; if (write) { int oldval = user_zonelist_order; if (__parse_numa_zonelist_order((char*)table->data)) { @@ -2313,10 +2617,15 @@ int numa_zonelist_order_handler(ctl_table *table, int write, strncpy((char*)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; - } else if (oldval != user_zonelist_order) - build_all_zonelists(); + } else if (oldval != user_zonelist_order) { + mutex_lock(&zonelists_mutex); + build_all_zonelists(NULL); + mutex_unlock(&zonelists_mutex); + } } - return 0; +out: + mutex_unlock(&zl_order_mutex); + return ret; } @@ -2456,10 +2765,10 @@ static int default_zonelist_order(void) struct zone *z; int average_size; /* - * ZONE_DMA and ZONE_DMA32 can be very small area in the sytem. + * ZONE_DMA and ZONE_DMA32 can be very small area in the system. * If they are really small and used heavily, the system can fall * into OOM very easily. - * This function detect ZONE_DMA/DMA32 size and confgigures zone order. + * This function detect ZONE_DMA/DMA32 size and configures zone order. */ /* Is there ZONE_NORMAL ? (ex. ppc has only DMA zone..) */ low_kmem_size = 0; @@ -2471,6 +2780,15 @@ static int default_zonelist_order(void) if (zone_type < ZONE_NORMAL) low_kmem_size += z->present_pages; total_size += z->present_pages; + } else if (zone_type == ZONE_NORMAL) { + /* + * If any node has only lowmem, then node order + * is preferred to allow kernel allocations + * locally; otherwise, they can easily infringe + * on other nodes when there is an abundance of + * lowmem available to allocate from. + */ + return ZONELIST_ORDER_NODE; } } } @@ -2533,7 +2851,6 @@ static void build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); - memset(node_load, 0, sizeof(node_load)); memset(node_order, 0, sizeof(node_order)); j = 0; @@ -2585,6 +2902,24 @@ static void build_zonelist_cache(pg_data_t *pgdat) zlc->z_to_n[z - zonelist->_zonerefs] = zonelist_node_idx(z); } +#ifdef CONFIG_HAVE_MEMORYLESS_NODES +/* + * Return node id of node used for "local" allocations. + * I.e., first node id of first zone in arg node's generic zonelist. + * Used for initializing percpu 'numa_mem', which is used primarily + * for kernel allocations, so use GFP_KERNEL flags to locate zonelist. + */ +int local_memory_node(int node) +{ + struct zone *zone; + + (void)first_zones_zonelist(node_zonelist(node, GFP_KERNEL), + gfp_zone(GFP_KERNEL), + NULL, + &zone); + return zone->node; +} +#endif #else /* CONFIG_NUMA */ @@ -2637,21 +2972,85 @@ static void build_zonelist_cache(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch); +static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset); +static void setup_zone_pageset(struct zone *zone); + +/* + * Global mutex to protect against size modification of zonelists + * as well as to serialize pageset setup for the new populated zone. + */ +DEFINE_MUTEX(zonelists_mutex); + /* return values int ....just for stop_machine() */ -static int __build_all_zonelists(void *dummy) +static __init_refok int __build_all_zonelists(void *data) { int nid; + int cpu; +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); build_zonelist_cache(pgdat); } + + /* + * Initialize the boot_pagesets that are going to be used + * for bootstrapping processors. The real pagesets for + * each zone will be allocated later when the per cpu + * allocator is available. + * + * boot_pagesets are used also for bootstrapping offline + * cpus if the system is already booted because the pagesets + * are needed to initialize allocators on a specific cpu too. + * F.e. the percpu allocator needs the page allocator which + * needs the percpu allocator in order to allocate its pagesets + * (a chicken-egg dilemma). + */ + for_each_possible_cpu(cpu) { + setup_pageset(&per_cpu(boot_pageset, cpu), 0); + +#ifdef CONFIG_HAVE_MEMORYLESS_NODES + /* + * We now know the "local memory node" for each node-- + * i.e., the node of the first zone in the generic zonelist. + * Set up numa_mem percpu variable for on-line cpus. During + * boot, only the boot cpu should be on-line; we'll init the + * secondary cpus' numa_mem as they come on-line. During + * node/memory hotplug, we'll fixup all on-line cpus. + */ + if (cpu_online(cpu)) + set_cpu_numa_mem(cpu, local_memory_node(cpu_to_node(cpu))); +#endif + } + return 0; } -void build_all_zonelists(void) +/* + * Called with zonelists_mutex held always + * unless system_state == SYSTEM_BOOTING. + */ +void build_all_zonelists(void *data) { set_zonelist_order(); @@ -2662,6 +3061,10 @@ void build_all_zonelists(void) } else { /* we have to stop all cpus to guarantee there is no user of zonelist */ +#ifdef CONFIG_MEMORY_HOTPLUG + if (data) + setup_zone_pageset((struct zone *)data); +#endif stop_machine(__build_all_zonelists, NULL, NULL); /* cpuset refresh routine should be here */ } @@ -2768,7 +3171,8 @@ static void setup_zone_migrate_reserve(struct zone *zone) { unsigned long start_pfn, pfn, end_pfn; struct page *page; - unsigned long reserve, block_migratetype; + unsigned long block_migratetype; + int reserve; /* Get the start pfn, end pfn and the number of blocks to reserve */ start_pfn = zone->zone_start_pfn; @@ -2776,6 +3180,15 @@ static void setup_zone_migrate_reserve(struct zone *zone) reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >> pageblock_order; + /* + * Reserve blocks are generally in place to help high-order atomic + * allocations that are short-lived. A min_free_kbytes value that + * would result in more than 2 reserve blocks for atomic allocations + * is assumed to be in place to help anti-fragmentation for the + * future allocation of hugepages at runtime. + */ + reserve = min(2, reserve); + for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) { if (!pfn_valid(pfn)) continue; @@ -2946,6 +3359,7 @@ static int zone_batchsize(struct zone *zone) static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) { struct per_cpu_pages *pcp; + int migratetype; memset(p, 0, sizeof(*p)); @@ -2953,7 +3367,8 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp->count = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); - INIT_LIST_HEAD(&pcp->list); + for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) + INIT_LIST_HEAD(&pcp->lists[migratetype]); } /* @@ -2973,121 +3388,36 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } - -#ifdef CONFIG_NUMA -/* - * Boot pageset table. One per cpu which is going to be used for all - * zones and all nodes. The parameters will be set in such a way - * that an item put on a list will immediately be handed over to - * the buddy list. This is safe since pageset manipulation is done - * with interrupts disabled. - * - * Some NUMA counter updates may also be caught by the boot pagesets. - * - * The boot_pagesets must be kept even after bootup is complete for - * unused processors and/or zones. They do play a role for bootstrapping - * hotplugged processors. - * - * zoneinfo_show() and maybe other functions do - * not check if the processor is online before following the pageset pointer. - * Other parts of the kernel may not check if the zone is available. - */ -static struct per_cpu_pageset boot_pageset[NR_CPUS]; - -/* - * Dynamically allocate memory for the - * per cpu pageset array in struct zone. - */ -static int __cpuinit process_zones(int cpu) +static __meminit void setup_zone_pageset(struct zone *zone) { - struct zone *zone, *dzone; - int node = cpu_to_node(cpu); + int cpu; - node_set_state(node, N_CPU); /* this node has a cpu */ + zone->pageset = alloc_percpu(struct per_cpu_pageset); - for_each_populated_zone(zone) { - zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), - GFP_KERNEL, node); - if (!zone_pcp(zone, cpu)) - goto bad; + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + setup_pageset(pcp, zone_batchsize(zone)); if (percpu_pagelist_fraction) - setup_pagelist_highmark(zone_pcp(zone, cpu), - (zone->present_pages / percpu_pagelist_fraction)); - } - - return 0; -bad: - for_each_zone(dzone) { - if (!populated_zone(dzone)) - continue; - if (dzone == zone) - break; - kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = NULL; + setup_pagelist_highmark(pcp, + (zone->present_pages / + percpu_pagelist_fraction)); } - return -ENOMEM; } -static inline void free_zone_pagesets(int cpu) -{ - struct zone *zone; - - for_each_zone(zone) { - struct per_cpu_pageset *pset = zone_pcp(zone, cpu); - - /* Free per_cpu_pageset if it is slab allocated */ - if (pset != &boot_pageset[cpu]) - kfree(pset); - zone_pcp(zone, cpu) = NULL; - } -} - -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) -{ - int cpu = (long)hcpu; - int ret = NOTIFY_OK; - - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - if (process_zones(cpu)) - ret = NOTIFY_BAD; - break; - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - free_zone_pagesets(cpu); - break; - default: - break; - } - return ret; -} - -static struct notifier_block __cpuinitdata pageset_notifier = - { &pageset_cpuup_callback, NULL, 0 }; - +/* + * Allocate per cpu pagesets and initialize them. + * Before this call only boot pagesets were available. + */ void __init setup_per_cpu_pageset(void) { - int err; + struct zone *zone; - /* Initialize per_cpu_pageset for cpu 0. - * A cpuup callback will do this for every cpu - * as it comes online - */ - err = process_zones(smp_processor_id()); - BUG_ON(err); - register_cpu_notifier(&pageset_notifier); + for_each_populated_zone(zone) + setup_zone_pageset(zone); } -#endif - static noinline __init_refok int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { @@ -3131,23 +3461,45 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) return 0; } -static __meminit void zone_pcp_init(struct zone *zone) +static int __zone_pcp_update(void *data) { + struct zone *zone = data; int cpu; - unsigned long batch = zone_batchsize(zone); + unsigned long batch = zone_batchsize(zone), flags; - for (cpu = 0; cpu < NR_CPUS; cpu++) { -#ifdef CONFIG_NUMA - /* Early boot. Slab allocator not functional yet */ - zone_pcp(zone, cpu) = &boot_pageset[cpu]; - setup_pageset(&boot_pageset[cpu],0); -#else - setup_pageset(zone_pcp(zone,cpu), batch); -#endif + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + pset = per_cpu_ptr(zone->pageset, cpu); + pcp = &pset->pcp; + + local_irq_save(flags); + free_pcppages_bulk(zone, pcp->count, pcp); + setup_pageset(pset, batch); + local_irq_restore(flags); } + return 0; +} + +void zone_pcp_update(struct zone *zone) +{ + stop_machine(__zone_pcp_update, zone, NULL); +} + +static __meminit void zone_pcp_init(struct zone *zone) +{ + /* + * per cpu subsystem is not up at this point. The following code + * relies on the ability of the linker to provide the + * offset of a (static) per cpu variable into the per cpu area. + */ + zone->pageset = &boot_pageset; + if (zone->present_pages) - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", + zone->name, zone->present_pages, + zone_batchsize(zone)); } __meminit int init_currently_empty_zone(struct zone *zone, @@ -3286,6 +3638,84 @@ void __init free_bootmem_with_active_regions(int nid, } } +#ifdef CONFIG_HAVE_MEMBLOCK +u64 __init find_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + int i; + + /* Need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + u64 addr; + u64 ei_start, ei_last; + u64 final_start, final_end; + + ei_last = early_node_map[i].end_pfn; + ei_last <<= PAGE_SHIFT; + ei_start = early_node_map[i].start_pfn; + ei_start <<= PAGE_SHIFT; + + final_start = max(ei_start, goal); + final_end = min(ei_last, limit); + + if (final_start >= final_end) + continue; + + addr = memblock_find_in_range(final_start, final_end, size, align); + + if (addr == MEMBLOCK_ERROR) + continue; + + return addr; + } + + return MEMBLOCK_ERROR; +} +#endif + +int __init add_from_early_node_map(struct range *range, int az, + int nr_range, int nid) +{ + int i; + u64 start, end; + + /* need to go over early_node_map to find out good range for node */ + for_each_active_range_index_in_nid(i, nid) { + start = early_node_map[i].start_pfn; + end = early_node_map[i].end_pfn; + nr_range = add_range(range, az, nr_range, start, end); + } + return nr_range; +} + +#ifdef CONFIG_NO_BOOTMEM +void * __init __alloc_memory_core_early(int nid, u64 size, u64 align, + u64 goal, u64 limit) +{ + void *ptr; + u64 addr; + + if (limit > memblock.current_limit) + limit = memblock.current_limit; + + addr = find_memory_core_early(nid, size, align, goal, limit); + + if (addr == MEMBLOCK_ERROR) + return NULL; + + ptr = phys_to_virt(addr); + memset(ptr, 0, size); + memblock_x86_reserve_range(addr, addr + size, "BOOTMEM"); + /* + * The min_count is set to 0 so that bootmem allocated blocks + * are never reported as leaks. + */ + kmemleak_alloc(ptr, size, 0, 0); + return ptr; +} +#endif + + void __init work_with_active_regions(int nid, work_fn_t work_fn, void *data) { int i; @@ -3435,7 +3865,7 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid, * Return the number of holes in a range on a node. If nid is MAX_NUMNODES, * then all holes in the requested range will be accounted for. */ -static unsigned long __meminit __absent_pages_in_range(int nid, +unsigned long __meminit __absent_pages_in_range(int nid, unsigned long range_start_pfn, unsigned long range_end_pfn) { @@ -3700,12 +4130,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_seqlock_init(zone); zone->zone_pgdat = pgdat; - zone->prev_priority = DEF_PRIORITY; - zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_saved_scan = 0; + zone->reclaim_stat.nr_saved_scan[l] = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; @@ -3850,7 +4278,7 @@ void __init add_active_range(unsigned int nid, unsigned long start_pfn, } /* Merge backward if suitable */ - if (start_pfn < early_node_map[i].end_pfn && + if (start_pfn < early_node_map[i].start_pfn && end_pfn >= early_node_map[i].start_pfn) { early_node_map[i].start_pfn = start_pfn; return; @@ -3964,7 +4392,7 @@ static int __init cmp_node_active_region(const void *a, const void *b) } /* sort the node_map by start_pfn */ -static void __init sort_node_map(void) +void __init sort_node_map(void) { sort(early_node_map, (size_t)nr_nodemap_entries, sizeof(struct node_active_region), @@ -4032,6 +4460,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); @@ -4059,7 +4489,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) - return; + goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ find_usable_zone_for_movable(); @@ -4158,6 +4588,10 @@ restart: for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_HIGH_MEMORY] = saved_node_state; } /* Any regular memory on that node ? */ @@ -4222,8 +4656,12 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) for (i = 0; i < MAX_NR_ZONES; i++) { if (i == ZONE_MOVABLE) continue; - printk(" %-8s %0#10lx -> %0#10lx\n", - zone_names[i], + printk(" %-8s ", zone_names[i]); + if (arch_zone_lowest_possible_pfn[i] == + arch_zone_highest_possible_pfn[i]) + printk("empty\n"); + else + printk("%0#10lx -> %0#10lx\n", arch_zone_lowest_possible_pfn[i], arch_zone_highest_possible_pfn[i]); } @@ -4242,11 +4680,6 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn) early_node_map[i].start_pfn, early_node_map[i].end_pfn); - /* - * find_zone_movable_pfns_for_nodes/early_calculate_totalpages init - * that node_mask, clear it at first - */ - nodes_clear(node_states[N_HIGH_MEMORY]); /* Initialise every node */ mminit_verify_pageflags_layout(); setup_nr_node_ids(); @@ -4317,7 +4750,11 @@ void __init set_dma_reserve(unsigned long new_dma_reserve) } #ifndef CONFIG_NEED_MULTIPLE_NODES -struct pglist_data __refdata contig_page_data = { .bdata = &bootmem_node_data[0] }; +struct pglist_data __refdata contig_page_data = { +#ifndef CONFIG_NO_BOOTMEM + .bdata = &bootmem_node_data[0] +#endif + }; EXPORT_SYMBOL(contig_page_data); #endif @@ -4493,7 +4930,7 @@ void setup_per_zone_wmarks(void) calculate_totalreserve_pages(); } -/** +/* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4584,9 +5021,9 @@ module_init(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (write) setup_per_zone_wmarks(); return 0; @@ -4594,12 +5031,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4610,12 +5047,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, } int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4636,9 +5073,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, file, buffer, length, ppos); + proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); return 0; } @@ -4650,20 +5087,21 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; unsigned int cpu; int ret; - ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; - for_each_zone(zone) { - for_each_online_cpu(cpu) { + for_each_populated_zone(zone) { + for_each_possible_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; - setup_pagelist_highmark(zone_pcp(zone, cpu), high); + setup_pagelist_highmark( + per_cpu_ptr(zone->pageset, cpu), high); } } return 0; @@ -4716,7 +5154,14 @@ void *__init alloc_large_system_hash(const char *tablename, numentries <<= (PAGE_SHIFT - scale); /* Make sure we've got at least a 0-order allocation.. */ - if (unlikely((numentries * bucketsize) < PAGE_SIZE)) + if (unlikely(flags & HASH_SMALL)) { + /* Makes no sense without HASH_EARLY */ + WARN_ON(!(flags & HASH_EARLY)); + if (!(numentries >> *_hash_shift)) { + numentries = 1UL << *_hash_shift; + BUG_ON(!numentries); + } + } else if (unlikely((numentries * bucketsize) < PAGE_SIZE)) numentries = PAGE_SIZE / bucketsize; } numentries = roundup_pow_of_two(numentries); @@ -4744,17 +5189,19 @@ void *__init alloc_large_system_hash(const char *tablename, * some pages at the end of hash table which * alloc_pages_exact() automatically does */ - if (get_order(size) < MAX_ORDER) + if (get_order(size) < MAX_ORDER) { table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); + } } } while (!table && size > PAGE_SIZE && --log2qty); if (!table) panic("Failed to allocate %s hash table\n", tablename); - printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n", + printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n", tablename, - (1U << log2qty), + (1UL << log2qty), ilog2(size) - PAGE_SHIFT, size); @@ -4763,16 +5210,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; - /* - * If hashdist is set, the table allocation is done with __vmalloc() - * which invokes the kmemleak_alloc() callback. This function may also - * be called before the slab and kmemleak are initialised when - * kmemleak simply buffers the request to be executed later - * (GFP_ATOMIC flag ignored in this case). - */ - if (!hashdist) - kmemleak_alloc(table, size, 1, GFP_ATOMIC); - return table; } @@ -4861,23 +5298,113 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * page allocater never alloc memory from ISOLATE block. */ +static int +__count_immobile_pages(struct zone *zone, struct page *page, int count) +{ + unsigned long pfn, iter, found; + /* + * For avoiding noise data, lru_add_drain_all() should be called + * If ZONE_MOVABLE, the zone never contains immobile pages + */ + if (zone_idx(zone) == ZONE_MOVABLE) + return true; + + if (get_pageblock_migratetype(page) == MIGRATE_MOVABLE) + return true; + + pfn = page_to_pfn(page); + for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { + unsigned long check = pfn + iter; + + if (!pfn_valid_within(check)) { + iter++; + continue; + } + page = pfn_to_page(check); + if (!page_count(page)) { + if (PageBuddy(page)) + iter += (1 << page_order(page)) - 1; + continue; + } + if (!PageLRU(page)) + found++; + /* + * If there are RECLAIMABLE pages, we need to check it. + * But now, memory offline itself doesn't call shrink_slab() + * and it still to be fixed. + */ + /* + * If the page is not RAM, page_count()should be 0. + * we don't need more check. This is an _used_ not-movable page. + * + * The problematic thing here is PG_reserved pages. PG_reserved + * is set to both of a memory hole page and a _used_ kernel + * page at boot. + */ + if (found > count) + return false; + } + return true; +} + +bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone = page_zone(page); + return __count_immobile_pages(zone, page, 0); +} + int set_migratetype_isolate(struct page *page) { struct zone *zone; - unsigned long flags; + unsigned long flags, pfn; + struct memory_isolate_notify arg; + int notifier_ret; int ret = -EBUSY; + int zone_idx; zone = page_zone(page); + zone_idx = zone_idx(zone); + spin_lock_irqsave(&zone->lock, flags); + + pfn = page_to_pfn(page); + arg.start_pfn = pfn; + arg.nr_pages = pageblock_nr_pages; + arg.pages_found = 0; + /* - * In future, more migrate types will be able to be isolation target. + * It may be possible to isolate a pageblock even if the + * migratetype is not MIGRATE_MOVABLE. The memory isolation + * notifier chain is used by balloon drivers to return the + * number of pages in a range that are held by the balloon + * driver to shrink memory. If all the pages are accounted for + * by balloons, are free, or on the LRU, isolation can continue. + * Later, for example, when memory hotplug notifier runs, these + * pages reported as "can be isolated" should be isolated(freed) + * by the balloon driver through the memory notifier chain. */ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); + notifier_ret = notifier_to_errno(notifier_ret); + if (notifier_ret) goto out; - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - move_freepages_block(zone, page, MIGRATE_ISOLATE); - ret = 0; + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (__count_immobile_pages(zone, page, arg.pages_found)) + ret = 0; + + /* + * immobile means "not-on-lru" paes. If immobile is larger than + * removable-by-driver pages reported by notifier, we'll fail. + */ + out: + if (!ret) { + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + move_freepages_block(zone, page, MIGRATE_ISOLATE); + } + spin_unlock_irqrestore(&zone->lock, flags); if (!ret) drain_all_pages(); @@ -4944,3 +5471,101 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) spin_unlock_irqrestore(&zone->lock, flags); } #endif + +#ifdef CONFIG_MEMORY_FAILURE +bool is_free_buddy_page(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int order; + + spin_lock_irqsave(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) { + struct page *page_head = page - (pfn & ((1 << order) - 1)); + + if (PageBuddy(page_head) && page_order(page_head) >= order) + break; + } + spin_unlock_irqrestore(&zone->lock, flags); + + return order < MAX_ORDER; +} +#endif + +static struct trace_print_flags pageflag_names[] = { + {1UL << PG_locked, "locked" }, + {1UL << PG_error, "error" }, + {1UL << PG_referenced, "referenced" }, + {1UL << PG_uptodate, "uptodate" }, + {1UL << PG_dirty, "dirty" }, + {1UL << PG_lru, "lru" }, + {1UL << PG_active, "active" }, + {1UL << PG_slab, "slab" }, + {1UL << PG_owner_priv_1, "owner_priv_1" }, + {1UL << PG_arch_1, "arch_1" }, + {1UL << PG_reserved, "reserved" }, + {1UL << PG_private, "private" }, + {1UL << PG_private_2, "private_2" }, + {1UL << PG_writeback, "writeback" }, +#ifdef CONFIG_PAGEFLAGS_EXTENDED + {1UL << PG_head, "head" }, + {1UL << PG_tail, "tail" }, +#else + {1UL << PG_compound, "compound" }, +#endif + {1UL << PG_swapcache, "swapcache" }, + {1UL << PG_mappedtodisk, "mappedtodisk" }, + {1UL << PG_reclaim, "reclaim" }, + {1UL << PG_buddy, "buddy" }, + {1UL << PG_swapbacked, "swapbacked" }, + {1UL << PG_unevictable, "unevictable" }, +#ifdef CONFIG_MMU + {1UL << PG_mlocked, "mlocked" }, +#endif +#ifdef CONFIG_ARCH_USES_PG_UNCACHED + {1UL << PG_uncached, "uncached" }, +#endif +#ifdef CONFIG_MEMORY_FAILURE + {1UL << PG_hwpoison, "hwpoison" }, +#endif + {-1UL, NULL }, +}; + +static void dump_page_flags(unsigned long flags) +{ + const char *delim = ""; + unsigned long mask; + int i; + + printk(KERN_ALERT "page flags: %#lx(", flags); + + /* remove zone id */ + flags &= (1UL << NR_PAGEFLAGS) - 1; + + for (i = 0; pageflag_names[i].name && flags; i++) { + + mask = pageflag_names[i].mask; + if ((flags & mask) != mask) + continue; + + flags &= ~mask; + printk("%s%s", delim, pageflag_names[i].name); + delim = "|"; + } + + /* check for left over flags */ + if (flags) + printk("%s%#lx", delim, flags); + + printk(")\n"); +} + +void dump_page(struct page *page) +{ + printk(KERN_ALERT + "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", + page, page_count(page), page_mapcount(page), + page->mapping, page->index); + dump_page_flags(page->flags); +} diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index f22b4eb..5bffada 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -9,6 +9,7 @@ #include <linux/vmalloc.h> #include <linux/cgroup.h> #include <linux/swapops.h> +#include <linux/kmemleak.h> static void __meminit __init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) @@ -116,10 +117,22 @@ static int __init_refok init_section_page_cgroup(unsigned long pfn) nid = page_to_nid(pfn_to_page(pfn)); table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; VM_BUG_ON(!slab_is_available()); - base = kmalloc_node(table_size, + if (node_state(nid, N_HIGH_MEMORY)) { + base = kmalloc_node(table_size, GFP_KERNEL | __GFP_NOWARN, nid); - if (!base) - base = vmalloc_node(table_size, nid); + if (!base) + base = vmalloc_node(table_size, nid); + } else { + base = kmalloc(table_size, GFP_KERNEL | __GFP_NOWARN); + if (!base) + base = vmalloc(table_size); + } + /* + * The value stored in section->page_cgroup is (base - pfn) + * and it does not point to the memory block allocated above, + * causing kmemleak false positives. + */ + kmemleak_not_leak(base); } else { /* * We don't have to allocate page_cgroup again, but @@ -278,6 +291,7 @@ static DEFINE_MUTEX(swap_cgroup_mutex); struct swap_cgroup_ctrl { struct page **map; unsigned long length; + spinlock_t lock; }; struct swap_cgroup_ctrl swap_cgroup_ctrl[MAX_SWAPFILES]; @@ -329,6 +343,43 @@ not_enough_page: } /** + * swap_cgroup_cmpxchg - cmpxchg mem_cgroup's id for this swp_entry. + * @end: swap entry to be cmpxchged + * @old: old id + * @new: new id + * + * Returns old id at success, 0 at failure. + * (There is no mem_cgroup useing 0 as its id) + */ +unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, + unsigned short old, unsigned short new) +{ + int type = swp_type(ent); + unsigned long offset = swp_offset(ent); + unsigned long idx = offset / SC_PER_PAGE; + unsigned long pos = offset & SC_POS_MASK; + struct swap_cgroup_ctrl *ctrl; + struct page *mappage; + struct swap_cgroup *sc; + unsigned long flags; + unsigned short retval; + + ctrl = &swap_cgroup_ctrl[type]; + + mappage = ctrl->map[idx]; + sc = page_address(mappage); + sc += pos; + spin_lock_irqsave(&ctrl->lock, flags); + retval = sc->id; + if (retval == old) + sc->id = new; + else + retval = 0; + spin_unlock_irqrestore(&ctrl->lock, flags); + return retval; +} + +/** * swap_cgroup_record - record mem_cgroup for this swp_entry. * @ent: swap entry to be recorded into * @mem: mem_cgroup to be recorded @@ -346,14 +397,17 @@ unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id) struct page *mappage; struct swap_cgroup *sc; unsigned short old; + unsigned long flags; ctrl = &swap_cgroup_ctrl[type]; mappage = ctrl->map[idx]; sc = page_address(mappage); sc += pos; + spin_lock_irqsave(&ctrl->lock, flags); old = sc->id; sc->id = id; + spin_unlock_irqrestore(&ctrl->lock, flags); return old; } @@ -405,6 +459,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) mutex_lock(&swap_cgroup_mutex); ctrl->length = length; ctrl->map = array; + spin_lock_init(&ctrl->lock); if (swap_cgroup_prepare(type)) { /* memory shortage */ ctrl->map = NULL; diff --git a/mm/page_io.c b/mm/page_io.c index c6f3e50..2dee975 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/kernel_stat.h> +#include <linux/gfp.h> #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/bio.h> @@ -19,20 +20,15 @@ #include <linux/writeback.h> #include <asm/pgtable.h> -static struct bio *get_swap_bio(gfp_t gfp_flags, pgoff_t index, +static struct bio *get_swap_bio(gfp_t gfp_flags, struct page *page, bio_end_io_t end_io) { struct bio *bio; bio = bio_alloc(gfp_flags, 1); if (bio) { - struct swap_info_struct *sis; - swp_entry_t entry = { .val = index, }; - - sis = get_swap_info_struct(swp_type(entry)); - bio->bi_sector = map_swap_page(sis, swp_offset(entry)) * - (PAGE_SIZE >> 9); - bio->bi_bdev = sis->bdev; + bio->bi_sector = map_swap_page(page, &bio->bi_bdev); + bio->bi_sector <<= PAGE_SHIFT - 9; bio->bi_io_vec[0].bv_page = page; bio->bi_io_vec[0].bv_len = PAGE_SIZE; bio->bi_io_vec[0].bv_offset = 0; @@ -102,8 +98,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - bio = get_swap_bio(GFP_NOIO, page_private(page), page, - end_swap_bio_write); + bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); unlock_page(page); @@ -111,7 +106,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) goto out; } if (wbc->sync_mode == WB_SYNC_ALL) - rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); + rw |= REQ_SYNC | REQ_UNPLUG; count_vm_event(PSWPOUT); set_page_writeback(page); unlock_page(page); @@ -127,8 +122,7 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); - bio = get_swap_bio(GFP_KERNEL, page_private(page), page, - end_swap_bio_read); + bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); ret = -ENOMEM; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5e0ffd9..4ae42bb 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -86,7 +86,7 @@ undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn) * all pages in [start_pfn...end_pfn) must be in the same zone. * zone->lock must be held before call this. * - * Returns 0 if all pages in the range is isolated. + * Returns 1 if all pages in the range is isolated. */ static int __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) @@ -119,7 +119,6 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) struct zone *zone; int ret; - pfn = start_pfn; /* * Note: pageblock_nr_page != MAX_ORDER. Then, chunks of free page * is not aligned to pageblock_nr_pages. diff --git a/mm/pagewalk.c b/mm/pagewalk.c index d5878be..38cc58b 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -1,6 +1,7 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/sched.h> +#include <linux/hugetlb.h> static int walk_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -79,6 +80,37 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end, return err; } +#ifdef CONFIG_HUGETLB_PAGE +static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr, + unsigned long end) +{ + unsigned long boundary = (addr & huge_page_mask(h)) + huge_page_size(h); + return boundary < end ? boundary : end; +} + +static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(vma); + unsigned long next; + unsigned long hmask = huge_page_mask(h); + pte_t *pte; + int err = 0; + + do { + next = hugetlb_entry_end(h, addr, end); + pte = huge_pte_offset(walk->mm, addr & hmask); + if (pte && walk->hugetlb_entry) + err = walk->hugetlb_entry(pte, hmask, addr, next, walk); + if (err) + return err; + } while (addr = next, addr != end); + + return 0; +} +#endif + /** * walk_page_range - walk a memory map's page tables with a callback * @mm: memory map to walk @@ -116,12 +148,37 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { + struct vm_area_struct *uninitialized_var(vma); + next = pgd_addr_end(addr, end); + +#ifdef CONFIG_HUGETLB_PAGE + /* + * handle hugetlb vma individually because pagetable walk for + * the hugetlb page is dependent on the architecture and + * we can't handled it in the same manner as non-huge pages. + */ + vma = find_vma(walk->mm, addr); + if (vma && is_vm_hugetlb_page(vma)) { + if (vma->vm_end < next) + next = vma->vm_end; + /* + * Hugepage is very tightly coupled with vma, so + * walk through hugetlb entries within a given vma. + */ + err = walk_hugetlb_range(vma, addr, next, walk); + if (err) + break; + pgd = pgd_offset(walk->mm, next); + continue; + } +#endif if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); if (err) break; + pgd++; continue; } if (walk->pgd_entry) @@ -131,7 +188,8 @@ int walk_page_range(unsigned long addr, unsigned long end, err = walk_pud_range(pgd, addr, next, walk); if (err) break; - } while (pgd++, addr = next, addr != end); + pgd++; + } while (addr = next, addr != end); return err; } diff --git a/mm/pdflush.c b/mm/pdflush.c deleted file mode 100644 index 235ac44..0000000 --- a/mm/pdflush.c +++ /dev/null @@ -1,269 +0,0 @@ -/* - * mm/pdflush.c - worker threads for writing back filesystem data - * - * Copyright (C) 2002, Linus Torvalds. - * - * 09Apr2002 Andrew Morton - * Initial version - * 29Feb2004 kaos@sgi.com - * Move worker thread creation to kthread to avoid chewing - * up stack space with nested calls to kernel_thread. - */ - -#include <linux/sched.h> -#include <linux/list.h> -#include <linux/signal.h> -#include <linux/spinlock.h> -#include <linux/gfp.h> -#include <linux/init.h> -#include <linux/module.h> -#include <linux/fs.h> /* Needed by writeback.h */ -#include <linux/writeback.h> /* Prototypes pdflush_operation() */ -#include <linux/kthread.h> -#include <linux/cpuset.h> -#include <linux/freezer.h> - - -/* - * Minimum and maximum number of pdflush instances - */ -#define MIN_PDFLUSH_THREADS 2 -#define MAX_PDFLUSH_THREADS 8 - -static void start_one_pdflush_thread(void); - - -/* - * The pdflush threads are worker threads for writing back dirty data. - * Ideally, we'd like one thread per active disk spindle. But the disk - * topology is very hard to divine at this level. Instead, we take - * care in various places to prevent more than one pdflush thread from - * performing writeback against a single filesystem. pdflush threads - * have the PF_FLUSHER flag set in current->flags to aid in this. - */ - -/* - * All the pdflush threads. Protected by pdflush_lock - */ -static LIST_HEAD(pdflush_list); -static DEFINE_SPINLOCK(pdflush_lock); - -/* - * The count of currently-running pdflush threads. Protected - * by pdflush_lock. - * - * Readable by sysctl, but not writable. Published to userspace at - * /proc/sys/vm/nr_pdflush_threads. - */ -int nr_pdflush_threads = 0; - -/* - * The time at which the pdflush thread pool last went empty - */ -static unsigned long last_empty_jifs; - -/* - * The pdflush thread. - * - * Thread pool management algorithm: - * - * - The minimum and maximum number of pdflush instances are bound - * by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS. - * - * - If there have been no idle pdflush instances for 1 second, create - * a new one. - * - * - If the least-recently-went-to-sleep pdflush thread has been asleep - * for more than one second, terminate a thread. - */ - -/* - * A structure for passing work to a pdflush thread. Also for passing - * state information between pdflush threads. Protected by pdflush_lock. - */ -struct pdflush_work { - struct task_struct *who; /* The thread */ - void (*fn)(unsigned long); /* A callback function */ - unsigned long arg0; /* An argument to the callback */ - struct list_head list; /* On pdflush_list, when idle */ - unsigned long when_i_went_to_sleep; -}; - -static int __pdflush(struct pdflush_work *my_work) -{ - current->flags |= PF_FLUSHER | PF_SWAPWRITE; - set_freezable(); - my_work->fn = NULL; - my_work->who = current; - INIT_LIST_HEAD(&my_work->list); - - spin_lock_irq(&pdflush_lock); - for ( ; ; ) { - struct pdflush_work *pdf; - - set_current_state(TASK_INTERRUPTIBLE); - list_move(&my_work->list, &pdflush_list); - my_work->when_i_went_to_sleep = jiffies; - spin_unlock_irq(&pdflush_lock); - schedule(); - try_to_freeze(); - spin_lock_irq(&pdflush_lock); - if (!list_empty(&my_work->list)) { - /* - * Someone woke us up, but without removing our control - * structure from the global list. swsusp will do this - * in try_to_freeze()->refrigerator(). Handle it. - */ - my_work->fn = NULL; - continue; - } - if (my_work->fn == NULL) { - printk("pdflush: bogus wakeup\n"); - continue; - } - spin_unlock_irq(&pdflush_lock); - - (*my_work->fn)(my_work->arg0); - - spin_lock_irq(&pdflush_lock); - - /* - * Thread creation: For how long have there been zero - * available threads? - * - * To throttle creation, we reset last_empty_jifs. - */ - if (time_after(jiffies, last_empty_jifs + 1 * HZ)) { - if (list_empty(&pdflush_list)) { - if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) { - last_empty_jifs = jiffies; - nr_pdflush_threads++; - spin_unlock_irq(&pdflush_lock); - start_one_pdflush_thread(); - spin_lock_irq(&pdflush_lock); - } - } - } - - my_work->fn = NULL; - - /* - * Thread destruction: For how long has the sleepiest - * thread slept? - */ - if (list_empty(&pdflush_list)) - continue; - if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS) - continue; - pdf = list_entry(pdflush_list.prev, struct pdflush_work, list); - if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) { - /* Limit exit rate */ - pdf->when_i_went_to_sleep = jiffies; - break; /* exeunt */ - } - } - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - return 0; -} - -/* - * Of course, my_work wants to be just a local in __pdflush(). It is - * separated out in this manner to hopefully prevent the compiler from - * performing unfortunate optimisations against the auto variables. Because - * these are visible to other tasks and CPUs. (No problem has actually - * been observed. This is just paranoia). - */ -static int pdflush(void *dummy) -{ - struct pdflush_work my_work; - cpumask_var_t cpus_allowed; - - /* - * Since the caller doesn't even check kthread_run() worked, let's not - * freak out too much if this fails. - */ - if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { - printk(KERN_WARNING "pdflush failed to allocate cpumask\n"); - return 0; - } - - /* - * pdflush can spend a lot of time doing encryption via dm-crypt. We - * don't want to do that at keventd's priority. - */ - set_user_nice(current, 0); - - /* - * Some configs put our parent kthread in a limited cpuset, - * which kthread() overrides, forcing cpus_allowed == cpu_all_mask. - * Our needs are more modest - cut back to our cpusets cpus_allowed. - * This is needed as pdflush's are dynamically created and destroyed. - * The boottime pdflush's are easily placed w/o these 2 lines. - */ - cpuset_cpus_allowed(current, cpus_allowed); - set_cpus_allowed_ptr(current, cpus_allowed); - free_cpumask_var(cpus_allowed); - - return __pdflush(&my_work); -} - -/* - * Attempt to wake up a pdflush thread, and get it to do some work for you. - * Returns zero if it indeed managed to find a worker thread, and passed your - * payload to it. - */ -int pdflush_operation(void (*fn)(unsigned long), unsigned long arg0) -{ - unsigned long flags; - int ret = 0; - - BUG_ON(fn == NULL); /* Hard to diagnose if it's deferred */ - - spin_lock_irqsave(&pdflush_lock, flags); - if (list_empty(&pdflush_list)) { - ret = -1; - } else { - struct pdflush_work *pdf; - - pdf = list_entry(pdflush_list.next, struct pdflush_work, list); - list_del_init(&pdf->list); - if (list_empty(&pdflush_list)) - last_empty_jifs = jiffies; - pdf->fn = fn; - pdf->arg0 = arg0; - wake_up_process(pdf->who); - } - spin_unlock_irqrestore(&pdflush_lock, flags); - - return ret; -} - -static void start_one_pdflush_thread(void) -{ - struct task_struct *k; - - k = kthread_run(pdflush, NULL, "pdflush"); - if (unlikely(IS_ERR(k))) { - spin_lock_irq(&pdflush_lock); - nr_pdflush_threads--; - spin_unlock_irq(&pdflush_lock); - } -} - -static int __init pdflush_init(void) -{ - int i; - - /* - * Pre-set nr_pdflush_threads... If we fail to create, - * the count will be decremented. - */ - nr_pdflush_threads = MIN_PDFLUSH_THREADS; - - for (i = 0; i < MIN_PDFLUSH_THREADS; i++) - start_one_pdflush_thread(); - return 0; -} - -module_init(pdflush_init); diff --git a/mm/percpu-km.c b/mm/percpu-km.c new file mode 100644 index 0000000..89633fe --- /dev/null +++ b/mm/percpu-km.c @@ -0,0 +1,108 @@ +/* + * mm/percpu-km.c - kernel memory based chunk allocation + * + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + * + * Chunks are allocated as a contiguous kernel memory using gfp + * allocation. This is to be used on nommu architectures. + * + * To use percpu-km, + * + * - define CONFIG_NEED_PER_CPU_KM from the arch Kconfig. + * + * - CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK must not be defined. It's + * not compatible with PER_CPU_KM. EMBED_FIRST_CHUNK should work + * fine. + * + * - NUMA is not supported. When setting up the first chunk, + * @cpu_distance_fn should be NULL or report all CPUs to be nearer + * than or at LOCAL_DISTANCE. + * + * - It's best if the chunk size is power of two multiple of + * PAGE_SIZE. Because each chunk is allocated as a contiguous + * kernel memory block using alloc_pages(), memory will be wasted if + * chunk size is not aligned. percpu-km code will whine about it. + */ + +#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) +#error "contiguous percpu allocation is incompatible with paged first chunk" +#endif + +#include <linux/log2.h> + +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + unsigned int cpu; + + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + + return 0; +} + +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + /* nada */ +} + +static struct pcpu_chunk *pcpu_create_chunk(void) +{ + const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; + struct pcpu_chunk *chunk; + struct page *pages; + int i; + + chunk = pcpu_alloc_chunk(); + if (!chunk) + return NULL; + + pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages)); + if (!pages) { + pcpu_free_chunk(chunk); + return NULL; + } + + for (i = 0; i < nr_pages; i++) + pcpu_set_page_chunk(nth_page(pages, i), chunk); + + chunk->data = pages; + chunk->base_addr = page_address(pages) - pcpu_group_offsets[0]; + return chunk; +} + +static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) +{ + const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT; + + if (chunk && chunk->data) + __free_pages(chunk->data, order_base_2(nr_pages)); + pcpu_free_chunk(chunk); +} + +static struct page *pcpu_addr_to_page(void *addr) +{ + return virt_to_page(addr); +} + +static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) +{ + size_t nr_pages, alloc_pages; + + /* all units must be in a single group */ + if (ai->nr_groups != 1) { + printk(KERN_CRIT "percpu: can't handle more than one groups\n"); + return -EINVAL; + } + + nr_pages = (ai->groups[0].nr_units * ai->unit_size) >> PAGE_SHIFT; + alloc_pages = roundup_pow_of_two(nr_pages); + + if (alloc_pages > nr_pages) + printk(KERN_WARNING "percpu: wasting %zu pages per chunk\n", + alloc_pages - nr_pages); + + return 0; +} diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c new file mode 100644 index 0000000..7d9c1d0 --- /dev/null +++ b/mm/percpu-vm.c @@ -0,0 +1,451 @@ +/* + * mm/percpu-vm.c - vmalloc area based chunk allocation + * + * Copyright (C) 2010 SUSE Linux Products GmbH + * Copyright (C) 2010 Tejun Heo <tj@kernel.org> + * + * This file is released under the GPLv2. + * + * Chunks are mapped into vmalloc areas and populated page by page. + * This is the default chunk allocator. + */ + +static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) +{ + /* must not be used on pre-mapped chunk */ + WARN_ON(chunk->immutable); + + return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx)); +} + +/** + * pcpu_get_pages_and_bitmap - get temp pages array and bitmap + * @chunk: chunk of interest + * @bitmapp: output parameter for bitmap + * @may_alloc: may allocate the array + * + * Returns pointer to array of pointers to struct page and bitmap, + * both of which can be indexed with pcpu_page_idx(). The returned + * array is cleared to zero and *@bitmapp is copied from + * @chunk->populated. Note that there is only one array and bitmap + * and access exclusion is the caller's responsibility. + * + * CONTEXT: + * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc. + * Otherwise, don't care. + * + * RETURNS: + * Pointer to temp pages array on success, NULL on failure. + */ +static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk, + unsigned long **bitmapp, + bool may_alloc) +{ + static struct page **pages; + static unsigned long *bitmap; + size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]); + size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) * + sizeof(unsigned long); + + if (!pages || !bitmap) { + if (may_alloc && !pages) + pages = pcpu_mem_alloc(pages_size); + if (may_alloc && !bitmap) + bitmap = pcpu_mem_alloc(bitmap_size); + if (!pages || !bitmap) + return NULL; + } + + memset(pages, 0, pages_size); + bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages); + + *bitmapp = bitmap; + return pages; +} + +/** + * pcpu_free_pages - free pages which were allocated for @chunk + * @chunk: chunk pages were allocated for + * @pages: array of pages to be freed, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be freed + * @page_end: page index of the last page to be freed + 1 + * + * Free pages [@page_start and @page_end) in @pages for all units. + * The pages were allocated for @chunk. + */ +static void pcpu_free_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page = pages[pcpu_page_idx(cpu, i)]; + + if (page) + __free_page(page); + } + } +} + +/** + * pcpu_alloc_pages - allocates pages for @chunk + * @chunk: target chunk + * @pages: array to put the allocated pages into, indexed by pcpu_page_idx() + * @populated: populated bitmap + * @page_start: page index of the first page to be allocated + * @page_end: page index of the last page to be allocated + 1 + * + * Allocate pages [@page_start,@page_end) into @pages for all units. + * The allocation is for @chunk. Percpu core doesn't care about the + * content of @pages and will pass it verbatim to pcpu_map_pages(). + */ +static int pcpu_alloc_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page **pagep = &pages[pcpu_page_idx(cpu, i)]; + + *pagep = alloc_pages_node(cpu_to_node(cpu), gfp, 0); + if (!*pagep) { + pcpu_free_pages(chunk, pages, populated, + page_start, page_end); + return -ENOMEM; + } + } + } + return 0; +} + +/** + * pcpu_pre_unmap_flush - flush cache prior to unmapping + * @chunk: chunk the regions to be flushed belongs to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages in [@page_start,@page_end) of @chunk are about to be + * unmapped. Flush cache. As each flushing trial can be very + * expensive, issue flush on the whole region at once rather than + * doing it for each cpu. This could be an overkill but is more + * scalable. + */ +static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vunmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +static void __pcpu_unmap_pages(unsigned long addr, int nr_pages) +{ + unmap_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT); +} + +/** + * pcpu_unmap_pages - unmap pages out of a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array which can be used to pass information to free + * @populated: populated bitmap + * @page_start: page index of the first page to unmap + * @page_end: page index of the last page to unmap + 1 + * + * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. + * Corresponding elements in @pages were cleared by the caller and can + * be used to carry information to pcpu_free_pages() which will be + * called after all unmaps are finished. The caller should call + * proper pre/post flush functions. + */ +static void pcpu_unmap_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu; + int i; + + for_each_possible_cpu(cpu) { + for (i = page_start; i < page_end; i++) { + struct page *page; + + page = pcpu_chunk_page(chunk, cpu, i); + WARN_ON(!page); + pages[pcpu_page_idx(cpu, i)] = page; + } + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start), + page_end - page_start); + } + + for (i = page_start; i < page_end; i++) + __clear_bit(i, populated); +} + +/** + * pcpu_post_unmap_tlb_flush - flush TLB after unmapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been unmapped. Flush + * TLB for the regions. This can be skipped if the area is to be + * returned to vmalloc as vmalloc will handle TLB flushing lazily. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_tlb_kernel_range( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +static int __pcpu_map_pages(unsigned long addr, struct page **pages, + int nr_pages) +{ + return map_kernel_range_noflush(addr, nr_pages << PAGE_SHIFT, + PAGE_KERNEL, pages); +} + +/** + * pcpu_map_pages - map pages into a pcpu_chunk + * @chunk: chunk of interest + * @pages: pages array containing pages to be mapped + * @populated: populated bitmap + * @page_start: page index of the first page to map + * @page_end: page index of the last page to map + 1 + * + * For each cpu, map pages [@page_start,@page_end) into @chunk. The + * caller is responsible for calling pcpu_post_map_flush() after all + * mappings are complete. + * + * This function is responsible for setting corresponding bits in + * @chunk->populated bitmap and whatever is necessary for reverse + * lookup (addr -> chunk). + */ +static int pcpu_map_pages(struct pcpu_chunk *chunk, + struct page **pages, unsigned long *populated, + int page_start, int page_end) +{ + unsigned int cpu, tcpu; + int i, err; + + for_each_possible_cpu(cpu) { + err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start), + &pages[pcpu_page_idx(cpu, page_start)], + page_end - page_start); + if (err < 0) + goto err; + } + + /* mapping successful, link chunk and mark populated */ + for (i = page_start; i < page_end; i++) { + for_each_possible_cpu(cpu) + pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)], + chunk); + __set_bit(i, populated); + } + + return 0; + +err: + for_each_possible_cpu(tcpu) { + if (tcpu == cpu) + break; + __pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start), + page_end - page_start); + } + return err; +} + +/** + * pcpu_post_map_flush - flush cache after mapping + * @chunk: pcpu_chunk the regions to be flushed belong to + * @page_start: page index of the first page to be flushed + * @page_end: page index of the last page to be flushed + 1 + * + * Pages [@page_start,@page_end) of @chunk have been mapped. Flush + * cache. + * + * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once + * for the whole region. + */ +static void pcpu_post_map_flush(struct pcpu_chunk *chunk, + int page_start, int page_end) +{ + flush_cache_vmap( + pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start), + pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end)); +} + +/** + * pcpu_populate_chunk - populate and map an area of a pcpu_chunk + * @chunk: chunk of interest + * @off: offset to the area to populate + * @size: size of the area to populate in bytes + * + * For each cpu, populate and map pages [@page_start,@page_end) into + * @chunk. The area is cleared on return. + * + * CONTEXT: + * pcpu_alloc_mutex, does GFP_KERNEL allocation. + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + int free_end = page_start, unmap_end = page_start; + struct page **pages; + unsigned long *populated; + unsigned int cpu; + int rs, re, rc; + + /* quick path, check whether all pages are already there */ + rs = page_start; + pcpu_next_pop(chunk, &rs, &re, page_end); + if (rs == page_start && re == page_end) + goto clear; + + /* need to allocate and map pages, this chunk can't be immutable */ + WARN_ON(chunk->immutable); + + pages = pcpu_get_pages_and_bitmap(chunk, &populated, true); + if (!pages) + return -ENOMEM; + + /* alloc and map */ + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_alloc_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_free; + free_end = re; + } + + pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) { + rc = pcpu_map_pages(chunk, pages, populated, rs, re); + if (rc) + goto err_unmap; + unmap_end = re; + } + pcpu_post_map_flush(chunk, page_start, page_end); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +clear: + for_each_possible_cpu(cpu) + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + return 0; + +err_unmap: + pcpu_pre_unmap_flush(chunk, page_start, unmap_end); + pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end); +err_free: + pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + return rc; +} + +/** + * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk + * @chunk: chunk to depopulate + * @off: offset to the area to depopulate + * @size: size of the area to depopulate in bytes + * @flush: whether to flush cache and tlb or not + * + * For each cpu, depopulate and unmap pages [@page_start,@page_end) + * from @chunk. If @flush is true, vcache is flushed before unmapping + * and tlb after. + * + * CONTEXT: + * pcpu_alloc_mutex. + */ +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size) +{ + int page_start = PFN_DOWN(off); + int page_end = PFN_UP(off + size); + struct page **pages; + unsigned long *populated; + int rs, re; + + /* quick path, check whether it's empty already */ + rs = page_start; + pcpu_next_unpop(chunk, &rs, &re, page_end); + if (rs == page_start && re == page_end) + return; + + /* immutable chunks can't be depopulated */ + WARN_ON(chunk->immutable); + + /* + * If control reaches here, there must have been at least one + * successful population attempt so the temp pages array must + * be available now. + */ + pages = pcpu_get_pages_and_bitmap(chunk, &populated, false); + BUG_ON(!pages); + + /* unmap and free */ + pcpu_pre_unmap_flush(chunk, page_start, page_end); + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_unmap_pages(chunk, pages, populated, rs, re); + + /* no need to flush tlb, vmalloc will handle it lazily */ + + pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) + pcpu_free_pages(chunk, pages, populated, rs, re); + + /* commit new bitmap */ + bitmap_copy(chunk->populated, populated, pcpu_unit_pages); +} + +static struct pcpu_chunk *pcpu_create_chunk(void) +{ + struct pcpu_chunk *chunk; + struct vm_struct **vms; + + chunk = pcpu_alloc_chunk(); + if (!chunk) + return NULL; + + vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, + pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + if (!vms) { + pcpu_free_chunk(chunk); + return NULL; + } + + chunk->data = vms; + chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0]; + return chunk; +} + +static void pcpu_destroy_chunk(struct pcpu_chunk *chunk) +{ + if (chunk && chunk->data) + pcpu_free_vm_areas(chunk->data, pcpu_nr_groups); + pcpu_free_chunk(chunk); +} + +static struct page *pcpu_addr_to_page(void *addr) +{ + return vmalloc_to_page(addr); +} + +static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai) +{ + /* no extra restriction */ + return 0; +} diff --git a/mm/percpu.c b/mm/percpu.c index c0b2c1a..3dd4984 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1,5 +1,5 @@ /* - * linux/mm/percpu.c - percpu memory allocator + * mm/percpu.c - percpu memory allocator * * Copyright (C) 2009 SUSE Linux Products GmbH * Copyright (C) 2009 Tejun Heo <tj@kernel.org> @@ -7,13 +7,13 @@ * This file is released under the GPLv2. * * This is percpu allocator which can handle both static and dynamic - * areas. Percpu areas are allocated in chunks in vmalloc area. Each - * chunk is consisted of num_possible_cpus() units and the first chunk - * is used for static percpu variables in the kernel image (special - * boot time alloc/init handling necessary as these areas need to be - * brought up before allocation services are running). Unit grows as - * necessary and all units grow or shrink in unison. When a chunk is - * filled up, another chunk is allocated. ie. in vmalloc area + * areas. Percpu areas are allocated in chunks. Each chunk is + * consisted of boot-time determined number of units and the first + * chunk is used for static percpu variables in the kernel image + * (special boot time alloc/init handling necessary as these areas + * need to be brought up before allocation services are running). + * Unit grows as necessary and all units grow or shrink in unison. + * When a chunk is filled up, another chunk is allocated. * * c0 c1 c2 * ------------------- ------------------- ------------ @@ -22,14 +22,16 @@ * * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, - * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers pcpu_unit_size apart. + * c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to + * cpus. On NUMA, the mapping can be non-linear and even sparse. + * Percpu access can be done by configuring percpu base registers + * according to cpu to unit mapping and pcpu_unit_size. * - * There are usually many small percpu allocations many of them as - * small as 4 bytes. The allocator organizes chunks into lists + * There are usually many small percpu allocations many of them being + * as small as 4 bytes. The allocator organizes chunks into lists * according to free size and tries to allocate from the fullest one. * Each chunk keeps the maximum contiguous area size hint which is - * guaranteed to be eqaul to or larger than the maximum contiguous + * guaranteed to be equal to or larger than the maximum contiguous * area in the chunk. This helps the allocator not to iterate the * chunk maps unnecessarily. * @@ -43,8 +45,6 @@ * * To use this allocator, arch code should do the followings. * - * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA - * * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate * regular address to percpu pointer and back if they need to be * different from the default @@ -55,7 +55,9 @@ #include <linux/bitmap.h> #include <linux/bootmem.h> +#include <linux/err.h> #include <linux/list.h> +#include <linux/log2.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/mutex.h> @@ -69,45 +71,67 @@ #include <asm/cacheflush.h> #include <asm/sections.h> #include <asm/tlbflush.h> +#include <asm/io.h> #define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */ #define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */ +#ifdef CONFIG_SMP /* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */ #ifndef __addr_to_pcpu_ptr #define __addr_to_pcpu_ptr(addr) \ - (void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \ - + (unsigned long)__per_cpu_start) + (void __percpu *)((unsigned long)(addr) - \ + (unsigned long)pcpu_base_addr + \ + (unsigned long)__per_cpu_start) #endif #ifndef __pcpu_ptr_to_addr #define __pcpu_ptr_to_addr(ptr) \ - (void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \ - - (unsigned long)__per_cpu_start) + (void __force *)((unsigned long)(ptr) + \ + (unsigned long)pcpu_base_addr - \ + (unsigned long)__per_cpu_start) #endif +#else /* CONFIG_SMP */ +/* on UP, it's always identity mapped */ +#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr) +#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr) +#endif /* CONFIG_SMP */ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ - struct vm_struct *vm; /* mapped vmalloc region */ + void *base_addr; /* base address of this chunk */ int map_used; /* # of map entries used */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ + void *data; /* chunk data */ bool immutable; /* no [de]population allowed */ - struct page **page; /* points to page array */ - struct page *page_ar[]; /* #cpus * UNIT_PAGES */ + unsigned long populated[]; /* populated bitmap */ }; static int pcpu_unit_pages __read_mostly; static int pcpu_unit_size __read_mostly; -static int pcpu_chunk_size __read_mostly; +static int pcpu_nr_units __read_mostly; +static int pcpu_atom_size __read_mostly; static int pcpu_nr_slots __read_mostly; static size_t pcpu_chunk_struct_size __read_mostly; +/* cpus with the lowest and highest unit numbers */ +static unsigned int pcpu_first_unit_cpu __read_mostly; +static unsigned int pcpu_last_unit_cpu __read_mostly; + /* the address of the first chunk which starts with the kernel static area */ void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); +static const int *pcpu_unit_map __read_mostly; /* cpu -> unit */ +const unsigned long *pcpu_unit_offsets __read_mostly; /* cpu -> unit offset */ + +/* group information, used for vm allocation */ +static int pcpu_nr_groups __read_mostly; +static const unsigned long *pcpu_group_offsets __read_mostly; +static const size_t *pcpu_group_sizes __read_mostly; + /* * The first chunk which always exists. Note that unlike other * chunks, this one can be allocated and mapped in several different @@ -129,13 +153,16 @@ static int pcpu_reserved_chunk_limit; * Synchronization rules. * * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former - * protects allocation/reclaim paths, chunks and chunk->page arrays. - * The latter is a spinlock and protects the index data structures - - * chunk slots, chunks and area maps in chunks. + * protects allocation/reclaim paths, chunks, populated bitmap and + * vmalloc mapping. The latter is a spinlock and protects the index + * data structures - chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory - * allocations are done using GFP_KERNEL with pcpu_lock released. + * allocations are done using GFP_KERNEL with pcpu_lock released. In + * general, percpu memory can't be allocated with irq off but + * irqsave/restore are still used in alloc path so that it can be used + * from early init path - sched_init() specifically. * * Free path accesses and alters only the index data structures, so it * can be safely called from atomic context. When memory needs to be @@ -155,6 +182,21 @@ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ static void pcpu_reclaim(struct work_struct *work); static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim); +static bool pcpu_addr_in_first_chunk(void *addr) +{ + void *first_start = pcpu_first_chunk->base_addr; + + return addr >= first_start && addr < first_start + pcpu_unit_size; +} + +static bool pcpu_addr_in_reserved_chunk(void *addr) +{ + void *first_start = pcpu_first_chunk->base_addr; + + return addr >= first_start && + addr < first_start + pcpu_reserved_chunk_limit; +} + static int __pcpu_size_to_slot(int size) { int highbit = fls(size); /* size is in bytes */ @@ -176,42 +218,60 @@ static int pcpu_chunk_slot(const struct pcpu_chunk *chunk) return pcpu_size_to_slot(chunk->free_size); } -static int pcpu_page_idx(unsigned int cpu, int page_idx) +/* set the pointer to a chunk in a page struct */ +static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) { - return cpu * pcpu_unit_pages + page_idx; + page->index = (unsigned long)pcpu; } -static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) { - return &chunk->page[pcpu_page_idx(cpu, page_idx)]; + return (struct pcpu_chunk *)page->index; } -static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, - unsigned int cpu, int page_idx) +static int __maybe_unused pcpu_page_idx(unsigned int cpu, int page_idx) { - return (unsigned long)chunk->vm->addr + - (pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT); + return pcpu_unit_map[cpu] * pcpu_unit_pages + page_idx; } -static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, - int page_idx) +static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk, + unsigned int cpu, int page_idx) { - return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; + return (unsigned long)chunk->base_addr + pcpu_unit_offsets[cpu] + + (page_idx << PAGE_SHIFT); } -/* set the pointer to a chunk in a page struct */ -static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +static void __maybe_unused pcpu_next_unpop(struct pcpu_chunk *chunk, + int *rs, int *re, int end) { - page->index = (unsigned long)pcpu; + *rs = find_next_zero_bit(chunk->populated, end, *rs); + *re = find_next_bit(chunk->populated, end, *rs + 1); } -/* obtain pointer to a chunk from a page struct */ -static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +static void __maybe_unused pcpu_next_pop(struct pcpu_chunk *chunk, + int *rs, int *re, int end) { - return (struct pcpu_chunk *)page->index; + *rs = find_next_bit(chunk->populated, end, *rs); + *re = find_next_zero_bit(chunk->populated, end, *rs + 1); } +/* + * (Un)populated page region iterators. Iterate over (un)populated + * page regions betwen @start and @end in @chunk. @rs and @re should + * be integer variables and will be set to start and end page index of + * the current region. + */ +#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end))) + +#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \ + for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \ + (rs) < (re); \ + (rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end))) + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -228,14 +288,13 @@ static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) */ static void *pcpu_mem_alloc(size_t size) { + if (WARN_ON_ONCE(!slab_is_available())) + return NULL; + if (size <= PAGE_SIZE) return kzalloc(size, GFP_KERNEL); - else { - void *ptr = vmalloc(size); - if (ptr) - memset(ptr, 0, size); - return ptr; - } + else + return vzalloc(size); } /** @@ -279,84 +338,81 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } /** - * pcpu_chunk_addr_search - determine chunk containing specified address - * @addr: address for which the chunk needs to be determined. + * pcpu_need_to_extend - determine whether chunk area map needs to be extended + * @chunk: chunk of interest + * + * Determine whether area map of @chunk needs to be extended to + * accomodate a new allocation. + * + * CONTEXT: + * pcpu_lock. * * RETURNS: - * The address of the found chunk. + * New target map allocation length if extension is necessary, 0 + * otherwise. */ -static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +static int pcpu_need_to_extend(struct pcpu_chunk *chunk) { - void *first_start = pcpu_first_chunk->vm->addr; + int new_alloc; - /* is it in the first chunk? */ - if (addr >= first_start && addr < first_start + pcpu_chunk_size) { - /* is it in the reserved area? */ - if (addr < first_start + pcpu_reserved_chunk_limit) - return pcpu_reserved_chunk; - return pcpu_first_chunk; - } + if (chunk->map_alloc >= chunk->map_used + 2) + return 0; - return pcpu_get_page_chunk(vmalloc_to_page(addr)); + new_alloc = PCPU_DFL_MAP_ALLOC; + while (new_alloc < chunk->map_used + 2) + new_alloc *= 2; + + return new_alloc; } /** - * pcpu_extend_area_map - extend area map for allocation - * @chunk: target chunk + * pcpu_extend_area_map - extend area map of a chunk + * @chunk: chunk of interest + * @new_alloc: new target allocation length of the area map * - * Extend area map of @chunk so that it can accomodate an allocation. - * A single allocation can split an area into three areas, so this - * function makes sure that @chunk->map has at least two extra slots. + * Extend area map of @chunk to have @new_alloc entries. * * CONTEXT: - * pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired - * if area map is extended. + * Does GFP_KERNEL allocation. Grabs and releases pcpu_lock. * * RETURNS: - * 0 if noop, 1 if successfully extended, -errno on failure. + * 0 on success, -errno on failure. */ -static int pcpu_extend_area_map(struct pcpu_chunk *chunk) +static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) { - int new_alloc; - int *new; - size_t size; + int *old = NULL, *new = NULL; + size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); + unsigned long flags; - /* has enough? */ - if (chunk->map_alloc >= chunk->map_used + 2) - return 0; + new = pcpu_mem_alloc(new_size); + if (!new) + return -ENOMEM; - spin_unlock_irq(&pcpu_lock); + /* acquire pcpu_lock and switch to new area map */ + spin_lock_irqsave(&pcpu_lock, flags); - new_alloc = PCPU_DFL_MAP_ALLOC; - while (new_alloc < chunk->map_used + 2) - new_alloc *= 2; + if (new_alloc <= chunk->map_alloc) + goto out_unlock; - new = pcpu_mem_alloc(new_alloc * sizeof(new[0])); - if (!new) { - spin_lock_irq(&pcpu_lock); - return -ENOMEM; - } + old_size = chunk->map_alloc * sizeof(chunk->map[0]); + old = chunk->map; - /* - * Acquire pcpu_lock and switch to new area map. Only free - * could have happened inbetween, so map_used couldn't have - * grown. - */ - spin_lock_irq(&pcpu_lock); - BUG_ON(new_alloc < chunk->map_used + 2); + memcpy(new, old, old_size); - size = chunk->map_alloc * sizeof(chunk->map[0]); - memcpy(new, chunk->map, size); + chunk->map_alloc = new_alloc; + chunk->map = new; + new = NULL; + +out_unlock: + spin_unlock_irqrestore(&pcpu_lock, flags); /* - * map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is - * one of the first chunks and still using static map. + * pcpu_mem_free() might end up calling vfree() which uses + * IRQ-unsafe lock and thus can't be called under pcpu_lock. */ - if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC) - pcpu_mem_free(chunk->map, size); + pcpu_mem_free(old, old_size); + pcpu_mem_free(new, new_size); - chunk->map_alloc = new_alloc; - chunk->map = new; return 0; } @@ -544,223 +600,92 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme) pcpu_chunk_relocate(chunk, oslot); } -/** - * pcpu_unmap - unmap pages out of a pcpu_chunk - * @chunk: chunk of interest - * @page_start: page index of the first page to unmap - * @page_end: page index of the last page to unmap + 1 - * @flush: whether to flush cache and tlb or not - * - * For each cpu, unmap pages [@page_start,@page_end) out of @chunk. - * If @flush is true, vcache is flushed before unmapping and tlb - * after. - */ -static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end, - bool flush) +static struct pcpu_chunk *pcpu_alloc_chunk(void) { - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - - /* unmap must not be done on immutable chunk */ - WARN_ON(chunk->immutable); - - /* - * Each flushing trial can be very expensive, issue flush on - * the whole region at once rather than doing it for each cpu. - * This could be an overkill but is more scalable. - */ - if (flush) - flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); - - for_each_possible_cpu(cpu) - unmap_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT); - - /* ditto as flush_cache_vunmap() */ - if (flush) - flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); -} - -/** - * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk - * @chunk: chunk to depopulate - * @off: offset to the area to depopulate - * @size: size of the area to depopulate in bytes - * @flush: whether to flush cache and tlb or not - * - * For each cpu, depopulate and unmap pages [@page_start,@page_end) - * from @chunk. If @flush is true, vcache is flushed before unmapping - * and tlb after. - * - * CONTEXT: - * pcpu_alloc_mutex. - */ -static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size, - bool flush) -{ - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int unmap_start = -1; - int uninitialized_var(unmap_end); - unsigned int cpu; - int i; - - for (i = page_start; i < page_end; i++) { - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); - - if (!*pagep) - continue; - - __free_page(*pagep); - - /* - * If it's partial depopulation, it might get - * populated or depopulated again. Mark the - * page gone. - */ - *pagep = NULL; - - unmap_start = unmap_start < 0 ? i : unmap_start; - unmap_end = i + 1; - } - } - - if (unmap_start >= 0) - pcpu_unmap(chunk, unmap_start, unmap_end, flush); -} - -/** - * pcpu_map - map pages into a pcpu_chunk - * @chunk: chunk of interest - * @page_start: page index of the first page to map - * @page_end: page index of the last page to map + 1 - * - * For each cpu, map pages [@page_start,@page_end) into @chunk. - * vcache is flushed afterwards. - */ -static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end) -{ - unsigned int last = num_possible_cpus() - 1; - unsigned int cpu; - int err; - - /* map must not be done on immutable chunk */ - WARN_ON(chunk->immutable); - - for_each_possible_cpu(cpu) { - err = map_kernel_range_noflush( - pcpu_chunk_addr(chunk, cpu, page_start), - (page_end - page_start) << PAGE_SHIFT, - PAGE_KERNEL, - pcpu_chunk_pagep(chunk, cpu, page_start)); - if (err < 0) - return err; - } - - /* flush at once, please read comments in pcpu_unmap() */ - flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start), - pcpu_chunk_addr(chunk, last, page_end)); - return 0; -} - -/** - * pcpu_populate_chunk - populate and map an area of a pcpu_chunk - * @chunk: chunk of interest - * @off: offset to the area to populate - * @size: size of the area to populate in bytes - * - * For each cpu, populate and map pages [@page_start,@page_end) into - * @chunk. The area is cleared on return. - * - * CONTEXT: - * pcpu_alloc_mutex, does GFP_KERNEL allocation. - */ -static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) -{ - const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD; - int page_start = PFN_DOWN(off); - int page_end = PFN_UP(off + size); - int map_start = -1; - int uninitialized_var(map_end); - unsigned int cpu; - int i; - - for (i = page_start; i < page_end; i++) { - if (pcpu_chunk_page_occupied(chunk, i)) { - if (map_start >= 0) { - if (pcpu_map(chunk, map_start, map_end)) - goto err; - map_start = -1; - } - continue; - } - - map_start = map_start < 0 ? i : map_start; - map_end = i + 1; + struct pcpu_chunk *chunk; - for_each_possible_cpu(cpu) { - struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i); + chunk = pcpu_mem_alloc(pcpu_chunk_struct_size); + if (!chunk) + return NULL; - *pagep = alloc_pages_node(cpu_to_node(cpu), - alloc_mask, 0); - if (!*pagep) - goto err; - pcpu_set_page_chunk(*pagep, chunk); - } + chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); + if (!chunk->map) { + kfree(chunk); + return NULL; } - if (map_start >= 0 && pcpu_map(chunk, map_start, map_end)) - goto err; + chunk->map_alloc = PCPU_DFL_MAP_ALLOC; + chunk->map[chunk->map_used++] = pcpu_unit_size; - for_each_possible_cpu(cpu) - memset(chunk->vm->addr + cpu * pcpu_unit_size + off, 0, - size); + INIT_LIST_HEAD(&chunk->list); + chunk->free_size = pcpu_unit_size; + chunk->contig_hint = pcpu_unit_size; - return 0; -err: - /* likely under heavy memory pressure, give memory back */ - pcpu_depopulate_chunk(chunk, off, size, true); - return -ENOMEM; + return chunk; } -static void free_pcpu_chunk(struct pcpu_chunk *chunk) +static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - if (chunk->vm) - free_vm_area(chunk->vm); pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); kfree(chunk); } -static struct pcpu_chunk *alloc_pcpu_chunk(void) -{ - struct pcpu_chunk *chunk; - - chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL); - if (!chunk) - return NULL; - - chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); - chunk->map_alloc = PCPU_DFL_MAP_ALLOC; - chunk->map[chunk->map_used++] = pcpu_unit_size; - chunk->page = chunk->page_ar; +/* + * Chunk management implementation. + * + * To allow different implementations, chunk alloc/free and + * [de]population are implemented in a separate file which is pulled + * into this file and compiled together. The following functions + * should be implemented. + * + * pcpu_populate_chunk - populate the specified range of a chunk + * pcpu_depopulate_chunk - depopulate the specified range of a chunk + * pcpu_create_chunk - create a new chunk + * pcpu_destroy_chunk - destroy a chunk, always preceded by full depop + * pcpu_addr_to_page - translate address to physical address + * pcpu_verify_alloc_info - check alloc_info is acceptable during init + */ +static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size); +static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size); +static struct pcpu_chunk *pcpu_create_chunk(void); +static void pcpu_destroy_chunk(struct pcpu_chunk *chunk); +static struct page *pcpu_addr_to_page(void *addr); +static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai); + +#ifdef CONFIG_NEED_PER_CPU_KM +#include "percpu-km.c" +#else +#include "percpu-vm.c" +#endif - chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL); - if (!chunk->vm) { - free_pcpu_chunk(chunk); - return NULL; +/** + * pcpu_chunk_addr_search - determine chunk containing specified address + * @addr: address for which the chunk needs to be determined. + * + * RETURNS: + * The address of the found chunk. + */ +static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) +{ + /* is it in the first chunk? */ + if (pcpu_addr_in_first_chunk(addr)) { + /* is it in the reserved area? */ + if (pcpu_addr_in_reserved_chunk(addr)) + return pcpu_reserved_chunk; + return pcpu_first_chunk; } - INIT_LIST_HEAD(&chunk->list); - chunk->free_size = pcpu_unit_size; - chunk->contig_hint = pcpu_unit_size; - - return chunk; + /* + * The address is relative to unit0 which might be unused and + * thus unmapped. Offset the address to the unit space of the + * current processor before looking it up in the vmalloc + * space. Note that any possible cpu id can be used here, so + * there's no need to worry about preemption or cpu hotplug. + */ + addr += pcpu_unit_offsets[raw_smp_processor_id()]; + return pcpu_get_page_chunk(pcpu_addr_to_page(addr)); } /** @@ -777,10 +702,13 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -static void *pcpu_alloc(size_t size, size_t align, bool reserved) +static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved) { + static int warn_limit = 10; struct pcpu_chunk *chunk; - int slot, off; + const char *err; + int slot, off, new_alloc; + unsigned long flags; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { WARN(true, "illegal size (%zu) or align (%zu) for " @@ -789,17 +717,31 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) } mutex_lock(&pcpu_alloc_mutex); - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; - if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) + + if (size > chunk->contig_hint) { + err = "alloc from reserved chunk failed"; goto fail_unlock; + } + + while ((new_alloc = pcpu_need_to_extend(chunk))) { + spin_unlock_irqrestore(&pcpu_lock, flags); + if (pcpu_extend_area_map(chunk, new_alloc) < 0) { + err = "failed to extend area map of reserved chunk"; + goto fail_unlock_mutex; + } + spin_lock_irqsave(&pcpu_lock, flags); + } + off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; + + err = "alloc from reserved chunk failed"; goto fail_unlock; } @@ -810,13 +752,20 @@ restart: if (size > chunk->contig_hint) continue; - switch (pcpu_extend_area_map(chunk)) { - case 0: - break; - case 1: - goto restart; /* pcpu_lock dropped, restart */ - default: - goto fail_unlock; + new_alloc = pcpu_need_to_extend(chunk); + if (new_alloc) { + spin_unlock_irqrestore(&pcpu_lock, flags); + if (pcpu_extend_area_map(chunk, + new_alloc) < 0) { + err = "failed to extend area map"; + goto fail_unlock_mutex; + } + spin_lock_irqsave(&pcpu_lock, flags); + /* + * pcpu_lock has been dropped, need to + * restart cpu_slot list walking. + */ + goto restart; } off = pcpu_alloc_area(chunk, size, align); @@ -826,34 +775,45 @@ restart: } /* hmmm... no space left, create a new chunk */ - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); - chunk = alloc_pcpu_chunk(); - if (!chunk) + chunk = pcpu_create_chunk(); + if (!chunk) { + err = "failed to allocate new chunk"; goto fail_unlock_mutex; + } - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_chunk_relocate(chunk, -1); goto restart; area_found: - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); /* populate, map and clear the area */ if (pcpu_populate_chunk(chunk, off, size)) { - spin_lock_irq(&pcpu_lock); + spin_lock_irqsave(&pcpu_lock, flags); pcpu_free_area(chunk, off); + err = "failed to populate"; goto fail_unlock; } mutex_unlock(&pcpu_alloc_mutex); - return __addr_to_pcpu_ptr(chunk->vm->addr + off); + /* return address relative to base address */ + return __addr_to_pcpu_ptr(chunk->base_addr + off); fail_unlock: - spin_unlock_irq(&pcpu_lock); + spin_unlock_irqrestore(&pcpu_lock, flags); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); + if (warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " + "%s\n", size, align, err); + dump_stack(); + if (!--warn_limit) + pr_info("PERCPU: limit reached, disable warning\n"); + } return NULL; } @@ -862,8 +822,8 @@ fail_unlock_mutex: * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * - * Allocate percpu area of @size bytes aligned at @align. Might - * sleep. Might trigger writeouts. + * Allocate zero-filled percpu area of @size bytes aligned at @align. + * Might sleep. Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -871,7 +831,7 @@ fail_unlock_mutex: * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_percpu(size_t size, size_t align) +void __percpu *__alloc_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, false); } @@ -882,9 +842,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * @size: size of area to allocate in bytes * @align: alignment of area (max PAGE_SIZE) * - * Allocate percpu area of @size bytes aligned at @align from reserved - * percpu area if arch has set it up; otherwise, allocation is served - * from the same dynamic area. Might sleep. Might trigger writeouts. + * Allocate zero-filled percpu area of @size bytes aligned at @align + * from reserved percpu area if arch has set it up; otherwise, + * allocation is served from the same dynamic area. Might sleep. + * Might trigger writeouts. * * CONTEXT: * Does GFP_KERNEL allocation. @@ -892,7 +853,7 @@ EXPORT_SYMBOL_GPL(__alloc_percpu); * RETURNS: * Percpu pointer to the allocated area on success, NULL on failure. */ -void *__alloc_reserved_percpu(size_t size, size_t align) +void __percpu *__alloc_reserved_percpu(size_t size, size_t align) { return pcpu_alloc(size, align, true); } @@ -926,12 +887,13 @@ static void pcpu_reclaim(struct work_struct *work) } spin_unlock_irq(&pcpu_lock); - mutex_unlock(&pcpu_alloc_mutex); list_for_each_entry_safe(chunk, next, &todo, list) { - pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false); - free_pcpu_chunk(chunk); + pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size); + pcpu_destroy_chunk(chunk); } + + mutex_unlock(&pcpu_alloc_mutex); } /** @@ -943,9 +905,9 @@ static void pcpu_reclaim(struct work_struct *work) * CONTEXT: * Can be called from atomic context. */ -void free_percpu(void *ptr) +void free_percpu(void __percpu *ptr) { - void *addr = __pcpu_ptr_to_addr(ptr); + void *addr; struct pcpu_chunk *chunk; unsigned long flags; int off; @@ -953,10 +915,12 @@ void free_percpu(void *ptr) if (!ptr) return; + addr = __pcpu_ptr_to_addr(ptr); + spin_lock_irqsave(&pcpu_lock, flags); chunk = pcpu_chunk_addr_search(addr); - off = addr - chunk->vm->addr; + off = addr - chunk->base_addr; pcpu_free_area(chunk, off); @@ -976,29 +940,210 @@ void free_percpu(void *ptr) EXPORT_SYMBOL_GPL(free_percpu); /** + * is_kernel_percpu_address - test whether address is from static percpu area + * @addr: address to test + * + * Test whether @addr belongs to in-kernel static percpu area. Module + * static percpu areas are not considered. For those, use + * is_module_percpu_address(). + * + * RETURNS: + * %true if @addr is from in-kernel static percpu area, %false otherwise. + */ +bool is_kernel_percpu_address(unsigned long addr) +{ +#ifdef CONFIG_SMP + const size_t static_size = __per_cpu_end - __per_cpu_start; + void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); + unsigned int cpu; + + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(base, cpu); + + if ((void *)addr >= start && (void *)addr < start + static_size) + return true; + } +#endif + /* on UP, can't distinguish from other static vars, always false */ + return false; +} + +/** + * per_cpu_ptr_to_phys - convert translated percpu address to physical address + * @addr: the address to be converted to physical address + * + * Given @addr which is dereferenceable address obtained via one of + * percpu access macros, this function translates it into its physical + * address. The caller is responsible for ensuring @addr stays valid + * until this function finishes. + * + * RETURNS: + * The physical address for @addr. + */ +phys_addr_t per_cpu_ptr_to_phys(void *addr) +{ + void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr); + bool in_first_chunk = false; + unsigned long first_start, first_end; + unsigned int cpu; + + /* + * The following test on first_start/end isn't strictly + * necessary but will speed up lookups of addresses which + * aren't in the first chunk. + */ + first_start = pcpu_chunk_addr(pcpu_first_chunk, pcpu_first_unit_cpu, 0); + first_end = pcpu_chunk_addr(pcpu_first_chunk, pcpu_last_unit_cpu, + pcpu_unit_pages); + if ((unsigned long)addr >= first_start && + (unsigned long)addr < first_end) { + for_each_possible_cpu(cpu) { + void *start = per_cpu_ptr(base, cpu); + + if (addr >= start && addr < start + pcpu_unit_size) { + in_first_chunk = true; + break; + } + } + } + + if (in_first_chunk) { + if ((unsigned long)addr < VMALLOC_START || + (unsigned long)addr >= VMALLOC_END) + return __pa(addr); + else + return page_to_phys(vmalloc_to_page(addr)); + } else + return page_to_phys(pcpu_addr_to_page(addr)); +} + +/** + * pcpu_alloc_alloc_info - allocate percpu allocation info + * @nr_groups: the number of groups + * @nr_units: the number of units + * + * Allocate ai which is large enough for @nr_groups groups containing + * @nr_units units. The returned ai's groups[0].cpu_map points to the + * cpu_map array which is long enough for @nr_units and filled with + * NR_CPUS. It's the caller's responsibility to initialize cpu_map + * pointer of other groups. + * + * RETURNS: + * Pointer to the allocated pcpu_alloc_info on success, NULL on + * failure. + */ +struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, + int nr_units) +{ + struct pcpu_alloc_info *ai; + size_t base_size, ai_size; + void *ptr; + int unit; + + base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + __alignof__(ai->groups[0].cpu_map[0])); + ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); + + ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size)); + if (!ptr) + return NULL; + ai = ptr; + ptr += base_size; + + ai->groups[0].cpu_map = ptr; + + for (unit = 0; unit < nr_units; unit++) + ai->groups[0].cpu_map[unit] = NR_CPUS; + + ai->nr_groups = nr_groups; + ai->__ai_size = PFN_ALIGN(ai_size); + + return ai; +} + +/** + * pcpu_free_alloc_info - free percpu allocation info + * @ai: pcpu_alloc_info to free + * + * Free @ai which was allocated by pcpu_alloc_alloc_info(). + */ +void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai) +{ + free_bootmem(__pa(ai), ai->__ai_size); +} + +/** + * pcpu_dump_alloc_info - print out information about pcpu_alloc_info + * @lvl: loglevel + * @ai: allocation info to dump + * + * Print out information about @ai using loglevel @lvl. + */ +static void pcpu_dump_alloc_info(const char *lvl, + const struct pcpu_alloc_info *ai) +{ + int group_width = 1, cpu_width = 1, width; + char empty_str[] = "--------"; + int alloc = 0, alloc_end = 0; + int group, v; + int upa, apl; /* units per alloc, allocs per line */ + + v = ai->nr_groups; + while (v /= 10) + group_width++; + + v = num_possible_cpus(); + while (v /= 10) + cpu_width++; + empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0'; + + upa = ai->alloc_size / ai->unit_size; + width = upa * (cpu_width + 1) + group_width + 3; + apl = rounddown_pow_of_two(max(60 / width, 1)); + + printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu", + lvl, ai->static_size, ai->reserved_size, ai->dyn_size, + ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size); + + for (group = 0; group < ai->nr_groups; group++) { + const struct pcpu_group_info *gi = &ai->groups[group]; + int unit = 0, unit_end = 0; + + BUG_ON(gi->nr_units % upa); + for (alloc_end += gi->nr_units / upa; + alloc < alloc_end; alloc++) { + if (!(alloc % apl)) { + printk("\n"); + printk("%spcpu-alloc: ", lvl); + } + printk("[%0*d] ", group_width, group); + + for (unit_end += upa; unit < unit_end; unit++) + if (gi->cpu_map[unit] != NR_CPUS) + printk("%0*d ", cpu_width, + gi->cpu_map[unit]); + else + printk("%s ", empty_str); + } + } + printk("\n"); +} + +/** * pcpu_setup_first_chunk - initialize the first percpu chunk - * @get_page_fn: callback to fetch page pointer - * @static_size: the size of static percpu area in bytes - * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto - * @base_addr: mapped address, NULL for auto - * @populate_pte_fn: callback to allocate pagetable, NULL if unnecessary + * @ai: pcpu_alloc_info describing how to percpu area is shaped + * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static * perpcu area. This function is to be called from arch percpu area - * setup path. The first two parameters are mandatory. The rest are - * optional. - * - * @get_page_fn() should return pointer to percpu page given cpu - * number and page number. It should at least return enough pages to - * cover the static area. The returned pages for static area should - * have been initialized with valid data. If @unit_size is specified, - * it can also return pages after the static area. NULL return - * indicates end of pages for the cpu. Note that @get_page_fn() must - * return the same number of pages for all cpus. - * - * @reserved_size, if non-zero, specifies the amount of bytes to + * setup path. + * + * @ai contains all information necessary to initialize the first + * chunk and prime the dynamic percpu allocator. + * + * @ai->static_size is the size of static percpu area. + * + * @ai->reserved_size, if non-zero, specifies the amount of bytes to * reserve after the static area in the first chunk. This reserves * the first chunk such that it's available only through reserved * percpu allocation. This is primarily used to serve module percpu @@ -1006,22 +1151,29 @@ EXPORT_SYMBOL_GPL(free_percpu); * limited offset range for symbol relocations to guarantee module * percpu symbols fall inside the relocatable range. * - * @dyn_size, if non-negative, determines the number of bytes - * available for dynamic allocation in the first chunk. Specifying - * non-negative value makes percpu leave alone the area beyond - * @static_size + @reserved_size + @dyn_size. + * @ai->dyn_size determines the number of bytes available for dynamic + * allocation in the first chunk. The area between @ai->static_size + + * @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused. + * + * @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE + * and equal to or larger than @ai->static_size + @ai->reserved_size + + * @ai->dyn_size. * - * @unit_size, if non-negative, specifies unit size and must be - * aligned to PAGE_SIZE and equal to or larger than @static_size + - * @reserved_size + if non-negative, @dyn_size. + * @ai->atom_size is the allocation atom size and used as alignment + * for vm areas. * - * Non-null @base_addr means that the caller already allocated virtual - * region for the first chunk and mapped it. percpu must not mess - * with the chunk. Note that @base_addr with 0 @unit_size or non-NULL - * @populate_pte_fn doesn't make any sense. + * @ai->alloc_size is the allocation size and always multiple of + * @ai->atom_size. This is larger than @ai->atom_size if + * @ai->unit_size is larger than @ai->atom_size. * - * @populate_pte_fn is used to populate the pagetable. NULL means the - * caller already populated the pagetable. + * @ai->nr_groups and @ai->groups describe virtual memory layout of + * percpu areas. Units which should be colocated are put into the + * same group. Dynamic VM areas will be allocated according to these + * groupings. If @ai->nr_groups is zero, a single group containing + * all units is assumed. + * + * The caller should have mapped the first chunk at @base_addr and + * copied static data to each unit. * * If the first chunk ends up with both reserved and dynamic areas, it * is served by two chunks - one to serve the core static and reserved @@ -1031,49 +1183,101 @@ EXPORT_SYMBOL_GPL(free_percpu); * and available for dynamic allocation like any other chunks. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access. + * 0 on success, -errno on failure. */ -size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, - size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size, - void *base_addr, - pcpu_populate_pte_fn_t populate_pte_fn) +int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { - static struct vm_struct first_vm; - static int smap[2], dmap[2]; - size_t size_sum = static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0); + static char cpus_buf[4096] __initdata; + static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; + static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata; + size_t dyn_size = ai->dyn_size; + size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; struct pcpu_chunk *schunk, *dchunk = NULL; + unsigned long *group_offsets; + size_t *group_sizes; + unsigned long *unit_off; unsigned int cpu; - int nr_pages; - int err, i; - - /* santiy checks */ - BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || - ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(!static_size); - if (unit_size >= 0) { - BUG_ON(unit_size < size_sum); - BUG_ON(unit_size & ~PAGE_MASK); - BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE); - } else - BUG_ON(base_addr); - BUG_ON(base_addr && populate_pte_fn); + int *unit_map; + int group, unit, i; + + cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); + +#define PCPU_SETUP_BUG_ON(cond) do { \ + if (unlikely(cond)) { \ + pr_emerg("PERCPU: failed to initialize, %s", #cond); \ + pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ + pcpu_dump_alloc_info(KERN_EMERG, ai); \ + BUG(); \ + } \ +} while (0) + + /* sanity checks */ + PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); +#ifdef CONFIG_SMP + PCPU_SETUP_BUG_ON(!ai->static_size); +#endif + PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); + PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); + PCPU_SETUP_BUG_ON(ai->dyn_size < PERCPU_DYNAMIC_EARLY_SIZE); + PCPU_SETUP_BUG_ON(pcpu_verify_alloc_info(ai) < 0); + + /* process group information and build config tables accordingly */ + group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); + group_sizes = alloc_bootmem(ai->nr_groups * sizeof(group_sizes[0])); + unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0])); + unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); + + for (cpu = 0; cpu < nr_cpu_ids; cpu++) + unit_map[cpu] = UINT_MAX; + pcpu_first_unit_cpu = NR_CPUS; + + for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { + const struct pcpu_group_info *gi = &ai->groups[group]; + + group_offsets[group] = gi->base_offset; + group_sizes[group] = gi->nr_units * ai->unit_size; + + for (i = 0; i < gi->nr_units; i++) { + cpu = gi->cpu_map[i]; + if (cpu == NR_CPUS) + continue; - if (unit_size >= 0) - pcpu_unit_pages = unit_size >> PAGE_SHIFT; - else - pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT, - PFN_UP(size_sum)); + PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); + PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); + PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); - pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; - pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size; - pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) - + num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *); + unit_map[cpu] = unit + i; + unit_off[cpu] = gi->base_offset + i * ai->unit_size; + + if (pcpu_first_unit_cpu == NR_CPUS) + pcpu_first_unit_cpu = cpu; + pcpu_last_unit_cpu = cpu; + } + } + pcpu_nr_units = unit; + + for_each_possible_cpu(cpu) + PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); - if (dyn_size < 0) - dyn_size = pcpu_unit_size - static_size - reserved_size; + /* we're done parsing the input, undefine BUG macro and dump config */ +#undef PCPU_SETUP_BUG_ON + pcpu_dump_alloc_info(KERN_DEBUG, ai); + + pcpu_nr_groups = ai->nr_groups; + pcpu_group_offsets = group_offsets; + pcpu_group_sizes = group_sizes; + pcpu_unit_map = unit_map; + pcpu_unit_offsets = unit_off; + + /* determine basic parameters */ + pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; + pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT; + pcpu_atom_size = ai->atom_size; + pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) + + BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long); /* * Allocate chunk slots. The additional last slot is for @@ -1093,181 +1297,603 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, */ schunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&schunk->list); - schunk->vm = &first_vm; + schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); - schunk->page = schunk->page_ar; + schunk->immutable = true; + bitmap_fill(schunk->populated, pcpu_unit_pages); - if (reserved_size) { - schunk->free_size = reserved_size; + if (ai->reserved_size) { + schunk->free_size = ai->reserved_size; pcpu_reserved_chunk = schunk; - pcpu_reserved_chunk_limit = static_size + reserved_size; + pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ } schunk->contig_hint = schunk->free_size; - schunk->map[schunk->map_used++] = -static_size; + schunk->map[schunk->map_used++] = -ai->static_size; if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; /* init dynamic chunk if necessary */ if (dyn_size) { - dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); + dchunk = alloc_bootmem(pcpu_chunk_struct_size); INIT_LIST_HEAD(&dchunk->list); - dchunk->vm = &first_vm; + dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); - dchunk->page = schunk->page_ar; /* share page map with schunk */ + dchunk->immutable = true; + bitmap_fill(dchunk->populated, pcpu_unit_pages); dchunk->contig_hint = dchunk->free_size = dyn_size; dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit; dchunk->map[dchunk->map_used++] = dchunk->free_size; } - /* allocate vm address */ - first_vm.flags = VM_ALLOC; - first_vm.size = pcpu_chunk_size; + /* link the first chunk in */ + pcpu_first_chunk = dchunk ?: schunk; + pcpu_chunk_relocate(pcpu_first_chunk, -1); - if (!base_addr) - vm_area_register_early(&first_vm, PAGE_SIZE); - else { - /* - * Pages already mapped. No need to remap into - * vmalloc area. In this case the first chunks can't - * be mapped or unmapped by percpu and are marked - * immutable. - */ - first_vm.addr = base_addr; - schunk->immutable = true; - if (dchunk) - dchunk->immutable = true; - } + /* we're done */ + pcpu_base_addr = base_addr; + return 0; +} - /* assign pages */ - nr_pages = -1; - for_each_possible_cpu(cpu) { - for (i = 0; i < pcpu_unit_pages; i++) { - struct page *page = get_page_fn(cpu, i); +#ifdef CONFIG_SMP + +const char *pcpu_fc_names[PCPU_FC_NR] __initdata = { + [PCPU_FC_AUTO] = "auto", + [PCPU_FC_EMBED] = "embed", + [PCPU_FC_PAGE] = "page", +}; + +enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO; + +static int __init percpu_alloc_setup(char *str) +{ + if (0) + /* nada */; +#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK + else if (!strcmp(str, "embed")) + pcpu_chosen_fc = PCPU_FC_EMBED; +#endif +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + else if (!strcmp(str, "page")) + pcpu_chosen_fc = PCPU_FC_PAGE; +#endif + else + pr_warning("PERCPU: unknown allocator %s specified\n", str); + + return 0; +} +early_param("percpu_alloc", percpu_alloc_setup); - if (!page) +/* + * pcpu_embed_first_chunk() is used by the generic percpu setup. + * Build it if needed by the arch config or the generic setup is going + * to be used. + */ +#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \ + !defined(CONFIG_HAVE_SETUP_PER_CPU_AREA) +#define BUILD_EMBED_FIRST_CHUNK +#endif + +/* build pcpu_page_first_chunk() iff needed by the arch config */ +#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK) +#define BUILD_PAGE_FIRST_CHUNK +#endif + +/* pcpu_build_alloc_info() is used by both embed and page first chunk */ +#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK) +/** + * pcpu_build_alloc_info - build alloc_info considering distances between CPUs + * @reserved_size: the size of reserved percpu area in bytes + * @dyn_size: minimum free size for dynamic allocation in bytes + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * + * This function determines grouping of units, their mappings to cpus + * and other parameters considering needed percpu size, allocation + * atom size and distances between CPUs. + * + * Groups are always mutliples of atom size and CPUs which are of + * LOCAL_DISTANCE both ways are grouped together and share space for + * units in the same group. The returned configuration is guaranteed + * to have CPUs on different nodes on different groups and >=75% usage + * of allocated virtual address space. + * + * RETURNS: + * On success, pointer to the new allocation_info is returned. On + * failure, ERR_PTR value is returned. + */ +static struct pcpu_alloc_info * __init pcpu_build_alloc_info( + size_t reserved_size, size_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn) +{ + static int group_map[NR_CPUS] __initdata; + static int group_cnt[NR_CPUS] __initdata; + const size_t static_size = __per_cpu_end - __per_cpu_start; + int nr_groups = 1, nr_units = 0; + size_t size_sum, min_unit_size, alloc_size; + int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */ + int last_allocs, group, unit; + unsigned int cpu, tcpu; + struct pcpu_alloc_info *ai; + unsigned int *cpu_map; + + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_cnt)); + + /* calculate size_sum and ensure dyn_size is enough for early alloc */ + size_sum = PFN_ALIGN(static_size + reserved_size + + max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE)); + dyn_size = size_sum - static_size - reserved_size; + + /* + * Determine min_unit_size, alloc_size and max_upa such that + * alloc_size is multiple of atom_size and is the smallest + * which can accomodate 4k aligned segments which are equal to + * or larger than min_unit_size. + */ + min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE); + + alloc_size = roundup(min_unit_size, atom_size); + upa = alloc_size / min_unit_size; + while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + upa--; + max_upa = upa; + + /* group cpus according to their proximity */ + for_each_possible_cpu(cpu) { + group = 0; + next_group: + for_each_possible_cpu(tcpu) { + if (cpu == tcpu) break; - *pcpu_chunk_pagep(schunk, cpu, i) = page; + if (group_map[tcpu] == group && cpu_distance_fn && + (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE || + cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) { + group++; + nr_groups = max(nr_groups, group + 1); + goto next_group; + } } + group_map[cpu] = group; + group_cnt[group]++; + } - BUG_ON(i < PFN_UP(static_size)); + /* + * Expand unit size until address space usage goes over 75% + * and then as much as possible without using more address + * space. + */ + last_allocs = INT_MAX; + for (upa = max_upa; upa; upa--) { + int allocs = 0, wasted = 0; - if (nr_pages < 0) - nr_pages = i; - else - BUG_ON(nr_pages != i); - } + if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK)) + continue; - /* map them */ - if (populate_pte_fn) { - for_each_possible_cpu(cpu) - for (i = 0; i < nr_pages; i++) - populate_pte_fn(pcpu_chunk_addr(schunk, - cpu, i)); - - err = pcpu_map(schunk, 0, nr_pages); - if (err) - panic("failed to setup static percpu area, err=%d\n", - err); + for (group = 0; group < nr_groups; group++) { + int this_allocs = DIV_ROUND_UP(group_cnt[group], upa); + allocs += this_allocs; + wasted += this_allocs * upa - group_cnt[group]; + } + + /* + * Don't accept if wastage is over 1/3. The + * greater-than comparison ensures upa==1 always + * passes the following check. + */ + if (wasted > num_possible_cpus() / 3) + continue; + + /* and then don't consume more memory */ + if (allocs > last_allocs) + break; + last_allocs = allocs; + best_upa = upa; } + upa = best_upa; - /* link the first chunk in */ - pcpu_first_chunk = dchunk ?: schunk; - pcpu_chunk_relocate(pcpu_first_chunk, -1); + /* allocate and fill alloc_info */ + for (group = 0; group < nr_groups; group++) + nr_units += roundup(group_cnt[group], upa); - /* we're done */ - pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); - return pcpu_unit_size; -} + ai = pcpu_alloc_alloc_info(nr_groups, nr_units); + if (!ai) + return ERR_PTR(-ENOMEM); + cpu_map = ai->groups[0].cpu_map; -/* - * Embedding first chunk setup helper. - */ -static void *pcpue_ptr __initdata; -static size_t pcpue_size __initdata; -static size_t pcpue_unit_size __initdata; + for (group = 0; group < nr_groups; group++) { + ai->groups[group].cpu_map = cpu_map; + cpu_map += roundup(group_cnt[group], upa); + } -static struct page * __init pcpue_get_page(unsigned int cpu, int pageno) -{ - size_t off = (size_t)pageno << PAGE_SHIFT; + ai->static_size = static_size; + ai->reserved_size = reserved_size; + ai->dyn_size = dyn_size; + ai->unit_size = alloc_size / upa; + ai->atom_size = atom_size; + ai->alloc_size = alloc_size; - if (off >= pcpue_size) - return NULL; + for (group = 0, unit = 0; group_cnt[group]; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + + /* + * Initialize base_offset as if all groups are located + * back-to-back. The caller should update this to + * reflect actual allocation. + */ + gi->base_offset = unit * ai->unit_size; - return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off); + for_each_possible_cpu(cpu) + if (group_map[cpu] == group) + gi->cpu_map[gi->nr_units++] = cpu; + gi->nr_units = roundup(gi->nr_units, upa); + unit += gi->nr_units; + } + BUG_ON(unit != nr_units); + + return ai; } +#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */ +#if defined(BUILD_EMBED_FIRST_CHUNK) /** * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem - * @static_size: the size of static percpu area in bytes * @reserved_size: the size of reserved percpu area in bytes - * @dyn_size: free size for dynamic allocation in bytes, -1 for auto - * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto + * @dyn_size: minimum free size for dynamic allocation in bytes + * @atom_size: allocation atom size + * @cpu_distance_fn: callback to determine distance between cpus, optional + * @alloc_fn: function to allocate percpu page + * @free_fn: funtion to free percpu page * * This is a helper to ease setting up embedded first percpu chunk and * can be called where pcpu_setup_first_chunk() is expected. * * If this function is used to setup the first chunk, it is allocated - * as a contiguous area using bootmem allocator and used as-is without - * being mapped into vmalloc area. This enables the first chunk to - * piggy back on the linear physical mapping which often uses larger - * page size. + * by calling @alloc_fn and used as-is without being mapped into + * vmalloc area. Allocations are always whole multiples of @atom_size + * aligned to @atom_size. + * + * This enables the first chunk to piggy back on the linear physical + * mapping which often uses larger page size. Please note that this + * can result in very sparse cpu->unit mapping on NUMA machines thus + * requiring large vmalloc address space. Don't use this allocator if + * vmalloc space is not orders of magnitude larger than distances + * between node memory addresses (ie. 32bit NUMA machines). * - * When @dyn_size is positive, dynamic area might be larger than - * specified to fill page alignment. Also, when @dyn_size is auto, - * @dyn_size does not fill the whole first chunk but only what's - * necessary for page alignment after static and reserved areas. + * @dyn_size specifies the minimum dynamic area size. * * If the needed size is smaller than the minimum or specified unit - * size, the leftover is returned to the bootmem allocator. + * size, the leftover is returned using @free_fn. * * RETURNS: - * The determined pcpu_unit_size which can be used to initialize - * percpu access on success, -errno on failure. + * 0 on success, -errno on failure. */ -ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size, - ssize_t dyn_size, ssize_t unit_size) +int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, + size_t atom_size, + pcpu_fc_cpu_distance_fn_t cpu_distance_fn, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn) { - unsigned int cpu; + void *base = (void *)ULONG_MAX; + void **areas = NULL; + struct pcpu_alloc_info *ai; + size_t size_sum, areas_size, max_distance; + int group, i, rc; + + ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, + cpu_distance_fn); + if (IS_ERR(ai)) + return PTR_ERR(ai); + + size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; + areas_size = PFN_ALIGN(ai->nr_groups * sizeof(void *)); + + areas = alloc_bootmem_nopanic(areas_size); + if (!areas) { + rc = -ENOMEM; + goto out_free; + } - /* determine parameters and allocate */ - pcpue_size = PFN_ALIGN(static_size + reserved_size + - (dyn_size >= 0 ? dyn_size : 0)); - if (dyn_size != 0) - dyn_size = pcpue_size - static_size - reserved_size; + /* allocate, copy and determine base address */ + for (group = 0; group < ai->nr_groups; group++) { + struct pcpu_group_info *gi = &ai->groups[group]; + unsigned int cpu = NR_CPUS; + void *ptr; + + for (i = 0; i < gi->nr_units && cpu == NR_CPUS; i++) + cpu = gi->cpu_map[i]; + BUG_ON(cpu == NR_CPUS); + + /* allocate space for the whole group */ + ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size); + if (!ptr) { + rc = -ENOMEM; + goto out_free_areas; + } + areas[group] = ptr; - if (unit_size >= 0) { - BUG_ON(unit_size < pcpue_size); - pcpue_unit_size = unit_size; - } else - pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE); + base = min(ptr, base); - pcpue_ptr = __alloc_bootmem_nopanic( - num_possible_cpus() * pcpue_unit_size, - PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); - if (!pcpue_ptr) - return -ENOMEM; + for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) { + if (gi->cpu_map[i] == NR_CPUS) { + /* unused unit, free whole */ + free_fn(ptr, ai->unit_size); + continue; + } + /* copy and return the unused part */ + memcpy(ptr, __per_cpu_load, ai->static_size); + free_fn(ptr + size_sum, ai->unit_size - size_sum); + } + } - /* return the leftover and copy */ - for_each_possible_cpu(cpu) { - void *ptr = pcpue_ptr + cpu * pcpue_unit_size; + /* base address is now known, determine group base offsets */ + max_distance = 0; + for (group = 0; group < ai->nr_groups; group++) { + ai->groups[group].base_offset = areas[group] - base; + max_distance = max_t(size_t, max_distance, + ai->groups[group].base_offset); + } + max_distance += ai->unit_size; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " + "space 0x%lx\n", + max_distance, VMALLOC_END - VMALLOC_START); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free; +#endif + } + + pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + ai->dyn_size, ai->unit_size); + + rc = pcpu_setup_first_chunk(ai, base); + goto out_free; + +out_free_areas: + for (group = 0; group < ai->nr_groups; group++) + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); +out_free: + pcpu_free_alloc_info(ai); + if (areas) + free_bootmem(__pa(areas), areas_size); + return rc; +} +#endif /* BUILD_EMBED_FIRST_CHUNK */ + +#ifdef BUILD_PAGE_FIRST_CHUNK +/** + * pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages + * @reserved_size: the size of reserved percpu area in bytes + * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE + * @free_fn: funtion to free percpu page, always called with PAGE_SIZE + * @populate_pte_fn: function to populate pte + * + * This is a helper to ease setting up page-remapped first percpu + * chunk and can be called where pcpu_setup_first_chunk() is expected. + * + * This is the basic allocator. Static percpu area is allocated + * page-by-page into vmalloc area. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_alloc_fn_t alloc_fn, + pcpu_fc_free_fn_t free_fn, + pcpu_fc_populate_pte_fn_t populate_pte_fn) +{ + static struct vm_struct vm; + struct pcpu_alloc_info *ai; + char psize_str[16]; + int unit_pages; + size_t pages_size; + struct page **pages; + int unit, i, j, rc; + + snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10); + + ai = pcpu_build_alloc_info(reserved_size, 0, PAGE_SIZE, NULL); + if (IS_ERR(ai)) + return PTR_ERR(ai); + BUG_ON(ai->nr_groups != 1); + BUG_ON(ai->groups[0].nr_units != num_possible_cpus()); + + unit_pages = ai->unit_size >> PAGE_SHIFT; + + /* unaligned allocations can't be freed, round up to page size */ + pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() * + sizeof(pages[0])); + pages = alloc_bootmem(pages_size); + + /* allocate pages */ + j = 0; + for (unit = 0; unit < num_possible_cpus(); unit++) + for (i = 0; i < unit_pages; i++) { + unsigned int cpu = ai->groups[0].cpu_map[unit]; + void *ptr; + + ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE); + if (!ptr) { + pr_warning("PERCPU: failed to allocate %s page " + "for cpu%u\n", psize_str, cpu); + goto enomem; + } + pages[j++] = virt_to_page(ptr); + } + + /* allocate vm area, map the pages and copy static data */ + vm.flags = VM_ALLOC; + vm.size = num_possible_cpus() * ai->unit_size; + vm_area_register_early(&vm, PAGE_SIZE); + + for (unit = 0; unit < num_possible_cpus(); unit++) { + unsigned long unit_addr = + (unsigned long)vm.addr + unit * ai->unit_size; + + for (i = 0; i < unit_pages; i++) + populate_pte_fn(unit_addr + (i << PAGE_SHIFT)); + + /* pte already populated, the following shouldn't fail */ + rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages], + unit_pages); + if (rc < 0) + panic("failed to map percpu area, err=%d\n", rc); - free_bootmem(__pa(ptr + pcpue_size), - pcpue_unit_size - pcpue_size); - memcpy(ptr, __per_cpu_load, static_size); + /* + * FIXME: Archs with virtual cache should flush local + * cache for the linear mapping here - something + * equivalent to flush_cache_vmap() on the local cpu. + * flush_cache_vmap() can't be used as most supporting + * data structures are not set up yet. + */ + + /* copy static data */ + memcpy((void *)unit_addr, __per_cpu_load, ai->static_size); } /* we're ready, commit */ - pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n", - pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size); + pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n", + unit_pages, psize_str, vm.addr, ai->static_size, + ai->reserved_size, ai->dyn_size); + + rc = pcpu_setup_first_chunk(ai, vm.addr); + goto out_free_ar; + +enomem: + while (--j >= 0) + free_fn(page_address(pages[j]), PAGE_SIZE); + rc = -ENOMEM; +out_free_ar: + free_bootmem(__pa(pages), pages_size); + pcpu_free_alloc_info(ai); + return rc; +} +#endif /* BUILD_PAGE_FIRST_CHUNK */ + +#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA +/* + * Generic SMP percpu area setup. + * + * The embedding helper is used because its behavior closely resembles + * the original non-dynamic generic percpu area setup. This is + * important because many archs have addressing restrictions and might + * fail if the percpu area is located far away from the previous + * location. As an added bonus, in non-NUMA cases, embedding is + * generally a good idea TLB-wise because percpu area can piggy back + * on the physical linear memory mapping which uses large page + * mappings on applicable archs. + */ +unsigned long __per_cpu_offset[NR_CPUS] __read_mostly; +EXPORT_SYMBOL(__per_cpu_offset); - return pcpu_setup_first_chunk(pcpue_get_page, static_size, - reserved_size, dyn_size, - pcpue_unit_size, pcpue_ptr, NULL); +static void * __init pcpu_dfl_fc_alloc(unsigned int cpu, size_t size, + size_t align) +{ + return __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); +} + +static void __init pcpu_dfl_fc_free(void *ptr, size_t size) +{ + free_bootmem(__pa(ptr), size); +} + +void __init setup_per_cpu_areas(void) +{ + unsigned long delta; + unsigned int cpu; + int rc; + + /* + * Always reserve area for module percpu variables. That's + * what the legacy allocator did. + */ + rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE, + PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL, + pcpu_dfl_fc_alloc, pcpu_dfl_fc_free); + if (rc < 0) + panic("Failed to initialize percpu areas."); + + delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start; + for_each_possible_cpu(cpu) + __per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu]; +} +#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */ + +#else /* CONFIG_SMP */ + +/* + * UP percpu area setup. + * + * UP always uses km-based percpu allocator with identity mapping. + * Static percpu variables are indistinguishable from the usual static + * variables and don't require any special preparation. + */ +void __init setup_per_cpu_areas(void) +{ + const size_t unit_size = + roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE, + PERCPU_DYNAMIC_RESERVE)); + struct pcpu_alloc_info *ai; + void *fc; + + ai = pcpu_alloc_alloc_info(1, 1); + fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!ai || !fc) + panic("Failed to allocate memory for percpu areas."); + + ai->dyn_size = unit_size; + ai->unit_size = unit_size; + ai->atom_size = unit_size; + ai->alloc_size = unit_size; + ai->groups[0].nr_units = 1; + ai->groups[0].cpu_map[0] = 0; + + if (pcpu_setup_first_chunk(ai, fc) < 0) + panic("Failed to initialize percpu areas."); +} + +#endif /* CONFIG_SMP */ + +/* + * First and reserved chunks are initialized with temporary allocation + * map in initdata so that they can be used before slab is online. + * This function is called after slab is brought up and replaces those + * with properly allocated maps. + */ +void __init percpu_init_late(void) +{ + struct pcpu_chunk *target_chunks[] = + { pcpu_first_chunk, pcpu_reserved_chunk, NULL }; + struct pcpu_chunk *chunk; + unsigned long flags; + int i; + + for (i = 0; (chunk = target_chunks[i]); i++) { + int *map; + const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]); + + BUILD_BUG_ON(size > PAGE_SIZE); + + map = pcpu_mem_alloc(size); + BUG_ON(!map); + + spin_lock_irqsave(&pcpu_lock, flags); + memcpy(map, chunk->map, size); + chunk->map = map; + spin_unlock_irqrestore(&pcpu_lock, flags); + } } diff --git a/mm/quicklist.c b/mm/quicklist.c index e66d07d..2876349 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -14,12 +14,13 @@ */ #include <linux/kernel.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/module.h> #include <linux/quicklist.h> -DEFINE_PER_CPU(struct quicklist, quicklist)[CONFIG_NR_QUICK]; +DEFINE_PER_CPU(struct quicklist [CONFIG_NR_QUICK], quicklist); #define FRACTION_OF_NODE_MEM 16 @@ -29,7 +30,6 @@ static unsigned long max_pages(unsigned long min_pages) int node = numa_node_id(); struct zone *zones = NODE_DATA(node)->node_zones; int num_cpus_on_node; - const struct cpumask *cpumask_on_node = cpumask_of_node(node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -42,7 +42,7 @@ static unsigned long max_pages(unsigned long min_pages) max = node_free_pages / FRACTION_OF_NODE_MEM; - num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); max /= num_cpus_on_node; return max(max, min_pages); diff --git a/mm/readahead.c b/mm/readahead.c index aa1aa23..77506a2 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/fs.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/blkdev.h> @@ -501,6 +502,12 @@ void page_cache_sync_readahead(struct address_space *mapping, if (!ra->ra_pages) return; + /* be dumb */ + if (filp && (filp->f_mode & FMODE_RANDOM)) { + force_page_cache_readahead(mapping, filp, offset, req_size); + return; + } + /* do read-ahead */ ondemand_readahead(mapping, ra, filp, false, offset, req_size); } @@ -516,7 +523,7 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead); * @req_size: hint: total size of the read which the caller is performing in * pagecache pages * - * page_cache_async_ondemand() should be called when a page is used which + * page_cache_async_readahead() should be called when a page is used which * has the PG_readahead flag; this is a marker to suggest that the application * has used up enough of the readahead window that we should start pulling in * more pages. @@ -547,5 +554,17 @@ page_cache_async_readahead(struct address_space *mapping, /* do read-ahead */ ondemand_readahead(mapping, ra, filp, true, offset, req_size); + +#ifdef CONFIG_BLOCK + /* + * Normally the current page is !uptodate and lock_page() will be + * immediately called to implicitly unplug the device. However this + * is not always true for RAID conifgurations, where data arrives + * not strictly in their submission order. In this case we need to + * explicitly kick off the IO. + */ + if (PageUptodate(page)) + blk_run_backing_dev(mapping->backing_dev_info, NULL); +#endif } EXPORT_SYMBOL_GPL(page_cache_async_readahead); @@ -36,6 +36,11 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) + * + * (code doesn't rely on that order so it could be switched around) + * ->tasklist_lock + * anon_vma->lock (memory_failure, collect_procs_anon) + * pte map lock */ #include <linux/mm.h> @@ -44,29 +49,42 @@ #include <linux/swapops.h> #include <linux/slab.h> #include <linux/init.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/rcupdate.h> #include <linux/module.h> #include <linux/memcontrol.h> #include <linux/mmu_notifier.h> #include <linux/migrate.h> +#include <linux/hugetlb.h> #include <asm/tlbflush.h> #include "internal.h" static struct kmem_cache *anon_vma_cachep; +static struct kmem_cache *anon_vma_chain_cachep; static inline struct anon_vma *anon_vma_alloc(void) { return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); } -static inline void anon_vma_free(struct anon_vma *anon_vma) +void anon_vma_free(struct anon_vma *anon_vma) { kmem_cache_free(anon_vma_cachep, anon_vma); } +static inline struct anon_vma_chain *anon_vma_chain_alloc(void) +{ + return kmem_cache_alloc(anon_vma_chain_cachep, GFP_KERNEL); +} + +static void anon_vma_chain_free(struct anon_vma_chain *anon_vma_chain) +{ + kmem_cache_free(anon_vma_chain_cachep, anon_vma_chain); +} + /** * anon_vma_prepare - attach an anon_vma to a memory region * @vma: the memory region in question @@ -97,80 +115,183 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) int anon_vma_prepare(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; might_sleep(); if (unlikely(!anon_vma)) { struct mm_struct *mm = vma->vm_mm; struct anon_vma *allocated; + avc = anon_vma_chain_alloc(); + if (!avc) + goto out_enomem; + anon_vma = find_mergeable_anon_vma(vma); allocated = NULL; if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) - return -ENOMEM; + goto out_enomem_free_avc; allocated = anon_vma; + /* + * This VMA had no anon_vma yet. This anon_vma is + * the root of any anon_vma tree that might form. + */ + anon_vma->root = anon_vma; } - spin_lock(&anon_vma->lock); + anon_vma_lock(anon_vma); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); if (likely(!vma->anon_vma)) { vma->anon_vma = anon_vma; - list_add_tail(&vma->anon_vma_node, &anon_vma->head); + avc->anon_vma = anon_vma; + avc->vma = vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + list_add_tail(&avc->same_anon_vma, &anon_vma->head); allocated = NULL; + avc = NULL; } spin_unlock(&mm->page_table_lock); + anon_vma_unlock(anon_vma); - spin_unlock(&anon_vma->lock); if (unlikely(allocated)) anon_vma_free(allocated); + if (unlikely(avc)) + anon_vma_chain_free(avc); } return 0; + + out_enomem_free_avc: + anon_vma_chain_free(avc); + out_enomem: + return -ENOMEM; } -void __anon_vma_merge(struct vm_area_struct *vma, struct vm_area_struct *next) +static void anon_vma_chain_link(struct vm_area_struct *vma, + struct anon_vma_chain *avc, + struct anon_vma *anon_vma) { - BUG_ON(vma->anon_vma != next->anon_vma); - list_del(&next->anon_vma_node); + avc->vma = vma; + avc->anon_vma = anon_vma; + list_add(&avc->same_vma, &vma->anon_vma_chain); + + anon_vma_lock(anon_vma); + list_add_tail(&avc->same_anon_vma, &anon_vma->head); + anon_vma_unlock(anon_vma); } -void __anon_vma_link(struct vm_area_struct *vma) +/* + * Attach the anon_vmas from src to dst. + * Returns 0 on success, -ENOMEM on failure. + */ +int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc, *pavc; - if (anon_vma) - list_add_tail(&vma->anon_vma_node, &anon_vma->head); + list_for_each_entry_reverse(pavc, &src->anon_vma_chain, same_vma) { + avc = anon_vma_chain_alloc(); + if (!avc) + goto enomem_failure; + anon_vma_chain_link(dst, avc, pavc->anon_vma); + } + return 0; + + enomem_failure: + unlink_anon_vmas(dst); + return -ENOMEM; } -void anon_vma_link(struct vm_area_struct *vma) +/* + * Attach vma to its own anon_vma, as well as to the anon_vmas that + * the corresponding VMA in the parent process is attached to. + * Returns 0 on success, non-zero on failure. + */ +int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma_chain *avc; + struct anon_vma *anon_vma; - if (anon_vma) { - spin_lock(&anon_vma->lock); - list_add_tail(&vma->anon_vma_node, &anon_vma->head); - spin_unlock(&anon_vma->lock); - } + /* Don't bother if the parent process has no anon_vma here. */ + if (!pvma->anon_vma) + return 0; + + /* + * First, attach the new VMA to the parent VMA's anon_vmas, + * so rmap can find non-COWed pages in child processes. + */ + if (anon_vma_clone(vma, pvma)) + return -ENOMEM; + + /* Then add our own anon_vma. */ + anon_vma = anon_vma_alloc(); + if (!anon_vma) + goto out_error; + avc = anon_vma_chain_alloc(); + if (!avc) + goto out_error_free_anon_vma; + + /* + * The root anon_vma's spinlock is the lock actually used when we + * lock any of the anon_vmas in this anon_vma tree. + */ + anon_vma->root = pvma->anon_vma->root; + /* + * With KSM refcounts, an anon_vma can stay around longer than the + * process it belongs to. The root anon_vma needs to be pinned + * until this anon_vma is freed, because the lock lives in the root. + */ + get_anon_vma(anon_vma->root); + /* Mark this anon_vma as the one where our new (COWed) pages go. */ + vma->anon_vma = anon_vma; + anon_vma_chain_link(vma, avc, anon_vma); + + return 0; + + out_error_free_anon_vma: + anon_vma_free(anon_vma); + out_error: + unlink_anon_vmas(vma); + return -ENOMEM; } -void anon_vma_unlink(struct vm_area_struct *vma) +static void anon_vma_unlink(struct anon_vma_chain *anon_vma_chain) { - struct anon_vma *anon_vma = vma->anon_vma; + struct anon_vma *anon_vma = anon_vma_chain->anon_vma; int empty; + /* If anon_vma_fork fails, we can get an empty anon_vma_chain. */ if (!anon_vma) return; - spin_lock(&anon_vma->lock); - list_del(&vma->anon_vma_node); + anon_vma_lock(anon_vma); + list_del(&anon_vma_chain->same_anon_vma); /* We must garbage collect the anon_vma if it's empty */ - empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); + empty = list_empty(&anon_vma->head) && !anonvma_external_refcount(anon_vma); + anon_vma_unlock(anon_vma); - if (empty) + if (empty) { + /* We no longer need the root anon_vma */ + if (anon_vma->root != anon_vma) + drop_anon_vma(anon_vma->root); anon_vma_free(anon_vma); + } +} + +void unlink_anon_vmas(struct vm_area_struct *vma) +{ + struct anon_vma_chain *avc, *next; + + /* + * Unlink each anon_vma chained to the VMA. This list is ordered + * from newest to oldest, ensuring the root anon_vma gets freed last. + */ + list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) { + anon_vma_unlink(avc); + list_del(&avc->same_vma); + anon_vma_chain_free(avc); + } } static void anon_vma_ctor(void *data) @@ -178,6 +299,7 @@ static void anon_vma_ctor(void *data) struct anon_vma *anon_vma = data; spin_lock_init(&anon_vma->lock); + anonvma_external_refcount_init(anon_vma); INIT_LIST_HEAD(&anon_vma->head); } @@ -185,35 +307,51 @@ void __init anon_vma_init(void) { anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma), 0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor); + anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC); } /* * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *__page_lock_anon_vma(struct page *page) { - struct anon_vma *anon_vma; + struct anon_vma *anon_vma, *root_anon_vma; unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long) page->mapping; - if (!(anon_mapping & PAGE_MAPPING_ANON)) + anon_mapping = (unsigned long) ACCESS_ONCE(page->mapping); + if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; if (!page_mapped(page)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); - spin_lock(&anon_vma->lock); - return anon_vma; + root_anon_vma = ACCESS_ONCE(anon_vma->root); + spin_lock(&root_anon_vma->lock); + + /* + * If this page is still mapped, then its anon_vma cannot have been + * freed. But if it has been unmapped, we have no security against + * the anon_vma structure being freed and reused (for another anon_vma: + * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot + * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting + * anon_vma->root before page_unlock_anon_vma() is called to unlock. + */ + if (page_mapped(page)) + return anon_vma; + + spin_unlock(&root_anon_vma->lock); out: rcu_read_unlock(); return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) + __releases(&anon_vma->root->lock) + __releases(RCU) { - spin_unlock(&anon_vma->lock); + anon_vma_unlock(anon_vma); rcu_read_unlock(); } @@ -228,6 +366,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); unsigned long address; + if (unlikely(is_vm_hugetlb_page(vma))) + pgoff = page->index << huge_page_order(page_hstate(page)); address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { /* page should be within @vma mapping range */ @@ -237,14 +377,19 @@ vma_address(struct page *page, struct vm_area_struct *vma) } /* - * At what user virtual address is page expected in vma? checking that the - * page matches the vma: currently only used on anon pages, by unuse_vma; + * At what user virtual address is page expected in vma? + * Caller should check the page is actually part of the vma. */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { if (PageAnon(page)) { - if ((void *)vma->anon_vma != - (void *)page->mapping - PAGE_MAPPING_ANON) + struct anon_vma *page__anon_vma = page_anon_vma(page); + /* + * Note: swapoff's unuse_vma() is more efficient with this + * check, and needs it to match anon_vma when KSM is active. + */ + if (!vma->anon_vma || !page__anon_vma || + vma->anon_vma->root != page__anon_vma->root) return -EFAULT; } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { if (!vma->vm_file || @@ -264,7 +409,7 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) * * On success returns with pte mapped and locked. */ -pte_t *page_check_address(struct page *page, struct mm_struct *mm, +pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { pgd_t *pgd; @@ -273,6 +418,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, pte_t *pte; spinlock_t *ptl; + if (unlikely(PageHuge(page))) { + pte = huge_pte_offset(mm, address); + ptl = &mm->page_table_lock; + goto check; + } + pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) return NULL; @@ -293,6 +444,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, } ptl = pte_lockptr(mm, pmd); +check: spin_lock(ptl); if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) { *ptlp = ptl; @@ -311,7 +463,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * if the page is not mapped into the page tables of this VMA. Only * valid for normal file or anonymous VMAs. */ -static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { unsigned long address; pte_t *pte; @@ -332,21 +484,15 @@ static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. */ -static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, - unsigned int *mapcount, - unsigned long *vm_flags) +int page_referenced_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, unsigned int *mapcount, + unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; spinlock_t *ptl; int referenced = 0; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -358,6 +504,7 @@ static int page_referenced_one(struct page *page, */ if (vma->vm_flags & VM_LOCKED) { *mapcount = 1; /* break early from loop */ + *vm_flags |= VM_LOCKED; goto out_unmap; } @@ -382,9 +529,10 @@ static int page_referenced_one(struct page *page, out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); -out: + if (referenced) *vm_flags |= vma->vm_flags; +out: return referenced; } @@ -394,7 +542,7 @@ static int page_referenced_anon(struct page *page, { unsigned int mapcount; struct anon_vma *anon_vma; - struct vm_area_struct *vma; + struct anon_vma_chain *avc; int referenced = 0; anon_vma = page_lock_anon_vma(page); @@ -402,7 +550,11 @@ static int page_referenced_anon(struct page *page, return referenced; mapcount = page_mapcount(page); - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -410,7 +562,7 @@ static int page_referenced_anon(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, + referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); if (!mapcount) break; @@ -468,6 +620,9 @@ static int page_referenced_file(struct page *page, mapcount = page_mapcount(page); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; /* * If we are reclaiming on behalf of a cgroup, skip * counting on behalf of references from different @@ -475,7 +630,7 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - referenced += page_referenced_one(page, vma, + referenced += page_referenced_one(page, vma, address, &mapcount, vm_flags); if (!mapcount) break; @@ -501,46 +656,44 @@ int page_referenced(struct page *page, unsigned long *vm_flags) { int referenced = 0; - - if (TestClearPageReferenced(page)) - referenced++; + int we_locked = 0; *vm_flags = 0; - if (page_mapped(page) && page->mapping) { - if (PageAnon(page)) + if (page_mapped(page) && page_rmapping(page)) { + if (!is_locked && (!PageAnon(page) || PageKsm(page))) { + we_locked = trylock_page(page); + if (!we_locked) { + referenced++; + goto out; + } + } + if (unlikely(PageKsm(page))) + referenced += page_referenced_ksm(page, mem_cont, + vm_flags); + else if (PageAnon(page)) referenced += page_referenced_anon(page, mem_cont, vm_flags); - else if (is_locked) + else if (page->mapping) referenced += page_referenced_file(page, mem_cont, vm_flags); - else if (!trylock_page(page)) - referenced++; - else { - if (page->mapping) - referenced += page_referenced_file(page, - mem_cont, vm_flags); + if (we_locked) unlock_page(page); - } } - +out: if (page_test_and_clear_young(page)) referenced++; return referenced; } -static int page_mkclean_one(struct page *page, struct vm_area_struct *vma) +static int page_mkclean_one(struct page *page, struct vm_area_struct *vma, + unsigned long address) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; spinlock_t *ptl; int ret = 0; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 1); if (!pte) goto out; @@ -572,8 +725,12 @@ static int page_mkclean_file(struct address_space *mapping, struct page *page) spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - if (vma->vm_flags & VM_SHARED) - ret += page_mkclean_one(page, vma); + if (vma->vm_flags & VM_SHARED) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret += page_mkclean_one(page, vma, address); + } } spin_unlock(&mapping->i_mmap_lock); return ret; @@ -590,7 +747,7 @@ int page_mkclean(struct page *page) if (mapping) { ret = page_mkclean_file(mapping, page); if (page_test_dirty(page)) { - page_clear_dirty(page); + page_clear_dirty(page, 1); ret = 1; } } @@ -601,27 +758,57 @@ int page_mkclean(struct page *page) EXPORT_SYMBOL_GPL(page_mkclean); /** - * __page_set_anon_rmap - setup new anonymous rmap - * @page: the page to add the mapping to - * @vma: the vm area in which the mapping is added + * page_move_anon_rmap - move a page to our anon_vma + * @page: the page to move to our anon_vma + * @vma: the vma the page belongs to * @address: the user virtual address mapped + * + * When a page belongs exclusively to one process after a COW event, + * that page can be moved into the anon_vma that belongs to just that + * process, so the rmap code will not search the parent or sibling + * processes. */ -static void __page_set_anon_rmap(struct page *page, +void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { struct anon_vma *anon_vma = vma->anon_vma; - BUG_ON(!anon_vma); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(!anon_vma); + VM_BUG_ON(page->index != linear_page_index(vma, address)); + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; +} - page->index = linear_page_index(vma, address); +/** + * __page_set_anon_rmap - set up new anonymous rmap + * @page: Page to add to rmap + * @vma: VM area to add page to. + * @address: User virtual address of the mapping + * @exclusive: the page is exclusively owned by the current process + */ +static void __page_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + + if (PageAnon(page)) + return; /* - * nr_mapped state can be updated without turning off - * interrupts because it is not modified via interrupt. + * If the page isn't exclusively mapped into this vma, + * we must use the _oldest_ possible anon_vma for the + * page mapping! */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!exclusive) + anon_vma = anon_vma->root; + + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + page->index = linear_page_index(vma, address); } /** @@ -646,9 +833,7 @@ static void __page_check_anon_rmap(struct page *page, * are initially only visible via the pagetables, and the pte is locked * over the call to page_add_new_anon_rmap. */ - struct anon_vma *anon_vma = vma->anon_vma; - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; - BUG_ON(page->mapping != (struct address_space *)anon_vma); + BUG_ON(page_anon_vma(page)->root != vma->anon_vma->root); BUG_ON(page->index != linear_page_index(vma, address)); #endif } @@ -659,15 +844,35 @@ static void __page_check_anon_rmap(struct page *page, * @vma: the vm area in which the mapping is added * @address: the user virtual address mapped * - * The caller needs to hold the pte lock and the page must be locked. + * The caller needs to hold the pte lock, and the page must be locked in + * the anon_vma case: to serialize mapping,index checking after setting, + * and to ensure that PageAnon is not being upgraded racily to PageKsm + * (but PageKsm is never downgraded to PageAnon). */ void page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) { + do_page_add_anon_rmap(page, vma, address, 0); +} + +/* + * Special version of the above for do_swap_page, which often runs + * into pages that are exclusively owned by the current process. + * Everybody else should continue to use page_add_anon_rmap above. + */ +void do_page_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ + int first = atomic_inc_and_test(&page->_mapcount); + if (first) + __inc_zone_page_state(page, NR_ANON_PAGES); + if (unlikely(PageKsm(page))) + return; + VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); - if (atomic_inc_and_test(&page->_mapcount)) - __page_set_anon_rmap(page, vma, address); + if (first) + __page_set_anon_rmap(page, vma, address, exclusive); else __page_check_anon_rmap(page, vma, address); } @@ -688,7 +893,8 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start || address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __page_set_anon_rmap(page, vma, address); + __inc_zone_page_state(page, NR_ANON_PAGES); + __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); else @@ -705,31 +911,10 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_mapped_file_stat(page, 1); + mem_cgroup_update_file_mapped(page, 1); } } -#ifdef CONFIG_DEBUG_VM -/** - * page_dup_rmap - duplicate pte mapping to a page - * @page: the page to add the mapping to - * @vma: the vm area being duplicated - * @address: the user virtual address mapped - * - * For copy_page_range only: minimal extract from page_add_file_rmap / - * page_add_anon_rmap, avoiding unnecessary tests (already checked) so it's - * quicker. - * - * The caller needs to hold the pte lock. - */ -void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address) -{ - if (PageAnon(page)) - __page_check_anon_rmap(page, vma, address); - atomic_inc(&page->_mapcount); -} -#endif - /** * page_remove_rmap - take down pte mapping from a page * @page: page to remove mapping from @@ -738,54 +923,58 @@ void page_dup_rmap(struct page *page, struct vm_area_struct *vma, unsigned long */ void page_remove_rmap(struct page *page) { - if (atomic_add_negative(-1, &page->_mapcount)) { - /* - * Now that the last pte has gone, s390 must transfer dirty - * flag from storage key to struct page. We can usually skip - * this if the page is anon, so about to be freed; but perhaps - * not if it's in swapcache - there might be another pte slot - * containing the swap entry, but page not yet written to swap. - */ - if ((!PageAnon(page) || PageSwapCache(page)) && - page_test_dirty(page)) { - page_clear_dirty(page); - set_page_dirty(page); - } - if (PageAnon(page)) - mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, - PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); - mem_cgroup_update_mapped_file_stat(page, -1); - /* - * It would be tidy to reset the PageAnon mapping here, - * but that might overwrite a racing page_add_anon_rmap - * which increments mapcount after us but sets mapping - * before us: so leave the reset to free_hot_cold_page, - * and remember that it's only reliable while mapped. - * Leaving it set also helps swapoff to reinstate ptes - * faster for those pages still in swapcache. - */ + /* page still mapped by someone else? */ + if (!atomic_add_negative(-1, &page->_mapcount)) + return; + + /* + * Now that the last pte has gone, s390 must transfer dirty + * flag from storage key to struct page. We can usually skip + * this if the page is anon, so about to be freed; but perhaps + * not if it's in swapcache - there might be another pte slot + * containing the swap entry, but page not yet written to swap. + */ + if ((!PageAnon(page) || PageSwapCache(page)) && page_test_dirty(page)) { + page_clear_dirty(page, 1); + set_page_dirty(page); + } + /* + * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED + * and not charged by memcg for now. + */ + if (unlikely(PageHuge(page))) + return; + if (PageAnon(page)) { + mem_cgroup_uncharge_page(page); + __dec_zone_page_state(page, NR_ANON_PAGES); + } else { + __dec_zone_page_state(page, NR_FILE_MAPPED); + mem_cgroup_update_file_mapped(page, -1); } + /* + * It would be tidy to reset the PageAnon mapping here, + * but that might overwrite a racing page_add_anon_rmap + * which increments mapcount after us but sets mapping + * before us: so leave the reset to free_hot_cold_page, + * and remember that it's only reliable while mapped. + * Leaving it set also helps swapoff to reinstate ptes + * faster for those pages still in swapcache. + */ } /* * Subfunctions of try_to_unmap: try_to_unmap_one called * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ -static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - int migration) +int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, + unsigned long address, enum ttu_flags flags) { struct mm_struct *mm = vma->vm_mm; - unsigned long address; pte_t *pte; pte_t pteval; spinlock_t *ptl; int ret = SWAP_AGAIN; - address = vma_address(page, vma); - if (address == -EFAULT) - goto out; - pte = page_check_address(page, mm, address, &ptl, 0); if (!pte) goto out; @@ -795,11 +984,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration) { - if (vma->vm_flags & VM_LOCKED) { - ret = SWAP_MLOCK; + if (!(flags & TTU_IGNORE_MLOCK)) { + if (vma->vm_flags & VM_LOCKED) + goto out_mlock; + + if (TTU_ACTION(flags) == TTU_MUNLOCK) goto out_unmap; - } + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; @@ -817,7 +1009,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageAnon(page)) { + if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageAnon(page)) + dec_mm_counter(mm, MM_ANONPAGES); + else + dec_mm_counter(mm, MM_FILEPAGES); + set_pte_at(mm, address, pte, + swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; if (PageSwapCache(page)) { @@ -825,33 +1024,37 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Store the swap location in the pte. * See handle_pte_fault() ... */ - swap_duplicate(entry); + if (swap_duplicate(entry) < 0) { + set_pte_at(mm, address, pte, pteval); + ret = SWAP_FAIL; + goto out_unmap; + } if (list_empty(&mm->mmlist)) { spin_lock(&mmlist_lock); if (list_empty(&mm->mmlist)) list_add(&mm->mmlist, &init_mm.mmlist); spin_unlock(&mmlist_lock); } - dec_mm_counter(mm, anon_rss); + dec_mm_counter(mm, MM_ANONPAGES); + inc_mm_counter(mm, MM_SWAPENTS); } else if (PAGE_MIGRATION) { /* * Store the pfn of the page in a special migration * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(!migration); + BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else if (PAGE_MIGRATION && migration) { + } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); } else - dec_mm_counter(mm, file_rss); - + dec_mm_counter(mm, MM_FILEPAGES); page_remove_rmap(page); page_cache_release(page); @@ -860,6 +1063,27 @@ out_unmap: pte_unmap_unlock(pte, ptl); out: return ret; + +out_mlock: + pte_unmap_unlock(pte, ptl); + + + /* + * We need mmap_sem locking, Otherwise VM_LOCKED check makes + * unstable result and race. Plus, We can't wait here because + * we now hold anon_vma->lock or mapping->i_mmap_lock. + * if trylock failed, the page remain in evictable lru and later + * vmscan could retry to move the page to unevictable lru if the + * page is actually mlocked. + */ + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + ret = SWAP_MLOCK; + } + up_read(&vma->vm_mm->mmap_sem); + } + return ret; } /* @@ -925,11 +1149,10 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; /* - * MLOCK_PAGES => feature is configured. - * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, + * If we can acquire the mmap_sem for read, and vma is VM_LOCKED, * keep the sem while scanning the cluster for mlocking pages. */ - if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { locked_vma = (vma->vm_flags & VM_LOCKED); if (!locked_vma) up_read(&vma->vm_mm->mmap_sem); /* don't need it */ @@ -970,7 +1193,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, page_remove_rmap(page); page_cache_release(page); - dec_mm_counter(mm, file_rss); + dec_mm_counter(mm, MM_FILEPAGES); (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); @@ -979,29 +1202,25 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -/* - * common handling for pages mapped in VM_LOCKED vmas - */ -static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) +static bool is_vma_temporary_stack(struct vm_area_struct *vma) { - int mlocked = 0; + int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); - if (down_read_trylock(&vma->vm_mm->mmap_sem)) { - if (vma->vm_flags & VM_LOCKED) { - mlock_vma_page(page); - mlocked++; /* really mlocked the page */ - } - up_read(&vma->vm_mm->mmap_sem); - } - return mlocked; + if (!maybe_stack) + return false; + + if ((vma->vm_flags & VM_STACK_INCOMPLETE_SETUP) == + VM_STACK_INCOMPLETE_SETUP) + return true; + + return false; } /** * try_to_unmap_anon - unmap or unlock anonymous page using the object-based * rmap method * @page: the page to unmap/unlock - * @unlock: request for unlock rather than unmap [unlikely] - * @migration: unmapping for migration - ignored if @unlock + * @flags: action and flags * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the anon_vma struct it points to. @@ -1013,53 +1232,48 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_anon(struct page *page, int unlock, int migration) +static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; - struct vm_area_struct *vma; - unsigned int mlocked = 0; + struct anon_vma_chain *avc; int ret = SWAP_AGAIN; - if (MLOCK_PAGES && unlikely(unlock)) - ret = SWAP_SUCCESS; /* default for try_to_munlock() */ - anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!((vma->vm_flags & VM_LOCKED) && - page_mapped_in_vma(page, vma))) - continue; /* must visit all unlocked vmas */ - ret = SWAP_MLOCK; /* saw at least one mlocked vma */ - } else { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - break; - } - if (ret == SWAP_MLOCK) { - mlocked = try_to_mlock_page(page, vma); - if (mlocked) - break; /* stop if actually mlocked page */ - } - } + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long address; - page_unlock_anon_vma(anon_vma); + /* + * During exec, a temporary VMA is setup and later moved. + * The VMA is moved under the anon_vma lock but not the + * page tables leading to a race where migration cannot + * find the migration ptes. Rather than increasing the + * locking requirements of exec(), migration skips + * temporary VMAs until after exec() completes. + */ + if (PAGE_MIGRATION && (flags & TTU_MIGRATION) && + is_vma_temporary_stack(vma)) + continue; - if (mlocked) - ret = SWAP_MLOCK; /* actually mlocked the page */ - else if (ret == SWAP_MLOCK) - ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ + address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, vma, address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) + break; + } + page_unlock_anon_vma(anon_vma); return ret; } /** * try_to_unmap_file - unmap/unlock file page using the object-based rmap method * @page: the page to unmap/unlock - * @unlock: request for unlock rather than unmap [unlikely] - * @migration: unmapping for migration - ignored if @unlock + * @flags: action and flags * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. @@ -1071,7 +1285,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int unlock, int migration) +static int try_to_unmap_file(struct page *page, enum ttu_flags flags) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1082,46 +1296,30 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; - unsigned int mlocked = 0; - - if (MLOCK_PAGES && unlikely(unlock)) - ret = SWAP_SUCCESS; /* default for try_to_munlock() */ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!((vma->vm_flags & VM_LOCKED) && - page_mapped_in_vma(page, vma))) - continue; /* must visit all vmas */ - ret = SWAP_MLOCK; - } else { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - goto out; - } - if (ret == SWAP_MLOCK) { - mlocked = try_to_mlock_page(page, vma); - if (mlocked) - break; /* stop if actually mlocked page */ - } + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = try_to_unmap_one(page, vma, address, flags); + if (ret != SWAP_AGAIN || !page_mapped(page)) + goto out; } - if (mlocked) + if (list_empty(&mapping->i_mmap_nonlinear)) goto out; - if (list_empty(&mapping->i_mmap_nonlinear)) + /* + * We don't bother to try to find the munlocked page in nonlinears. + * It's costly. Instead, later, page reclaim logic may call + * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily. + */ + if (TTU_ACTION(flags) == TTU_MUNLOCK) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (MLOCK_PAGES && unlikely(unlock)) { - if (!(vma->vm_flags & VM_LOCKED)) - continue; /* must visit all vmas */ - ret = SWAP_MLOCK; /* leave mlocked == 0 */ - goto out; /* no need to look further */ - } - if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) - continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) max_nl_cursor = cursor; @@ -1154,16 +1352,12 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (!MLOCK_PAGES && !migration && - (vma->vm_flags & VM_LOCKED)) - continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { - ret = try_to_unmap_cluster(cursor, &mapcount, - vma, page); - if (ret == SWAP_MLOCK) - mlocked = 2; /* to return below */ + if (try_to_unmap_cluster(cursor, &mapcount, + vma, page) == SWAP_MLOCK) + ret = SWAP_MLOCK; cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) @@ -1184,17 +1378,13 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); - if (mlocked) - ret = SWAP_MLOCK; /* actually mlocked the page */ - else if (ret == SWAP_MLOCK) - ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ return ret; } /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped - * @migration: migration flag + * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. @@ -1205,16 +1395,18 @@ out: * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. */ -int try_to_unmap(struct page *page, int migration) +int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; BUG_ON(!PageLocked(page)); - if (PageAnon(page)) - ret = try_to_unmap_anon(page, 0, migration); + if (unlikely(PageKsm(page))) + ret = try_to_unmap_ksm(page, flags); + else if (PageAnon(page)) + ret = try_to_unmap_anon(page, flags); else - ret = try_to_unmap_file(page, 0, migration); + ret = try_to_unmap_file(page, flags); if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1230,17 +1422,179 @@ int try_to_unmap(struct page *page, int migration) * * Return values are: * - * SWAP_SUCCESS - no vma's holding page mlocked. + * SWAP_AGAIN - no vma is holding page mlocked, or, * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem + * SWAP_FAIL - page cannot be located at present * SWAP_MLOCK - page is now mlocked. */ int try_to_munlock(struct page *page) { VM_BUG_ON(!PageLocked(page) || PageLRU(page)); - if (PageAnon(page)) - return try_to_unmap_anon(page, 1, 0); + if (unlikely(PageKsm(page))) + return try_to_unmap_ksm(page, TTU_MUNLOCK); + else if (PageAnon(page)) + return try_to_unmap_anon(page, TTU_MUNLOCK); + else + return try_to_unmap_file(page, TTU_MUNLOCK); +} + +#if defined(CONFIG_KSM) || defined(CONFIG_MIGRATION) +/* + * Drop an anon_vma refcount, freeing the anon_vma and anon_vma->root + * if necessary. Be careful to do all the tests under the lock. Once + * we know we are the last user, nobody else can get a reference and we + * can do the freeing without the lock. + */ +void drop_anon_vma(struct anon_vma *anon_vma) +{ + BUG_ON(atomic_read(&anon_vma->external_refcount) <= 0); + if (atomic_dec_and_lock(&anon_vma->external_refcount, &anon_vma->root->lock)) { + struct anon_vma *root = anon_vma->root; + int empty = list_empty(&anon_vma->head); + int last_root_user = 0; + int root_empty = 0; + + /* + * The refcount on a non-root anon_vma got dropped. Drop + * the refcount on the root and check if we need to free it. + */ + if (empty && anon_vma != root) { + BUG_ON(atomic_read(&root->external_refcount) <= 0); + last_root_user = atomic_dec_and_test(&root->external_refcount); + root_empty = list_empty(&root->head); + } + anon_vma_unlock(anon_vma); + + if (empty) { + anon_vma_free(anon_vma); + if (root_empty && last_root_user) + anon_vma_free(root); + } + } +} +#endif + +#ifdef CONFIG_MIGRATION +/* + * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file(): + * Called by migrate.c to remove migration ptes, but might be used more later. + */ +static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct anon_vma *anon_vma; + struct anon_vma_chain *avc; + int ret = SWAP_AGAIN; + + /* + * Note: remove_migration_ptes() cannot use page_lock_anon_vma() + * because that depends on page_mapped(); but not all its usages + * are holding mmap_sem. Users without mmap_sem are required to + * take a reference count to prevent the anon_vma disappearing + */ + anon_vma = page_anon_vma(page); + if (!anon_vma) + return ret; + anon_vma_lock(anon_vma); + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = rmap_one(page, vma, address, arg); + if (ret != SWAP_AGAIN) + break; + } + anon_vma_unlock(anon_vma); + return ret; +} + +static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + struct address_space *mapping = page->mapping; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct vm_area_struct *vma; + struct prio_tree_iter iter; + int ret = SWAP_AGAIN; + + if (!mapping) + return ret; + spin_lock(&mapping->i_mmap_lock); + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { + unsigned long address = vma_address(page, vma); + if (address == -EFAULT) + continue; + ret = rmap_one(page, vma, address, arg); + if (ret != SWAP_AGAIN) + break; + } + /* + * No nonlinear handling: being always shared, nonlinear vmas + * never contain migration ptes. Decide what to do about this + * limitation to linear when we need rmap_walk() on nonlinear. + */ + spin_unlock(&mapping->i_mmap_lock); + return ret; +} + +int rmap_walk(struct page *page, int (*rmap_one)(struct page *, + struct vm_area_struct *, unsigned long, void *), void *arg) +{ + VM_BUG_ON(!PageLocked(page)); + + if (unlikely(PageKsm(page))) + return rmap_walk_ksm(page, rmap_one, arg); + else if (PageAnon(page)) + return rmap_walk_anon(page, rmap_one, arg); else - return try_to_unmap_file(page, 1, 0); + return rmap_walk_file(page, rmap_one, arg); +} +#endif /* CONFIG_MIGRATION */ + +#ifdef CONFIG_HUGETLB_PAGE +/* + * The following three functions are for anonymous (private mapped) hugepages. + * Unlike common anonymous pages, anonymous hugepages have no accounting code + * and no lru code, because we handle hugepages differently from common pages. + */ +static void __hugepage_set_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address, int exclusive) +{ + struct anon_vma *anon_vma = vma->anon_vma; + + BUG_ON(!anon_vma); + + if (PageAnon(page)) + return; + if (!exclusive) + anon_vma = anon_vma->root; + + anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + page->mapping = (struct address_space *) anon_vma; + page->index = linear_page_index(vma, address); } +void hugepage_add_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + struct anon_vma *anon_vma = vma->anon_vma; + int first; + + BUG_ON(!PageLocked(page)); + BUG_ON(!anon_vma); + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + first = atomic_inc_and_test(&page->_mapcount); + if (first) + __hugepage_set_anon_rmap(page, vma, address, 0); +} + +void hugepage_add_new_anon_rmap(struct page *page, + struct vm_area_struct *vma, unsigned long address) +{ + BUG_ON(address < vma->vm_start || address >= vma->vm_end); + atomic_set(&page->_mapcount, 0); + __hugepage_set_anon_rmap(page, vma, address, 1); +} +#endif /* CONFIG_HUGETLB_PAGE */ @@ -28,8 +28,8 @@ #include <linux/file.h> #include <linux/mm.h> #include <linux/module.h> +#include <linux/percpu_counter.h> #include <linux/swap.h> -#include <linux/ima.h> static struct vfsmount *shm_mnt; @@ -42,6 +42,7 @@ static struct vfsmount *shm_mnt; #include <linux/xattr.h> #include <linux/exportfs.h> +#include <linux/posix_acl.h> #include <linux/generic_acl.h> #include <linux/mman.h> #include <linux/string.h> @@ -49,7 +50,6 @@ static struct vfsmount *shm_mnt; #include <linux/backing-dev.h> #include <linux/shmem_fs.h> #include <linux/writeback.h> -#include <linux/vfs.h> #include <linux/blkdev.h> #include <linux/security.h> #include <linux/swapops.h> @@ -219,7 +219,7 @@ static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; -static struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ @@ -234,10 +234,10 @@ static void shmem_free_blocks(struct inode *inode, long pages) { struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks += pages; + percpu_counter_add(&sbinfo->used_blocks, -pages); + spin_lock(&inode->i_lock); inode->i_blocks -= pages*BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } } @@ -417,25 +417,21 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long if (sgp == SGP_READ) return shmem_swp_map(ZERO_PAGE(0)); /* - * Test free_blocks against 1 not 0, since we have 1 data + * Test used_blocks against 1 less max_blocks, since we have 1 data * page (and perhaps indirect index pages) yet to allocate: * a waste to allocate index if we cannot allocate data. */ if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks <= 1) { - spin_unlock(&sbinfo->stat_lock); + if (percpu_counter_compare(&sbinfo->used_blocks, (sbinfo->max_blocks - 1)) > 0) return ERR_PTR(-ENOSPC); - } - sbinfo->free_blocks--; + percpu_counter_inc(&sbinfo->used_blocks); + spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } spin_unlock(&info->lock); page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); - if (page) - set_page_private(page, 0); spin_lock(&info->lock); if (!page) { @@ -730,10 +726,11 @@ done2: if (inode->i_mapping->nrpages && (info->flags & SHMEM_PAGEIN)) { /* * Call truncate_inode_pages again: racing shmem_unuse_inode - * may have swizzled a page in from swap since vmtruncate or - * generic_delete_inode did it, before we lowered next_index. - * Also, though shmem_getpage checks i_size before adding to - * cache, no recheck after: so fix the narrow window there too. + * may have swizzled a page in from swap since + * truncate_pagecache or generic_delete_inode did it, before we + * lowered next_index. Also, though shmem_getpage checks + * i_size before adding to cache, no recheck after: so fix the + * narrow window there too. * * Recalling truncate_inode_pages_range and unmap_mapping_range * every time for punch_hole (which never got a chance to clear @@ -763,19 +760,21 @@ done2: } } -static void shmem_truncate(struct inode *inode) -{ - shmem_truncate_range(inode, inode->i_size, (loff_t)-1); -} - static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) { struct inode *inode = dentry->d_inode; - struct page *page = NULL; + loff_t newsize = attr->ia_size; int error; - if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) { - if (attr->ia_size < inode->i_size) { + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE) + && newsize != inode->i_size) { + struct page *page = NULL; + + if (newsize < inode->i_size) { /* * If truncating down to a partial page, then * if that page is already allocated, hold it @@ -783,9 +782,9 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * truncate_partial_page cannnot miss it were * it assigned to swap. */ - if (attr->ia_size & (PAGE_CACHE_SIZE-1)) { + if (newsize & (PAGE_CACHE_SIZE-1)) { (void) shmem_getpage(inode, - attr->ia_size>>PAGE_CACHE_SHIFT, + newsize >> PAGE_CACHE_SHIFT, &page, SGP_READ, NULL); if (page) unlock_page(page); @@ -797,36 +796,38 @@ static int shmem_notify_change(struct dentry *dentry, struct iattr *attr) * if it's being fully truncated to zero-length: the * nrpages check is efficient enough in that case. */ - if (attr->ia_size) { + if (newsize) { struct shmem_inode_info *info = SHMEM_I(inode); spin_lock(&info->lock); info->flags &= ~SHMEM_PAGEIN; spin_unlock(&info->lock); } } + + /* XXX(truncate): truncate_setsize should be called last */ + truncate_setsize(inode, newsize); + if (page) + page_cache_release(page); + shmem_truncate_range(inode, newsize, (loff_t)-1); } - error = inode_change_ok(inode, attr); - if (!error) - error = inode_setattr(inode, attr); + setattr_copy(inode, attr); #ifdef CONFIG_TMPFS_POSIX_ACL - if (!error && (attr->ia_valid & ATTR_MODE)) - error = generic_acl_chmod(inode, &shmem_acl_ops); + if (attr->ia_valid & ATTR_MODE) + error = generic_acl_chmod(inode); #endif - if (page) - page_cache_release(page); return error; } -static void shmem_delete_inode(struct inode *inode) +static void shmem_evict_inode(struct inode *inode) { struct shmem_inode_info *info = SHMEM_I(inode); - if (inode->i_op->truncate == shmem_truncate) { + if (inode->i_mapping->a_ops == &shmem_aops) { truncate_inode_pages(inode->i_mapping, 0); shmem_unacct_size(info->flags, inode->i_size); inode->i_size = 0; - shmem_truncate(inode); + shmem_truncate_range(inode, 0, (loff_t)-1); if (!list_empty(&info->swaplist)) { mutex_lock(&shmem_swaplist_mutex); list_del_init(&info->swaplist); @@ -835,7 +836,7 @@ static void shmem_delete_inode(struct inode *inode) } BUG_ON(inode->i_blocks); shmem_free_inode(inode->i_sb); - clear_inode(inode); + end_writeback(inode); } static inline int shmem_find_swp(swp_entry_t entry, swp_entry_t *dir, swp_entry_t *edir) @@ -932,7 +933,7 @@ found: /* * Move _head_ to start search for next from here. - * But be careful: shmem_delete_inode checks list_empty without taking + * But be careful: shmem_evict_inode checks list_empty without taking * mutex, and there's an instant in list_move_tail when info->swaplist * would appear empty, if it were the only one on shmem_swaplist. We * could avoid doing it if inode NULL; or use this minor optimization. @@ -1018,7 +1019,14 @@ int shmem_unuse(swp_entry_t entry, struct page *page) goto out; } mutex_unlock(&shmem_swaplist_mutex); -out: return found; /* 0 or 1 or -ENOMEM */ + /* + * Can some race bring us here? We've been holding page lock, + * so I think not; but would rather try again later than BUG() + */ + unlock_page(page); + page_cache_release(page); +out: + return (found < 0) ? found : 0; } /* @@ -1047,8 +1055,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * sync from ever calling shmem_writepage; but a stacking filesystem * may use the ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for pdflush or sync. However, in those cases, we do still - * want to check if there's a redundant swappage to be discarded. + * and not for the writeback threads or sync. However, in those cases, + * we do still want to check if there's a redundant swappage to be + * discarded. */ if (wbc->for_reclaim) swap = get_swap_page(); @@ -1080,7 +1089,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) else inode = NULL; spin_unlock(&info->lock); - swap_duplicate(swap); + swap_shmem_alloc(swap); BUG_ON(page_mapped(page)); page_cache_release(page); /* pagecache ref */ swap_writepage(page, wbc); @@ -1097,6 +1106,10 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) shmem_swp_unmap(entry); unlock: spin_unlock(&info->lock); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ swapcache_free(swap, NULL); redirty: set_page_dirty(page); @@ -1210,6 +1223,7 @@ static int shmem_getpage(struct inode *inode, unsigned long idx, struct shmem_sb_info *sbinfo; struct page *filepage = *pagep; struct page *swappage; + struct page *prealloc_page = NULL; swp_entry_t *entry; swp_entry_t swap; gfp_t gfp; @@ -1234,7 +1248,6 @@ repeat: filepage = find_lock_page(mapping, idx); if (filepage && PageUptodate(filepage)) goto done; - error = 0; gfp = mapping_gfp_mask(mapping); if (!filepage) { /* @@ -1245,7 +1258,19 @@ repeat: if (error) goto failed; radix_tree_preload_end(); + if (sgp != SGP_READ && !prealloc_page) { + /* We don't care if this fails */ + prealloc_page = shmem_alloc_page(gfp, info, idx); + if (prealloc_page) { + if (mem_cgroup_cache_charge(prealloc_page, + current->mm, GFP_KERNEL)) { + page_cache_release(prealloc_page); + prealloc_page = NULL; + } + } + } } + error = 0; spin_lock(&info->lock); shmem_recalc_inode(inode); @@ -1374,17 +1399,16 @@ repeat: shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { - spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks == 0 || + if ((percpu_counter_compare(&sbinfo->used_blocks, sbinfo->max_blocks) > 0) || shmem_acct_block(info->flags)) { - spin_unlock(&sbinfo->stat_lock); spin_unlock(&info->lock); error = -ENOSPC; goto failed; } - sbinfo->free_blocks--; + percpu_counter_inc(&sbinfo->used_blocks); + spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&sbinfo->stat_lock); + spin_unlock(&inode->i_lock); } else if (shmem_acct_block(info->flags)) { spin_unlock(&info->lock); error = -ENOSPC; @@ -1394,28 +1418,38 @@ repeat: if (!filepage) { int ret; - spin_unlock(&info->lock); - filepage = shmem_alloc_page(gfp, info, idx); - if (!filepage) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - error = -ENOMEM; - goto failed; - } - SetPageSwapBacked(filepage); + if (!prealloc_page) { + spin_unlock(&info->lock); + filepage = shmem_alloc_page(gfp, info, idx); + if (!filepage) { + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + error = -ENOMEM; + goto failed; + } + SetPageSwapBacked(filepage); - /* Precharge page while we can wait, compensate after */ - error = mem_cgroup_cache_charge(filepage, current->mm, - GFP_KERNEL); - if (error) { - page_cache_release(filepage); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - filepage = NULL; - goto failed; + /* + * Precharge page while we can wait, compensate + * after + */ + error = mem_cgroup_cache_charge(filepage, + current->mm, GFP_KERNEL); + if (error) { + page_cache_release(filepage); + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + filepage = NULL; + goto failed; + } + + spin_lock(&info->lock); + } else { + filepage = prealloc_page; + prealloc_page = NULL; + SetPageSwapBacked(filepage); } - spin_lock(&info->lock); entry = shmem_swp_alloc(info, idx, sgp); if (IS_ERR(entry)) error = PTR_ERR(entry); @@ -1456,13 +1490,19 @@ repeat: } done: *pagep = filepage; - return 0; + error = 0; + goto out; failed: if (*pagep != filepage) { unlock_page(filepage); page_cache_release(filepage); } +out: + if (prealloc_page) { + mem_cgroup_uncharge_cache_page(prealloc_page); + page_cache_release(prealloc_page); + } return error; } @@ -1534,8 +1574,8 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -static struct inode *shmem_get_inode(struct super_block *sb, int mode, - dev_t dev, unsigned long flags) +static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, + int mode, dev_t dev, unsigned long flags) { struct inode *inode; struct shmem_inode_info *info; @@ -1546,9 +1586,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, inode = new_inode(sb); if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); + inode->i_ino = get_next_ino(); + inode_init_owner(inode, dir, mode); inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &shmem_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; @@ -1558,6 +1597,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, int mode, spin_lock_init(&info->lock); info->flags = flags & VM_NORESERVE; INIT_LIST_HEAD(&info->swaplist); + cache_no_acl(inode); switch (mode & S_IFMT) { default: @@ -1629,8 +1669,8 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - unlock_page(page); set_page_dirty(page); + unlock_page(page); page_cache_release(page); return copied; @@ -1779,17 +1819,16 @@ static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = TMPFS_MAGIC; buf->f_bsize = PAGE_CACHE_SIZE; buf->f_namelen = NAME_MAX; - spin_lock(&sbinfo->stat_lock); if (sbinfo->max_blocks) { buf->f_blocks = sbinfo->max_blocks; - buf->f_bavail = buf->f_bfree = sbinfo->free_blocks; + buf->f_bavail = buf->f_bfree = + sbinfo->max_blocks - percpu_counter_sum(&sbinfo->used_blocks); } if (sbinfo->max_inodes) { buf->f_files = sbinfo->max_inodes; buf->f_ffree = sbinfo->free_inodes; } /* else leave those fields 0 like simple_statfs */ - spin_unlock(&sbinfo->stat_lock); return 0; } @@ -1802,7 +1841,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) struct inode *inode; int error = -ENOSPC; - inode = shmem_get_inode(dir->i_sb, mode, dev, VM_NORESERVE); + inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { error = security_inode_init_security(inode, dir, NULL, NULL, NULL); @@ -1812,16 +1851,15 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) return error; } } - error = shmem_acl_init(inode, dir); +#ifdef CONFIG_TMPFS_POSIX_ACL + error = generic_acl_init(inode, dir); if (error) { iput(inode); return error; } - if (dir->i_mode & S_ISGID) { - inode->i_gid = dir->i_gid; - if (S_ISDIR(mode)) - inode->i_mode |= S_ISGID; - } +#else + error = 0; +#endif dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); @@ -1866,7 +1904,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME; inc_nlink(inode); - atomic_inc(&inode->i_count); /* New dentry reference */ + ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ d_instantiate(dentry, inode); out: @@ -1941,7 +1979,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s if (len > PAGE_CACHE_SIZE) return -ENAMETOOLONG; - inode = shmem_get_inode(dir->i_sb, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); + inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); if (!inode) return -ENOSPC; @@ -1967,17 +2005,15 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s iput(inode); return error; } - unlock_page(page); inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); kunmap_atomic(kaddr, KM_USER0); set_page_dirty(page); + unlock_page(page); page_cache_release(page); } - if (dir->i_mode & S_ISGID) - inode->i_gid = dir->i_gid; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); @@ -2017,7 +2053,6 @@ static const struct inode_operations shmem_symlink_inline_operations = { }; static const struct inode_operations shmem_symlink_inode_operations = { - .truncate = shmem_truncate, .readlink = generic_readlink, .follow_link = shmem_follow_link, .put_link = shmem_put_link, @@ -2031,39 +2066,40 @@ static const struct inode_operations shmem_symlink_inode_operations = { * filesystem level, though. */ -static size_t shmem_xattr_security_list(struct inode *inode, char *list, +static size_t shmem_xattr_security_list(struct dentry *dentry, char *list, size_t list_len, const char *name, - size_t name_len) + size_t name_len, int handler_flags) { - return security_inode_listsecurity(inode, list, list_len); + return security_inode_listsecurity(dentry->d_inode, list, list_len); } -static int shmem_xattr_security_get(struct inode *inode, const char *name, - void *buffer, size_t size) +static int shmem_xattr_security_get(struct dentry *dentry, const char *name, + void *buffer, size_t size, int handler_flags) { if (strcmp(name, "") == 0) return -EINVAL; - return xattr_getsecurity(inode, name, buffer, size); + return xattr_getsecurity(dentry->d_inode, name, buffer, size); } -static int shmem_xattr_security_set(struct inode *inode, const char *name, - const void *value, size_t size, int flags) +static int shmem_xattr_security_set(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags, int handler_flags) { if (strcmp(name, "") == 0) return -EINVAL; - return security_inode_setsecurity(inode, name, value, size, flags); + return security_inode_setsecurity(dentry->d_inode, name, value, + size, flags); } -static struct xattr_handler shmem_xattr_security_handler = { +static const struct xattr_handler shmem_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = shmem_xattr_security_list, .get = shmem_xattr_security_get, .set = shmem_xattr_security_set, }; -static struct xattr_handler *shmem_xattr_handlers[] = { - &shmem_xattr_acl_access_handler, - &shmem_xattr_acl_default_handler, +static const struct xattr_handler *shmem_xattr_handlers[] = { + &generic_acl_access_handler, + &generic_acl_default_handler, &shmem_xattr_security_handler, NULL }; @@ -2111,7 +2147,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, if (*len < 3) return 255; - if (hlist_unhashed(&inode->i_hash)) { + if (inode_unhashed(inode)) { /* Unfortunately insert_inode_hash is not idempotent, * so as we hash inodes here rather than at creation * time, we need a lock to ensure we only try @@ -2119,7 +2155,7 @@ static int shmem_encode_fh(struct dentry *dentry, __u32 *fh, int *len, */ static DEFINE_SPINLOCK(lock); spin_lock(&lock); - if (hlist_unhashed(&inode->i_hash)) + if (inode_unhashed(inode)) __insert_inode_hash(inode, inode->i_ino + inode->i_generation); spin_unlock(&lock); @@ -2233,7 +2269,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); struct shmem_sb_info config = *sbinfo; - unsigned long blocks; unsigned long inodes; int error = -EINVAL; @@ -2241,9 +2276,8 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) return error; spin_lock(&sbinfo->stat_lock); - blocks = sbinfo->max_blocks - sbinfo->free_blocks; inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (config.max_blocks < blocks) + if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) goto out; if (config.max_inodes < inodes) goto out; @@ -2260,7 +2294,6 @@ static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) error = 0; sbinfo->max_blocks = config.max_blocks; - sbinfo->free_blocks = config.max_blocks - blocks; sbinfo->max_inodes = config.max_inodes; sbinfo->free_inodes = config.max_inodes - inodes; @@ -2293,12 +2326,14 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs) static void shmem_put_super(struct super_block *sb) { - kfree(sb->s_fs_info); + struct shmem_sb_info *sbinfo = SHMEM_SB(sb); + + percpu_counter_destroy(&sbinfo->used_blocks); + kfree(sbinfo); sb->s_fs_info = NULL; } -static int shmem_fill_super(struct super_block *sb, - void *data, int silent) +int shmem_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct dentry *root; @@ -2306,17 +2341,14 @@ static int shmem_fill_super(struct super_block *sb, int err = -ENOMEM; /* Round up to L1_CACHE_BYTES to resist false sharing */ - sbinfo = kmalloc(max((int)sizeof(struct shmem_sb_info), + sbinfo = kzalloc(max((int)sizeof(struct shmem_sb_info), L1_CACHE_BYTES), GFP_KERNEL); if (!sbinfo) return -ENOMEM; - sbinfo->max_blocks = 0; - sbinfo->max_inodes = 0; sbinfo->mode = S_IRWXUGO | S_ISVTX; sbinfo->uid = current_fsuid(); sbinfo->gid = current_fsgid(); - sbinfo->mpol = NULL; sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS @@ -2339,7 +2371,8 @@ static int shmem_fill_super(struct super_block *sb, #endif spin_lock_init(&sbinfo->stat_lock); - sbinfo->free_blocks = sbinfo->max_blocks; + if (percpu_counter_init(&sbinfo->used_blocks, 0)) + goto failed; sbinfo->free_inodes = sbinfo->max_inodes; sb->s_maxbytes = SHMEM_MAX_BYTES; @@ -2353,7 +2386,7 @@ static int shmem_fill_super(struct super_block *sb, sb->s_flags |= MS_POSIXACL; #endif - inode = shmem_get_inode(sb, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); + inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE); if (!inode) goto failed; inode->i_uid = sbinfo->uid; @@ -2382,14 +2415,20 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) return &p->vfs_inode; } +static void shmem_i_callback(struct rcu_head *head) +{ + struct inode *inode = container_of(head, struct inode, i_rcu); + INIT_LIST_HEAD(&inode->i_dentry); + kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); +} + static void shmem_destroy_inode(struct inode *inode) { if ((inode->i_mode & S_IFMT) == S_IFREG) { /* only struct inode is valid if it's an inline symlink */ mpol_free_shared_policy(&SHMEM_I(inode)->policy); } - shmem_acl_destroy_inode(inode); - kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); + call_rcu(&inode->i_rcu, shmem_i_callback); } static void init_once(void *foo) @@ -2397,10 +2436,6 @@ static void init_once(void *foo) struct shmem_inode_info *p = (struct shmem_inode_info *) foo; inode_init_once(&p->vfs_inode); -#ifdef CONFIG_TMPFS_POSIX_ACL - p->i_acl = NULL; - p->i_default_acl = NULL; -#endif } static int init_inodecache(void) @@ -2425,6 +2460,7 @@ static const struct address_space_operations shmem_aops = { .write_end = shmem_write_end, #endif .migratepage = migrate_page, + .error_remove_page = generic_error_remove_page, }; static const struct file_operations shmem_file_operations = { @@ -2435,14 +2471,13 @@ static const struct file_operations shmem_file_operations = { .write = do_sync_write, .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, - .fsync = simple_sync_file, + .fsync = noop_fsync, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, #endif }; static const struct inode_operations shmem_inode_operations = { - .truncate = shmem_truncate, .setattr = shmem_notify_change, .truncate_range = shmem_truncate_range, #ifdef CONFIG_TMPFS_POSIX_ACL @@ -2450,7 +2485,7 @@ static const struct inode_operations shmem_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = generic_check_acl, #endif }; @@ -2473,7 +2508,7 @@ static const struct inode_operations shmem_dir_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = generic_check_acl, #endif }; @@ -2484,7 +2519,7 @@ static const struct inode_operations shmem_special_inode_operations = { .getxattr = generic_getxattr, .listxattr = generic_listxattr, .removexattr = generic_removexattr, - .permission = shmem_permission, + .check_acl = generic_check_acl, #endif }; @@ -2496,12 +2531,12 @@ static const struct super_operations shmem_ops = { .remount_fs = shmem_remount_fs, .show_options = shmem_show_options, #endif - .delete_inode = shmem_delete_inode, + .evict_inode = shmem_evict_inode, .drop_inode = generic_delete_inode, .put_super = shmem_put_super, }; -static struct vm_operations_struct shmem_vm_ops = { +static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, @@ -2510,20 +2545,20 @@ static struct vm_operations_struct shmem_vm_ops = { }; -static int shmem_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, struct vfsmount *mnt) +static struct dentry *shmem_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) { - return get_sb_nodev(fs_type, flags, data, shmem_fill_super, mnt); + return mount_nodev(fs_type, flags, data, shmem_fill_super); } static struct file_system_type tmpfs_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", - .get_sb = shmem_get_sb, + .mount = shmem_mount, .kill_sb = kill_litter_super, }; -static int __init init_tmpfs(void) +int __init init_tmpfs(void) { int error; @@ -2561,6 +2596,45 @@ out4: return error; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. + */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + swp_entry_t entry = { .val = 0 }, *ptr; + struct page *page = NULL; + struct shmem_inode_info *info = SHMEM_I(inode); + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + + spin_lock(&info->lock); + ptr = shmem_swp_entry(info, pgoff, NULL); +#ifdef CONFIG_SWAP + if (ptr && ptr->val) { + entry.val = ptr->val; + page = find_get_page(&swapper_space, entry.val); + } else +#endif + page = find_get_page(inode->i_mapping, pgoff); + if (ptr) + shmem_swp_unmap(ptr); + spin_unlock(&info->lock); +out: + *pagep = page; + *ent = entry; +} +#endif + #else /* !CONFIG_SHMEM */ /* @@ -2576,11 +2650,11 @@ out4: static struct file_system_type tmpfs_fs_type = { .name = "tmpfs", - .get_sb = ramfs_get_sb, + .mount = ramfs_mount, .kill_sb = kill_litter_super, }; -static int __init init_tmpfs(void) +int __init init_tmpfs(void) { BUG_ON(register_filesystem(&tmpfs_fs_type) != 0); @@ -2595,9 +2669,39 @@ int shmem_unuse(swp_entry_t entry, struct page *page) return 0; } +int shmem_lock(struct file *file, int lock, struct user_struct *user) +{ + return 0; +} + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_get_shmem_target - find a page or entry assigned to the shmem file + * @inode: the inode to be searched + * @pgoff: the offset to be searched + * @pagep: the pointer for the found page to be stored + * @ent: the pointer for the found swap entry to be stored + * + * If a page is found, refcount of it is incremented. Callers should handle + * these refcount. + */ +void mem_cgroup_get_shmem_target(struct inode *inode, pgoff_t pgoff, + struct page **pagep, swp_entry_t *ent) +{ + struct page *page = NULL; + + if ((pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) + goto out; + page = find_get_page(inode->i_mapping, pgoff); +out: + *pagep = page; + *ent = (swp_entry_t){ .val = 0 }; +} +#endif + #define shmem_vm_ops generic_file_vm_ops #define shmem_file_operations ramfs_file_operations -#define shmem_get_inode(sb, mode, dev, flags) ramfs_get_inode(sb, mode, dev) +#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) #define shmem_acct_size(flags, size) 0 #define shmem_unacct_size(flags, size) do {} while (0) #define SHMEM_MAX_BYTES MAX_LFS_FILESIZE @@ -2617,7 +2721,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags int error; struct file *file; struct inode *inode; - struct dentry *dentry, *root; + struct path path; + struct dentry *root; struct qstr this; if (IS_ERR(shm_mnt)) @@ -2634,38 +2739,35 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags this.len = strlen(name); this.hash = 0; /* will go */ root = shm_mnt->mnt_root; - dentry = d_alloc(root, &this); - if (!dentry) + path.dentry = d_alloc(root, &this); + if (!path.dentry) goto put_memory; - - error = -ENFILE; - file = get_empty_filp(); - if (!file) - goto put_dentry; + path.mnt = mntget(shm_mnt); error = -ENOSPC; - inode = shmem_get_inode(root->d_sb, S_IFREG | S_IRWXUGO, 0, flags); + inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); if (!inode) - goto close_file; + goto put_dentry; - d_instantiate(dentry, inode); + d_instantiate(path.dentry, inode); inode->i_size = size; inode->i_nlink = 0; /* It is unlinked */ - init_file(file, shm_mnt, dentry, FMODE_WRITE | FMODE_READ, - &shmem_file_operations); - #ifndef CONFIG_MMU error = ramfs_nommu_expand_for_mapping(inode, size); if (error) - goto close_file; + goto put_dentry; #endif - ima_counts_get(file); + + error = -ENFILE; + file = alloc_file(&path, FMODE_WRITE | FMODE_READ, + &shmem_file_operations); + if (!file) + goto put_dentry; + return file; -close_file: - put_filp(file); put_dentry: - dput(dentry); + path_put(&path); put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); @@ -2691,5 +2793,3 @@ int shmem_zero_setup(struct vm_area_struct *vma) vma->vm_ops = &shmem_vm_ops; return 0; } - -module_init(init_tmpfs) diff --git a/mm/shmem_acl.c b/mm/shmem_acl.c deleted file mode 100644 index 8e5aadd..0000000 --- a/mm/shmem_acl.c +++ /dev/null @@ -1,197 +0,0 @@ -/* - * mm/shmem_acl.c - * - * (C) 2005 Andreas Gruenbacher <agruen@suse.de> - * - * This file is released under the GPL. - */ - -#include <linux/fs.h> -#include <linux/shmem_fs.h> -#include <linux/xattr.h> -#include <linux/generic_acl.h> - -/** - * shmem_get_acl - generic_acl_operations->getacl() operation - */ -static struct posix_acl * -shmem_get_acl(struct inode *inode, int type) -{ - struct posix_acl *acl = NULL; - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - acl = posix_acl_dup(SHMEM_I(inode)->i_acl); - break; - - case ACL_TYPE_DEFAULT: - acl = posix_acl_dup(SHMEM_I(inode)->i_default_acl); - break; - } - spin_unlock(&inode->i_lock); - - return acl; -} - -/** - * shmem_set_acl - generic_acl_operations->setacl() operation - */ -static void -shmem_set_acl(struct inode *inode, int type, struct posix_acl *acl) -{ - struct posix_acl *free = NULL; - - spin_lock(&inode->i_lock); - switch(type) { - case ACL_TYPE_ACCESS: - free = SHMEM_I(inode)->i_acl; - SHMEM_I(inode)->i_acl = posix_acl_dup(acl); - break; - - case ACL_TYPE_DEFAULT: - free = SHMEM_I(inode)->i_default_acl; - SHMEM_I(inode)->i_default_acl = posix_acl_dup(acl); - break; - } - spin_unlock(&inode->i_lock); - posix_acl_release(free); -} - -struct generic_acl_operations shmem_acl_ops = { - .getacl = shmem_get_acl, - .setacl = shmem_set_acl, -}; - -/** - * shmem_list_acl_access, shmem_get_acl_access, shmem_set_acl_access, - * shmem_xattr_acl_access_handler - plumbing code to implement the - * system.posix_acl_access xattr using the generic acl functions. - */ - -static size_t -shmem_list_acl_access(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) -{ - return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, - list, list_size); -} - -static int -shmem_get_acl_access(struct inode *inode, const char *name, void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, buffer, - size); -} - -static int -shmem_set_acl_access(struct inode *inode, const char *name, const void *value, - size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_ACCESS, value, - size); -} - -struct xattr_handler shmem_xattr_acl_access_handler = { - .prefix = POSIX_ACL_XATTR_ACCESS, - .list = shmem_list_acl_access, - .get = shmem_get_acl_access, - .set = shmem_set_acl_access, -}; - -/** - * shmem_list_acl_default, shmem_get_acl_default, shmem_set_acl_default, - * shmem_xattr_acl_default_handler - plumbing code to implement the - * system.posix_acl_default xattr using the generic acl functions. - */ - -static size_t -shmem_list_acl_default(struct inode *inode, char *list, size_t list_size, - const char *name, size_t name_len) -{ - return generic_acl_list(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, - list, list_size); -} - -static int -shmem_get_acl_default(struct inode *inode, const char *name, void *buffer, - size_t size) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_get(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, buffer, - size); -} - -static int -shmem_set_acl_default(struct inode *inode, const char *name, const void *value, - size_t size, int flags) -{ - if (strcmp(name, "") != 0) - return -EINVAL; - return generic_acl_set(inode, &shmem_acl_ops, ACL_TYPE_DEFAULT, value, - size); -} - -struct xattr_handler shmem_xattr_acl_default_handler = { - .prefix = POSIX_ACL_XATTR_DEFAULT, - .list = shmem_list_acl_default, - .get = shmem_get_acl_default, - .set = shmem_set_acl_default, -}; - -/** - * shmem_acl_init - Inizialize the acl(s) of a new inode - */ -int -shmem_acl_init(struct inode *inode, struct inode *dir) -{ - return generic_acl_init(inode, dir, &shmem_acl_ops); -} - -/** - * shmem_acl_destroy_inode - destroy acls hanging off the in-memory inode - * - * This is done before destroying the actual inode. - */ - -void -shmem_acl_destroy_inode(struct inode *inode) -{ - if (SHMEM_I(inode)->i_acl) - posix_acl_release(SHMEM_I(inode)->i_acl); - SHMEM_I(inode)->i_acl = NULL; - if (SHMEM_I(inode)->i_default_acl) - posix_acl_release(SHMEM_I(inode)->i_default_acl); - SHMEM_I(inode)->i_default_acl = NULL; -} - -/** - * shmem_check_acl - check_acl() callback for generic_permission() - */ -static int -shmem_check_acl(struct inode *inode, int mask) -{ - struct posix_acl *acl = shmem_get_acl(inode, ACL_TYPE_ACCESS); - - if (acl) { - int error = posix_acl_permission(inode, acl, mask); - posix_acl_release(acl); - return error; - } - return -EAGAIN; -} - -/** - * shmem_permission - permission() inode operation - */ -int -shmem_permission(struct inode *inode, int mask) -{ - return generic_permission(inode, mask, shmem_check_acl); -} @@ -102,7 +102,6 @@ #include <linux/cpu.h> #include <linux/sysctl.h> #include <linux/module.h> -#include <linux/kmemtrace.h> #include <linux/rcupdate.h> #include <linux/string.h> #include <linux/uaccess.h> @@ -115,6 +114,7 @@ #include <linux/reciprocal_div.h> #include <linux/debugobjects.h> #include <linux/kmemcheck.h> +#include <linux/memory.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> @@ -144,30 +144,6 @@ #define BYTES_PER_WORD sizeof(void *) #define REDZONE_ALIGN max(BYTES_PER_WORD, __alignof__(unsigned long long)) -#ifndef ARCH_KMALLOC_MINALIGN -/* - * Enforce a minimum alignment for the kmalloc caches. - * Usually, the kmalloc caches are cache_line_size() aligned, except when - * DEBUG and FORCED_DEBUG are enabled, then they are BYTES_PER_WORD aligned. - * Some archs want to perform DMA into kmalloc caches and need a guaranteed - * alignment larger than the alignment of a 64-bit integer. - * ARCH_KMALLOC_MINALIGN allows that. - * Note that increasing this value may disable some debug features. - */ -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -/* - * Enforce a minimum alignment for all caches. - * Intended for archs that get misalignment faults even for BYTES_PER_WORD - * aligned buffers. Includes ARCH_KMALLOC_MINALIGN. - * If possible: Do not enable this flag for CONFIG_DEBUG_SLAB, it disables - * some debug features. - */ -#define ARCH_SLAB_MINALIGN 0 -#endif - #ifndef ARCH_KMALLOC_FLAGS #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif @@ -418,7 +394,7 @@ static void kmem_list3_init(struct kmem_list3 *parent) #define STATS_DEC_ACTIVE(x) do { } while (0) #define STATS_INC_ALLOCED(x) do { } while (0) #define STATS_INC_GROWN(x) do { } while (0) -#define STATS_ADD_REAPED(x,y) do { } while (0) +#define STATS_ADD_REAPED(x,y) do { (void)(y); } while (0) #define STATS_SET_HIGH(x) do { } while (0) #define STATS_INC_ERR(x) do { } while (0) #define STATS_INC_NODEALLOCS(x) do { } while (0) @@ -490,7 +466,7 @@ static void **dbg_userword(struct kmem_cache *cachep, void *objp) #endif -#ifdef CONFIG_KMEMTRACE +#ifdef CONFIG_TRACING size_t slab_buffer_size(struct kmem_cache *cachep) { return cachep->buffer_size; @@ -604,6 +580,26 @@ static struct kmem_cache cache_cache = { #define BAD_ALIEN_MAGIC 0x01020304ul +/* + * chicken and egg problem: delay the per-cpu array allocation + * until the general caches are up. + */ +static enum { + NONE, + PARTIAL_AC, + PARTIAL_L3, + EARLY, + FULL +} g_cpucache_up; + +/* + * used by boot code to determine if it can use slab based allocator + */ +int slab_is_available(void) +{ + return g_cpucache_up >= EARLY; +} + #ifdef CONFIG_LOCKDEP /* @@ -620,40 +616,52 @@ static struct kmem_cache cache_cache = { static struct lock_class_key on_slab_l3_key; static struct lock_class_key on_slab_alc_key; -static inline void init_lock_keys(void) - +static void init_node_lock_keys(int q) { - int q; struct cache_sizes *s = malloc_sizes; - while (s->cs_size != ULONG_MAX) { - for_each_node(q) { - struct array_cache **alc; - int r; - struct kmem_list3 *l3 = s->cs_cachep->nodelists[q]; - if (!l3 || OFF_SLAB(s->cs_cachep)) - continue; - lockdep_set_class(&l3->list_lock, &on_slab_l3_key); - alc = l3->alien; - /* - * FIXME: This check for BAD_ALIEN_MAGIC - * should go away when common slab code is taught to - * work even without alien caches. - * Currently, non NUMA code returns BAD_ALIEN_MAGIC - * for alloc_alien_cache, - */ - if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) - continue; - for_each_node(r) { - if (alc[r]) - lockdep_set_class(&alc[r]->lock, - &on_slab_alc_key); - } + if (g_cpucache_up != FULL) + return; + + for (s = malloc_sizes; s->cs_size != ULONG_MAX; s++) { + struct array_cache **alc; + struct kmem_list3 *l3; + int r; + + l3 = s->cs_cachep->nodelists[q]; + if (!l3 || OFF_SLAB(s->cs_cachep)) + continue; + lockdep_set_class(&l3->list_lock, &on_slab_l3_key); + alc = l3->alien; + /* + * FIXME: This check for BAD_ALIEN_MAGIC + * should go away when common slab code is taught to + * work even without alien caches. + * Currently, non NUMA code returns BAD_ALIEN_MAGIC + * for alloc_alien_cache, + */ + if (!alc || (unsigned long)alc == BAD_ALIEN_MAGIC) + continue; + for_each_node(r) { + if (alc[r]) + lockdep_set_class(&alc[r]->lock, + &on_slab_alc_key); } - s++; } } + +static inline void init_lock_keys(void) +{ + int node; + + for_each_node(node) + init_node_lock_keys(node); +} #else +static void init_node_lock_keys(int q) +{ +} + static inline void init_lock_keys(void) { } @@ -665,27 +673,7 @@ static inline void init_lock_keys(void) static DEFINE_MUTEX(cache_chain_mutex); static struct list_head cache_chain; -/* - * chicken and egg problem: delay the per-cpu array allocation - * until the general caches are up. - */ -static enum { - NONE, - PARTIAL_AC, - PARTIAL_L3, - EARLY, - FULL -} g_cpucache_up; - -/* - * used by boot code to determine if it can use slab based allocator - */ -int slab_is_available(void) -{ - return g_cpucache_up >= EARLY; -} - -static DEFINE_PER_CPU(struct delayed_work, reap_work); +static DEFINE_PER_CPU(struct delayed_work, slab_reap_work); static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) { @@ -826,27 +814,27 @@ __setup("noaliencache", noaliencache_setup); * objects freed on different nodes from which they were allocated) and the * flushing of remote pcps by calling drain_node_pages. */ -static DEFINE_PER_CPU(unsigned long, reap_node); +static DEFINE_PER_CPU(unsigned long, slab_reap_node); static void init_reap_node(int cpu) { int node; - node = next_node(cpu_to_node(cpu), node_online_map); + node = next_node(cpu_to_mem(cpu), node_online_map); if (node == MAX_NUMNODES) node = first_node(node_online_map); - per_cpu(reap_node, cpu) = node; + per_cpu(slab_reap_node, cpu) = node; } static void next_reap_node(void) { - int node = __get_cpu_var(reap_node); + int node = __this_cpu_read(slab_reap_node); node = next_node(node, node_online_map); if (unlikely(node >= MAX_NUMNODES)) node = first_node(node_online_map); - __get_cpu_var(reap_node) = node; + __this_cpu_write(slab_reap_node, node); } #else @@ -863,7 +851,7 @@ static void next_reap_node(void) */ static void __cpuinit start_cpu_timer(int cpu) { - struct delayed_work *reap_work = &per_cpu(reap_work, cpu); + struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); /* * When this gets called from do_initcalls via cpucache_init(), @@ -872,7 +860,7 @@ static void __cpuinit start_cpu_timer(int cpu) */ if (keventd_up() && reap_work->work.func == NULL) { init_reap_node(cpu); - INIT_DELAYED_WORK(reap_work, cache_reap); + INIT_DELAYED_WORK_DEFERRABLE(reap_work, cache_reap); schedule_delayed_work_on(cpu, reap_work, __round_jiffies_relative(HZ, cpu)); } @@ -913,7 +901,7 @@ static int transfer_objects(struct array_cache *to, struct array_cache *from, unsigned int max) { /* Figure out how many entries to transfer */ - int nr = min(min(from->avail, max), to->limit - to->avail); + int nr = min3(from->avail, max, to->limit - to->avail); if (!nr) return 0; @@ -923,7 +911,6 @@ static int transfer_objects(struct array_cache *to, from->avail -= nr; to->avail += nr; - to->touched = 1; return nr; } @@ -971,13 +958,11 @@ static struct array_cache **alloc_alien_cache(int node, int limit, gfp_t gfp) if (limit > 1) limit = 12; - ac_ptr = kmalloc_node(memsize, gfp, node); + ac_ptr = kzalloc_node(memsize, gfp, node); if (ac_ptr) { for_each_node(i) { - if (i == node || !node_online(i)) { - ac_ptr[i] = NULL; + if (i == node || !node_online(i)) continue; - } ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d, gfp); if (!ac_ptr[i]) { for (i--; i >= 0; i--) @@ -1027,7 +1012,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, */ static void reap_alien(struct kmem_cache *cachep, struct kmem_list3 *l3) { - int node = __get_cpu_var(reap_node); + int node = __this_cpu_read(slab_reap_node); if (l3->alien) { struct array_cache *ac = l3->alien[node]; @@ -1064,7 +1049,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) struct array_cache *alien = NULL; int node; - node = numa_node_id(); + node = numa_mem_id(); /* * Make sure we are not freeing a object from another node to the array @@ -1093,11 +1078,57 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) } #endif +/* + * Allocates and initializes nodelists for a node on each slab cache, used for + * either memory or cpu hotplug. If memory is being hot-added, the kmem_list3 + * will be allocated off-node since memory is not yet online for the new node. + * When hotplugging memory or a cpu, existing nodelists are not replaced if + * already in use. + * + * Must hold cache_chain_mutex. + */ +static int init_cache_nodelists_node(int node) +{ + struct kmem_cache *cachep; + struct kmem_list3 *l3; + const int memsize = sizeof(struct kmem_list3); + + list_for_each_entry(cachep, &cache_chain, next) { + /* + * Set up the size64 kmemlist for cpu before we can + * begin anything. Make sure some other cpu on this + * node has not already allocated this + */ + if (!cachep->nodelists[node]) { + l3 = kmalloc_node(memsize, GFP_KERNEL, node); + if (!l3) + return -ENOMEM; + kmem_list3_init(l3); + l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; + + /* + * The l3s don't come and go as CPUs come and + * go. cache_chain_mutex is sufficient + * protection here. + */ + cachep->nodelists[node] = l3; + } + + spin_lock_irq(&cachep->nodelists[node]->list_lock); + cachep->nodelists[node]->free_limit = + (1 + nr_cpus_node(node)) * + cachep->batchcount + cachep->num; + spin_unlock_irq(&cachep->nodelists[node]->list_lock); + } + return 0; +} + static void __cpuinit cpuup_canceled(long cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); const struct cpumask *mask = cpumask_of_node(node); list_for_each_entry(cachep, &cache_chain, next) { @@ -1120,7 +1151,7 @@ static void __cpuinit cpuup_canceled(long cpu) if (nc) free_block(cachep, nc->entry, nc->avail, node); - if (!cpus_empty(*mask)) { + if (!cpumask_empty(mask)) { spin_unlock_irq(&l3->list_lock); goto free_array_cache; } @@ -1162,8 +1193,8 @@ static int __cpuinit cpuup_prepare(long cpu) { struct kmem_cache *cachep; struct kmem_list3 *l3 = NULL; - int node = cpu_to_node(cpu); - const int memsize = sizeof(struct kmem_list3); + int node = cpu_to_mem(cpu); + int err; /* * We need to do this right in the beginning since @@ -1171,35 +1202,9 @@ static int __cpuinit cpuup_prepare(long cpu) * kmalloc_node allows us to add the slab to the right * kmem_list3 and not this cpu's kmem_list3 */ - - list_for_each_entry(cachep, &cache_chain, next) { - /* - * Set up the size64 kmemlist for cpu before we can - * begin anything. Make sure some other cpu on this - * node has not already allocated this - */ - if (!cachep->nodelists[node]) { - l3 = kmalloc_node(memsize, GFP_KERNEL, node); - if (!l3) - goto bad; - kmem_list3_init(l3); - l3->next_reap = jiffies + REAPTIMEOUT_LIST3 + - ((unsigned long)cachep) % REAPTIMEOUT_LIST3; - - /* - * The l3s don't come and go as CPUs come and - * go. cache_chain_mutex is sufficient - * protection here. - */ - cachep->nodelists[node] = l3; - } - - spin_lock_irq(&cachep->nodelists[node]->list_lock); - cachep->nodelists[node]->free_limit = - (1 + nr_cpus_node(node)) * - cachep->batchcount + cachep->num; - spin_unlock_irq(&cachep->nodelists[node]->list_lock); - } + err = init_cache_nodelists_node(node); + if (err < 0) + goto bad; /* * Now we can go ahead with allocating the shared arrays and @@ -1254,6 +1259,8 @@ static int __cpuinit cpuup_prepare(long cpu) kfree(shared); free_alien_cache(alien); } + init_node_lock_keys(node); + return 0; bad: cpuup_canceled(cpu); @@ -1286,9 +1293,9 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, * anything expensive but will only modify reap_work * and reschedule the timer. */ - cancel_rearming_delayed_work(&per_cpu(reap_work, cpu)); + cancel_delayed_work_sync(&per_cpu(slab_reap_work, cpu)); /* Now the cache_reaper is guaranteed to be not running. */ - per_cpu(reap_work, cpu).work.func = NULL; + per_cpu(slab_reap_work, cpu).work.func = NULL; break; case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: @@ -1313,18 +1320,82 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, mutex_unlock(&cache_chain_mutex); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __cpuinitdata cpucache_notifier = { &cpuup_callback, NULL, 0 }; +#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +/* + * Drains freelist for a node on each slab cache, used for memory hot-remove. + * Returns -EBUSY if all objects cannot be drained so that the node is not + * removed. + * + * Must hold cache_chain_mutex. + */ +static int __meminit drain_cache_nodelists_node(int node) +{ + struct kmem_cache *cachep; + int ret = 0; + + list_for_each_entry(cachep, &cache_chain, next) { + struct kmem_list3 *l3; + + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + drain_freelist(cachep, l3, l3->free_objects); + + if (!list_empty(&l3->slabs_full) || + !list_empty(&l3->slabs_partial)) { + ret = -EBUSY; + break; + } + } + return ret; +} + +static int __meminit slab_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int ret = 0; + int nid; + + nid = mnb->status_change_nid; + if (nid < 0) + goto out; + + switch (action) { + case MEM_GOING_ONLINE: + mutex_lock(&cache_chain_mutex); + ret = init_cache_nodelists_node(nid); + mutex_unlock(&cache_chain_mutex); + break; + case MEM_GOING_OFFLINE: + mutex_lock(&cache_chain_mutex); + ret = drain_cache_nodelists_node(nid); + mutex_unlock(&cache_chain_mutex); + break; + case MEM_ONLINE: + case MEM_OFFLINE: + case MEM_CANCEL_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } +out: + return ret ? notifier_from_errno(ret) : NOTIFY_OK; +} +#endif /* CONFIG_NUMA && CONFIG_MEMORY_HOTPLUG */ + /* * swap the static kmem_list3 with kmalloced memory */ -static void init_list(struct kmem_cache *cachep, struct kmem_list3 *list, - int nodeid) +static void __init init_list(struct kmem_cache *cachep, struct kmem_list3 *list, + int nodeid) { struct kmem_list3 *ptr; @@ -1384,7 +1455,7 @@ void __init kmem_cache_init(void) * Fragmentation resistance on low memory - only use bigger * page orders on machines with more than 32MB of memory. */ - if (num_physpages > (32 << 20) >> PAGE_SHIFT) + if (totalram_pages > (32 << 20) >> PAGE_SHIFT) slab_break_gfp_order = BREAK_GFP_ORDER_HI; /* Bootstrap is tricky, because several objects are allocated @@ -1407,7 +1478,7 @@ void __init kmem_cache_init(void) * 6) Resize the head arrays of the kmalloc caches to their final sizes. */ - node = numa_node_id(); + node = numa_mem_id(); /* 1) create the cache_cache */ INIT_LIST_HEAD(&cache_chain); @@ -1544,9 +1615,6 @@ void __init kmem_cache_init(void) } g_cpucache_up = EARLY; - - /* Annotate slab for lockdep -- annotate the malloc caches */ - init_lock_keys(); } void __init kmem_cache_init_late(void) @@ -1563,12 +1631,23 @@ void __init kmem_cache_init_late(void) /* Done! */ g_cpucache_up = FULL; + /* Annotate slab for lockdep -- annotate the malloc caches */ + init_lock_keys(); + /* * Register a cpu startup notifier callback that initializes * cpu_cache_get for all new cpus */ register_cpu_notifier(&cpucache_notifier); +#ifdef CONFIG_NUMA + /* + * Register a memory hotplug callback that initializes and frees + * nodelists. + */ + hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); +#endif + /* * The reap timers are started later, with a module init call: That part * of the kernel is not yet operational. @@ -2041,7 +2120,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) } } } - cachep->nodelists[numa_node_id()]->next_reap = + cachep->nodelists[numa_mem_id()]->next_reap = jiffies + REAPTIMEOUT_LIST3 + ((unsigned long)cachep) % REAPTIMEOUT_LIST3; @@ -2209,8 +2288,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, if (ralign < align) { ralign = align; } - /* disable debug if necessary */ - if (ralign > __alignof__(unsigned long long)) + /* disable debug if not aligning with REDZONE_ALIGN */ + if (ralign & (__alignof__(unsigned long long) - 1)) flags &= ~(SLAB_RED_ZONE | SLAB_STORE_USER); /* * 4) Store it. @@ -2236,8 +2315,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, */ if (flags & SLAB_RED_ZONE) { /* add space for red zone words */ - cachep->obj_offset += sizeof(unsigned long long); - size += 2 * sizeof(unsigned long long); + cachep->obj_offset += align; + size += align + sizeof(unsigned long long); } if (flags & SLAB_STORE_USER) { /* user store requires one word storage behind the end of @@ -2251,8 +2330,8 @@ kmem_cache_create (const char *name, size_t size, size_t align, } #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC) if (size >= malloc_sizes[INDEX_L3 + 1].cs_size - && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) { - cachep->obj_offset += PAGE_SIZE - size; + && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) { + cachep->obj_offset += PAGE_SIZE - ALIGN(size, align); size = PAGE_SIZE; } #endif @@ -2261,9 +2340,11 @@ kmem_cache_create (const char *name, size_t size, size_t align, /* * Determine if the slab management is 'on' or 'off' slab. * (bootstrapping cannot cope with offslab caches so don't do - * it too early on.) + * it too early on. Always use on-slab management when + * SLAB_NOLEAKTRACE to avoid recursive calls into kmemleak) */ - if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init) + if ((size >= (PAGE_SIZE >> 3)) && !slab_early_init && + !(flags & SLAB_NOLEAKTRACE)) /* * Size is large, assume best to place the slab management obj * off-slab (should allow better packing of objs). @@ -2370,7 +2451,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock); + assert_spin_locked(&cachep->nodelists[numa_mem_id()]->list_lock); #endif } @@ -2397,7 +2478,7 @@ static void do_drain(void *arg) { struct kmem_cache *cachep = arg; struct array_cache *ac; - int node = numa_node_id(); + int node = numa_mem_id(); check_irq_off(); ac = cpu_cache_get(cachep); @@ -2547,7 +2628,7 @@ void kmem_cache_destroy(struct kmem_cache *cachep) } if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) - synchronize_rcu(); + rcu_barrier(); __kmem_cache_destroy(cachep); mutex_unlock(&cache_chain_mutex); @@ -2582,8 +2663,8 @@ static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, * kmemleak does not treat the ->s_mem pointer as a reference * to the object. Otherwise we will not report the leak. */ - kmemleak_scan_area(slabp, offsetof(struct slab, list), - sizeof(struct list_head), local_flags); + kmemleak_scan_area(&slabp->list, sizeof(struct list_head), + local_flags); if (!slabp) return NULL; } else { @@ -2700,7 +2781,7 @@ static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, /* * Map pages beginning at addr to the given cache and slab. This is required * for the slab allocator to be able to lookup the cache and slab of a - * virtual address for kfree, ksize, kmem_ptr_validate, and slab debugging. + * virtual address for kfree, ksize, and slab debugging. */ static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, void *addr) @@ -2930,7 +3011,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) retry: check_irq_off(); - node = numa_node_id(); + node = numa_mem_id(); ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -2947,8 +3028,10 @@ retry: spin_lock(&l3->list_lock); /* See if we can refill from the shared array */ - if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) + if (l3->shared && transfer_objects(ac, l3->shared, batchcount)) { + l3->shared->touched = 1; goto alloc_done; + } while (batchcount > 0) { struct list_head *entry; @@ -3085,7 +3168,7 @@ static bool slab_should_failslab(struct kmem_cache *cachep, gfp_t flags) if (cachep == &cache_cache) return false; - return should_failslab(obj_size(cachep), flags); + return should_failslab(obj_size(cachep), flags, cachep->flags); } static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) @@ -3103,13 +3186,19 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) } else { STATS_INC_ALLOCMISS(cachep); objp = cache_alloc_refill(cachep, flags); + /* + * the 'ac' may be updated by cache_alloc_refill(), + * and kmemleak_erase() requires its correct value. + */ + ac = cpu_cache_get(cachep); } /* * To avoid a false negative, if an object that is in one of the * per-CPU caches is leaked, we need to make sure kmemleak doesn't * treat the array pointers as a reference to the object. */ - kmemleak_erase(&ac->entry[ac->avail]); + if (objp) + kmemleak_erase(&ac->entry[ac->avail]); return objp; } @@ -3126,11 +3215,13 @@ static void *alternate_node_alloc(struct kmem_cache *cachep, gfp_t flags) if (in_interrupt() || (flags & __GFP_THISNODE)) return NULL; - nid_alloc = nid_here = numa_node_id(); + nid_alloc = nid_here = numa_mem_id(); + get_mems_allowed(); if (cpuset_do_slab_mem_spread() && (cachep->flags & SLAB_MEM_SPREAD)) - nid_alloc = cpuset_mem_spread_node(); + nid_alloc = cpuset_slab_spread_node(); else if (current->mempolicy) nid_alloc = slab_node(current->mempolicy); + put_mems_allowed(); if (nid_alloc != nid_here) return ____cache_alloc_node(cachep, flags, nid_alloc); return NULL; @@ -3157,6 +3248,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags) if (flags & __GFP_THISNODE) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); @@ -3188,7 +3280,7 @@ retry: if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - obj = kmem_getpages(cache, local_flags, numa_node_id()); + obj = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); if (obj) { @@ -3212,6 +3304,7 @@ retry: } } } + put_mems_allowed(); return obj; } @@ -3295,6 +3388,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, { unsigned long save_flags; void *ptr; + int slab_node = numa_mem_id(); flags &= gfp_allowed_mask; @@ -3306,8 +3400,8 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, cache_alloc_debugcheck_before(cachep, flags); local_irq_save(save_flags); - if (unlikely(nodeid == -1)) - nodeid = numa_node_id(); + if (nodeid == -1) + nodeid = slab_node; if (unlikely(!cachep->nodelists[nodeid])) { /* Node not bootstrapped yet */ @@ -3315,7 +3409,7 @@ __cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, goto out; } - if (nodeid == numa_node_id()) { + if (nodeid == slab_node) { /* * Use the locally cached objects if possible. * However ____cache_alloc does not allow fallback @@ -3359,8 +3453,8 @@ __do_cache_alloc(struct kmem_cache *cache, gfp_t flags) * We may just have run out of memory on the local node. * ____cache_alloc_node() knows how to locate memory on other nodes */ - if (!objp) - objp = ____cache_alloc_node(cache, flags, numa_node_id()); + if (!objp) + objp = ____cache_alloc_node(cache, flags, numa_mem_id()); out: return objp; @@ -3457,7 +3551,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) { int batchcount; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = numa_mem_id(); batchcount = ac->batchcount; #if DEBUG @@ -3558,54 +3652,20 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags) } EXPORT_SYMBOL(kmem_cache_alloc); -#ifdef CONFIG_KMEMTRACE -void *kmem_cache_alloc_notrace(struct kmem_cache *cachep, gfp_t flags) +#ifdef CONFIG_TRACING +void * +kmem_cache_alloc_trace(size_t size, struct kmem_cache *cachep, gfp_t flags) { - return __cache_alloc(cachep, flags, __builtin_return_address(0)); -} -EXPORT_SYMBOL(kmem_cache_alloc_notrace); -#endif + void *ret; -/** - * kmem_ptr_validate - check if an untrusted pointer might be a slab entry. - * @cachep: the cache we're checking against - * @ptr: pointer to validate - * - * This verifies that the untrusted pointer looks sane; - * it is _not_ a guarantee that the pointer is actually - * part of the slab cache in question, but it at least - * validates that the pointer can be dereferenced and - * looks half-way sane. - * - * Currently only used for dentry validation. - */ -int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr) -{ - unsigned long addr = (unsigned long)ptr; - unsigned long min_addr = PAGE_OFFSET; - unsigned long align_mask = BYTES_PER_WORD - 1; - unsigned long size = cachep->buffer_size; - struct page *page; + ret = __cache_alloc(cachep, flags, __builtin_return_address(0)); - if (unlikely(addr < min_addr)) - goto out; - if (unlikely(addr > (unsigned long)high_memory - size)) - goto out; - if (unlikely(addr & align_mask)) - goto out; - if (unlikely(!kern_addr_valid(addr))) - goto out; - if (unlikely(!kern_addr_valid(addr + size - 1))) - goto out; - page = virt_to_page(ptr); - if (unlikely(!PageSlab(page))) - goto out; - if (unlikely(page_get_cache(page) != cachep)) - goto out; - return 1; -out: - return 0; + trace_kmalloc(_RET_IP_, ret, + size, slab_buffer_size(cachep), flags); + return ret; } +EXPORT_SYMBOL(kmem_cache_alloc_trace); +#endif #ifdef CONFIG_NUMA void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) @@ -3621,35 +3681,36 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) } EXPORT_SYMBOL(kmem_cache_alloc_node); -#ifdef CONFIG_KMEMTRACE -void *kmem_cache_alloc_node_notrace(struct kmem_cache *cachep, - gfp_t flags, - int nodeid) +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_node_trace(size_t size, + struct kmem_cache *cachep, + gfp_t flags, + int nodeid) { - return __cache_alloc_node(cachep, flags, nodeid, + void *ret; + + ret = __cache_alloc_node(cachep, flags, nodeid, __builtin_return_address(0)); + trace_kmalloc_node(_RET_IP_, ret, + size, slab_buffer_size(cachep), + flags, nodeid); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); #endif static __always_inline void * __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller) { struct kmem_cache *cachep; - void *ret; cachep = kmem_find_general_cachep(size, flags); if (unlikely(ZERO_OR_NULL_PTR(cachep))) return cachep; - ret = kmem_cache_alloc_node_notrace(cachep, flags, node); - - trace_kmalloc_node((unsigned long) caller, ret, - size, cachep->buffer_size, flags, node); - - return ret; + return kmem_cache_alloc_node_trace(size, cachep, flags, node); } -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc_node(size_t size, gfp_t flags, int node) { return __do_kmalloc_node(size, flags, node, @@ -3669,7 +3730,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node) return __do_kmalloc_node(size, flags, node, NULL); } EXPORT_SYMBOL(__kmalloc_node); -#endif /* CONFIG_DEBUG_SLAB */ +#endif /* CONFIG_DEBUG_SLAB || CONFIG_TRACING */ #endif /* CONFIG_NUMA */ /** @@ -3701,7 +3762,7 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags, } -#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_KMEMTRACE) +#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_TRACING) void *__kmalloc(size_t size, gfp_t flags) { return __do_kmalloc(size, flags, __builtin_return_address(0)); @@ -3902,7 +3963,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, return -ENOMEM; for_each_online_cpu(i) { - new->new[i] = alloc_arraycache(cpu_to_node(i), limit, + new->new[i] = alloc_arraycache(cpu_to_mem(i), limit, batchcount, gfp); if (!new->new[i]) { for (i--; i >= 0; i--) @@ -3924,9 +3985,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, struct array_cache *ccold = new->new[i]; if (!ccold) continue; - spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); - free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i)); - spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock); + spin_lock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); + free_block(cachep, ccold->entry, ccold->avail, cpu_to_mem(i)); + spin_unlock_irq(&cachep->nodelists[cpu_to_mem(i)]->list_lock); kfree(ccold); } kfree(new); @@ -4032,7 +4093,7 @@ static void cache_reap(struct work_struct *w) { struct kmem_cache *searchp; struct kmem_list3 *l3; - int node = numa_node_id(); + int node = numa_mem_id(); struct delayed_work *work = to_delayed_work(w); if (!mutex_trylock(&cache_chain_mutex)) @@ -4206,10 +4267,11 @@ static int s_show(struct seq_file *m, void *p) unsigned long node_frees = cachep->node_frees; unsigned long overflows = cachep->node_overflow; - seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \ - %4lu %4lu %4lu %4lu %4lu", allocs, high, grown, - reaped, errors, max_freeable, node_allocs, - node_frees, overflows); + seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu " + "%4lu %4lu %4lu %4lu %4lu", + allocs, high, grown, + reaped, errors, max_freeable, node_allocs, + node_frees, overflows); } /* cpu stats */ { @@ -66,8 +66,10 @@ #include <linux/module.h> #include <linux/rcupdate.h> #include <linux/list.h> -#include <linux/kmemtrace.h> #include <linux/kmemleak.h> + +#include <trace/events/kmem.h> + #include <asm/atomic.h> /* @@ -394,6 +396,7 @@ static void slob_free(void *block, int size) slob_t *prev, *next, *b = (slob_t *)block; slobidx_t units; unsigned long flags; + struct list_head *slob_list; if (unlikely(ZERO_OR_NULL_PTR(block))) return; @@ -422,7 +425,13 @@ static void slob_free(void *block, int size) set_slob(b, units, (void *)((unsigned long)(b + SLOB_UNITS(PAGE_SIZE)) & PAGE_MASK)); - set_slob_page_free(sp, &free_slob_small); + if (size < SLOB_BREAK1) + slob_list = &free_slob_small; + else if (size < SLOB_BREAK2) + slob_list = &free_slob_medium; + else + slob_list = &free_slob_large; + set_slob_page_free(sp, slob_list); goto out; } @@ -467,14 +476,6 @@ out: * End of slob allocator proper. Begin kmem_cache_alloc and kmalloc frontend. */ -#ifndef ARCH_KMALLOC_MINALIGN -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long) -#endif - void *__kmalloc_node(size_t size, gfp_t gfp, int node) { unsigned int *m; @@ -499,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node) } else { unsigned int order = get_order(size); - ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node); + if (likely(order)) + gfp |= __GFP_COMP; + ret = slob_new_pages(gfp, order, node); if (ret) { struct page *page; page = virt_to_page(ret); @@ -595,6 +598,8 @@ EXPORT_SYMBOL(kmem_cache_create); void kmem_cache_destroy(struct kmem_cache *c) { kmemleak_free(c); + if (c->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); slob_free(c, sizeof(struct kmem_cache)); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -645,7 +650,6 @@ void kmem_cache_free(struct kmem_cache *c, void *b) if (unlikely(c->flags & SLAB_DESTROY_BY_RCU)) { struct slob_rcu *slob_rcu; slob_rcu = b + (c->size - sizeof(struct slob_rcu)); - INIT_RCU_HEAD(&slob_rcu->head); slob_rcu->size = c->size; call_rcu(&slob_rcu->head, kmem_rcu_free); } else { @@ -674,11 +678,6 @@ int kmem_cache_shrink(struct kmem_cache *d) } EXPORT_SYMBOL(kmem_cache_shrink); -int kmem_ptr_validate(struct kmem_cache *a, const void *b) -{ - return 0; -} - static unsigned int slob_ready __read_mostly; int slab_is_available(void) @@ -690,3 +689,8 @@ void __init kmem_cache_init(void) { slob_ready = 1; } + +void __init kmem_cache_init_late(void) +{ + /* Nothing to do */ +} @@ -17,11 +17,9 @@ #include <linux/slab.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <linux/kmemtrace.h> #include <linux/kmemcheck.h> #include <linux/cpu.h> #include <linux/cpuset.h> -#include <linux/kmemleak.h> #include <linux/mempolicy.h> #include <linux/ctype.h> #include <linux/debugobjects.h> @@ -30,6 +28,8 @@ #include <linux/math64.h> #include <linux/fault-inject.h> +#include <trace/events/kmem.h> + /* * Lock order: * 1. slab_lock(page) @@ -108,11 +108,17 @@ * the fast path and disables lockless freelists. */ +#define SLAB_DEBUG_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ + SLAB_TRACE | SLAB_DEBUG_FREE) + +static inline int kmem_cache_debug(struct kmem_cache *s) +{ #ifdef CONFIG_SLUB_DEBUG -#define SLABDEBUG 1 + return unlikely(s->flags & SLAB_DEBUG_FLAGS); #else -#define SLABDEBUG 0 + return 0; #endif +} /* * Issues still to be resolved: @@ -142,29 +148,28 @@ SLAB_POISON | SLAB_STORE_USER) /* + * Debugging flags that require metadata to be stored in the slab. These get + * disabled when slub_debug=O is used and a cache's min order increases with + * metadata. + */ +#define DEBUG_METADATA_FLAGS (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER) + +/* * Set of flags that will prevent slab merging */ #define SLUB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \ - SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE) + SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \ + SLAB_FAILSLAB) #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \ SLAB_CACHE_DMA | SLAB_NOTRACK) -#ifndef ARCH_KMALLOC_MINALIGN -#define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long) -#endif - -#ifndef ARCH_SLAB_MINALIGN -#define ARCH_SLAB_MINALIGN __alignof__(unsigned long long) -#endif - #define OO_SHIFT 16 #define OO_MASK ((1 << OO_SHIFT) - 1) #define MAX_OBJS_PER_PAGE 65535 /* since page.objects is u16 */ /* Internal SLUB flags */ -#define __OBJECT_POISON 0x80000000 /* Poison object */ -#define __SYSFS_ADD_DEFERRED 0x40000000 /* Not yet visible via sysfs */ +#define __OBJECT_POISON 0x80000000UL /* Poison object */ static int kmem_size = sizeof(struct kmem_cache); @@ -174,7 +179,7 @@ static struct notifier_block slab_notifier; static enum { DOWN, /* No slab functionality available */ - PARTIAL, /* kmem_cache_open() works but kmalloc does not */ + PARTIAL, /* Kmem_cache_node works */ UP, /* Everything works but does not show up in sysfs */ SYSFS /* Sysfs up */ } slab_state = DOWN; @@ -195,7 +200,7 @@ struct track { enum track_item { TRACK_ALLOC, TRACK_FREE }; -#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SYSFS static int sysfs_slab_add(struct kmem_cache *); static int sysfs_slab_alias(struct kmem_cache *, const char *); static void sysfs_slab_remove(struct kmem_cache *); @@ -206,15 +211,16 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p) { return 0; } static inline void sysfs_slab_remove(struct kmem_cache *s) { + kfree(s->name); kfree(s); } #endif -static inline void stat(struct kmem_cache_cpu *c, enum stat_item si) +static inline void stat(struct kmem_cache *s, enum stat_item si) { #ifdef CONFIG_SLUB_STATS - c->stat[si]++; + __this_cpu_inc(s->cpu_slab->stat[si]); #endif } @@ -229,20 +235,7 @@ int slab_is_available(void) static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node) { -#ifdef CONFIG_NUMA return s->node[node]; -#else - return &s->local_node; -#endif -} - -static inline struct kmem_cache_cpu *get_cpu_slab(struct kmem_cache *s, int cpu) -{ -#ifdef CONFIG_SMP - return s->cpu_slab[cpu]; -#else - return &s->cpu_slab; -#endif } /* Verify that a pointer has an address that is valid within a slab page */ @@ -263,13 +256,6 @@ static inline int check_valid_pointer(struct kmem_cache *s, return 1; } -/* - * Slow version of get and set free pointer. - * - * This version requires touching the cache lines of kmem_cache which - * we avoid to do in the fast alloc free paths. There we obtain the offset - * from the page struct. - */ static inline void *get_freepointer(struct kmem_cache *s, void *object) { return *(void **)(object + s->offset); @@ -326,6 +312,7 @@ static int slub_debug; #endif static char *slub_debug_slabs; +static int disable_higher_order_debug; /* * Object debugging @@ -505,7 +492,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...) dump_stack(); } -static void init_object(struct kmem_cache *s, void *object, int active) +static void init_object(struct kmem_cache *s, void *object, u8 val) { u8 *p = object; @@ -515,9 +502,7 @@ static void init_object(struct kmem_cache *s, void *object, int active) } if (s->flags & SLAB_RED_ZONE) - memset(p + s->objsize, - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE, - s->inuse - s->objsize); + memset(p + s->objsize, val, s->inuse - s->objsize); } static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes) @@ -647,22 +632,19 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); print_section("Padding", end - remainder, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, start, end); + restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); return 0; } static int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) + void *object, u8 val) { u8 *p = object; u8 *endobject = object + s->objsize; if (s->flags & SLAB_RED_ZONE) { - unsigned int red = - active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE; - if (!check_bytes_and_report(s, page, object, "Redzone", - endobject, red, s->inuse - s->objsize)) + endobject, val, s->inuse - s->objsize)) return 0; } else { if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) { @@ -672,7 +654,7 @@ static int check_object(struct kmem_cache *s, struct page *page, } if (s->flags & SLAB_POISON) { - if (!active && (s->flags & __OBJECT_POISON) && + if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, POISON_FREE, s->objsize - 1) || !check_bytes_and_report(s, page, p, "Poison", @@ -684,7 +666,7 @@ static int check_object(struct kmem_cache *s, struct page *page, check_pad_bytes(s, page, p); } - if (!s->offset && active) + if (!s->offset && val == SLUB_RED_ACTIVE) /* * Object and freepointer overlap. Cannot check * freepointer while object is allocated. @@ -803,6 +785,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, } /* + * Hooks for other subsystems that check memory allocations. In a typical + * production configuration these hooks all should produce no code at all. + */ +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) +{ + flags &= gfp_allowed_mask; + lockdep_trace_alloc(flags); + might_sleep_if(flags & __GFP_WAIT); + + return should_failslab(s->objsize, flags, s->flags); +} + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object) +{ + flags &= gfp_allowed_mask; + kmemcheck_slab_alloc(s, flags, object, s->objsize); + kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags); +} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) +{ + kmemleak_free_recursive(x, s->flags); +} + +static inline void slab_free_hook_irq(struct kmem_cache *s, void *object) +{ + kmemcheck_slab_free(s, object, s->objsize); + debug_check_no_locks_freed(object, s->objsize); + if (!(s->flags & SLAB_DEBUG_OBJECTS)) + debug_check_no_obj_freed(object, s->objsize); +} + +/* * Tracking of fully allocated slabs for debugging purposes. */ static void add_full(struct kmem_cache_node *n, struct page *page) @@ -849,7 +864,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) * dilemma by deferring the increment of the count during * bootstrap (see early_kmem_cache_node_alloc). */ - if (!NUMA_BUILD || n) { + if (n) { atomic_long_inc(&n->nr_slabs); atomic_long_add(objects, &n->total_objects); } @@ -869,11 +884,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page, if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))) return; - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); init_tracking(s, object); } -static int alloc_debug_processing(struct kmem_cache *s, struct page *page, +static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) @@ -889,14 +904,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page, goto bad; } - if (!check_object(s, page, object, 0)) + if (!check_object(s, page, object, SLUB_RED_INACTIVE)) goto bad; /* Success perform special debug activities for allocs */ if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_ALLOC, addr); trace(s, page, object, 1); - init_object(s, object, 1); + init_object(s, object, SLUB_RED_ACTIVE); return 1; bad: @@ -913,8 +928,8 @@ bad: return 0; } -static int free_debug_processing(struct kmem_cache *s, struct page *page, - void *object, unsigned long addr) +static noinline int free_debug_processing(struct kmem_cache *s, + struct page *page, void *object, unsigned long addr) { if (!check_slab(s, page)) goto fail; @@ -929,7 +944,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, goto fail; } - if (!check_object(s, page, object, 1)) + if (!check_object(s, page, object, SLUB_RED_ACTIVE)) return 0; if (unlikely(s != page->slab)) { @@ -953,7 +968,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page, if (s->flags & SLAB_STORE_USER) set_track(s, object, TRACK_FREE, addr); trace(s, page, object, 0); - init_object(s, object, 0); + init_object(s, object, SLUB_RED_INACTIVE); return 1; fail: @@ -977,6 +992,15 @@ static int __init setup_slub_debug(char *str) */ goto check_slabs; + if (tolower(*str) == 'o') { + /* + * Avoid enabling debugging on caches if its minimum order + * would increase as a result. + */ + disable_higher_order_debug = 1; + goto out; + } + slub_debug = 0; if (*str == '-') /* @@ -1004,6 +1028,9 @@ static int __init setup_slub_debug(char *str) case 't': slub_debug |= SLAB_TRACE; break; + case 'a': + slub_debug |= SLAB_FAILSLAB; + break; default: printk(KERN_ERR "slub_debug option '%c' " "unknown. skipped\n", *str); @@ -1027,8 +1054,8 @@ static unsigned long kmem_cache_flags(unsigned long objsize, * Enable debugging if selected on the kernel commandline. */ if (slub_debug && (!slub_debug_slabs || - strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)) == 0)) - flags |= slub_debug; + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) + flags |= slub_debug; return flags; } @@ -1045,7 +1072,7 @@ static inline int free_debug_processing(struct kmem_cache *s, static inline int slab_pad_check(struct kmem_cache *s, struct page *page) { return 1; } static inline int check_object(struct kmem_cache *s, struct page *page, - void *object, int active) { return 1; } + void *object, u8 val) { return 1; } static inline void add_full(struct kmem_cache_node *n, struct page *page) {} static inline unsigned long kmem_cache_flags(unsigned long objsize, unsigned long flags, const char *name, @@ -1055,6 +1082,8 @@ static inline unsigned long kmem_cache_flags(unsigned long objsize, } #define slub_debug 0 +#define disable_higher_order_debug 0 + static inline unsigned long slabs_node(struct kmem_cache *s, int node) { return 0; } static inline unsigned long node_nr_slabs(struct kmem_cache_node *n) @@ -1063,7 +1092,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects) {} static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -#endif + +static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) + { return 0; } + +static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, + void *object) {} + +static inline void slab_free_hook(struct kmem_cache *s, void *x) {} + +static inline void slab_free_hook_irq(struct kmem_cache *s, + void *object) {} + +#endif /* CONFIG_SLUB_DEBUG */ /* * Slab allocation and freeing @@ -1075,21 +1116,27 @@ static inline struct page *alloc_slab_page(gfp_t flags, int node, flags |= __GFP_NOTRACK; - if (node == -1) + if (node == NUMA_NO_NODE) return alloc_pages(flags, order); else - return alloc_pages_node(node, flags, order); + return alloc_pages_exact_node(node, flags, order); } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; struct kmem_cache_order_objects oo = s->oo; + gfp_t alloc_gfp; flags |= s->allocflags; - page = alloc_slab_page(flags | __GFP_NOWARN | __GFP_NORETRY, node, - oo); + /* + * Let the initial higher-order allocation fail under memory pressure + * so we fall-back to the minimum order allocation. + */ + alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; + + page = alloc_slab_page(alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; /* @@ -1100,12 +1147,11 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) if (!page) return NULL; - stat(get_cpu_slab(s, raw_smp_processor_id()), ORDER_FALLBACK); + stat(s, ORDER_FALLBACK); } if (kmemcheck_enabled - && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) - { + && !(s->flags & (SLAB_NOTRACK | DEBUG_DEFAULT_FLAGS))) { int pages = 1 << oo_order(oo); kmemcheck_alloc_shadow(page, oo_order(oo), flags, node); @@ -1154,9 +1200,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; page->flags |= 1 << PG_slab; - if (s->flags & (SLAB_DEBUG_FREE | SLAB_RED_ZONE | SLAB_POISON | - SLAB_STORE_USER | SLAB_TRACE)) - __SetPageSlubDebug(page); start = page_address(page); @@ -1183,14 +1226,13 @@ static void __free_slab(struct kmem_cache *s, struct page *page) int order = compound_order(page); int pages = 1 << order; - if (unlikely(SLABDEBUG && PageSlubDebug(page))) { + if (kmem_cache_debug(s)) { void *p; slab_pad_check(s, page); for_each_object(p, s, page_address(page), page->objects) - check_object(s, page, p, 0); - __ClearPageSlubDebug(page); + check_object(s, page, p, SLUB_RED_INACTIVE); } kmemcheck_free_shadow(page, compound_order(page)); @@ -1270,13 +1312,19 @@ static void add_partial(struct kmem_cache_node *n, spin_unlock(&n->list_lock); } +static inline void __remove_partial(struct kmem_cache_node *n, + struct page *page) +{ + list_del(&page->lru); + n->nr_partial--; +} + static void remove_partial(struct kmem_cache *s, struct page *page) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); spin_lock(&n->list_lock); - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); spin_unlock(&n->list_lock); } @@ -1289,8 +1337,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n, struct page *page) { if (slab_trylock(page)) { - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); __SetPageSlubFrozen(page); return 1; } @@ -1357,6 +1404,7 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) get_cycles() % 1024 > s->remote_node_defrag_ratio) return NULL; + get_mems_allowed(); zonelist = node_zonelist(slab_node(current->mempolicy), flags); for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { struct kmem_cache_node *n; @@ -1366,10 +1414,13 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) if (n && cpuset_zone_allowed_hardwall(zone, flags) && n->nr_partial > s->min_partial) { page = get_partial_node(n); - if (page) + if (page) { + put_mems_allowed(); return page; + } } } + put_mems_allowed(); #endif return NULL; } @@ -1380,10 +1431,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags) static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) { struct page *page; - int searchnode = (node == -1) ? numa_node_id() : node; + int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node; page = get_partial_node(get_node(s, searchnode)); - if (page || (flags & __GFP_THISNODE)) + if (page || node != -1) return page; return get_any_partial(s, flags); @@ -1397,25 +1448,24 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node) * On exit the slab lock will have been dropped. */ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) + __releases(bitlock) { struct kmem_cache_node *n = get_node(s, page_to_nid(page)); - struct kmem_cache_cpu *c = get_cpu_slab(s, smp_processor_id()); __ClearPageSlubFrozen(page); if (page->inuse) { if (page->freelist) { add_partial(n, page, tail); - stat(c, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); + stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD); } else { - stat(c, DEACTIVATE_FULL); - if (SLABDEBUG && PageSlubDebug(page) && - (s->flags & SLAB_STORE_USER)) + stat(s, DEACTIVATE_FULL); + if (kmem_cache_debug(s) && (s->flags & SLAB_STORE_USER)) add_full(n, page); } slab_unlock(page); } else { - stat(c, DEACTIVATE_EMPTY); + stat(s, DEACTIVATE_EMPTY); if (n->nr_partial < s->min_partial) { /* * Adding an empty slab to the partial slabs in order @@ -1431,7 +1481,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) slab_unlock(page); } else { slab_unlock(page); - stat(get_cpu_slab(s, raw_smp_processor_id()), FREE_SLAB); + stat(s, FREE_SLAB); discard_slab(s, page); } } @@ -1441,12 +1491,13 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail) * Remove the cpu slab */ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) + __releases(bitlock) { struct page *page = c->page; int tail = 1; if (page->freelist) - stat(c, DEACTIVATE_REMOTE_FREES); + stat(s, DEACTIVATE_REMOTE_FREES); /* * Merge cpu freelist into slab freelist. Typically we get here * because both freelists are empty. So this is unlikely @@ -1459,10 +1510,10 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) /* Retrieve object from cpu_freelist */ object = c->freelist; - c->freelist = c->freelist[c->offset]; + c->freelist = get_freepointer(s, c->freelist); /* And put onto the regular freelist */ - object[c->offset] = page->freelist; + set_freepointer(s, object, page->freelist); page->freelist = object; page->inuse--; } @@ -1472,7 +1523,7 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) { - stat(c, CPUSLAB_FLUSH); + stat(s, CPUSLAB_FLUSH); slab_lock(c->page); deactivate_slab(s, c); } @@ -1484,7 +1535,7 @@ static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) */ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (likely(c && c->page)) flush_slab(s, c); @@ -1509,7 +1560,7 @@ static void flush_all(struct kmem_cache *s) static inline int node_match(struct kmem_cache_cpu *c, int node) { #ifdef CONFIG_NUMA - if (node != -1 && c->node != node) + if (node != NUMA_NO_NODE && c->node != node) return 0; #endif return 1; @@ -1555,6 +1606,10 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) "default order: %d, min order: %d\n", s->name, s->objsize, s->size, oo_order(s->oo), oo_order(s->min)); + if (oo_order(s->min) > get_order(s->objsize)) + printk(KERN_WARNING " %s debugging increased min order, use " + "slub_debug=O to disable.\n", s->name); + for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); unsigned long nr_slabs; @@ -1608,22 +1663,22 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(!node_match(c, node))) goto another_slab; - stat(c, ALLOC_REFILL); + stat(s, ALLOC_REFILL); load_freelist: object = c->page->freelist; if (unlikely(!object)) goto another_slab; - if (unlikely(SLABDEBUG && PageSlubDebug(c->page))) + if (kmem_cache_debug(s)) goto debug; - c->freelist = object[c->offset]; + c->freelist = get_freepointer(s, object); c->page->inuse = c->page->objects; c->page->freelist = NULL; c->node = page_to_nid(c->page); unlock_out: slab_unlock(c->page); - stat(c, ALLOC_SLOWPATH); + stat(s, ALLOC_SLOWPATH); return object; another_slab: @@ -1633,10 +1688,11 @@ new_slab: new = get_partial(s, gfpflags, node); if (new) { c->page = new; - stat(c, ALLOC_FROM_PARTIAL); + stat(s, ALLOC_FROM_PARTIAL); goto load_freelist; } + gfpflags &= gfp_allowed_mask; if (gfpflags & __GFP_WAIT) local_irq_enable(); @@ -1646,8 +1702,8 @@ new_slab: local_irq_disable(); if (new) { - c = get_cpu_slab(s, smp_processor_id()); - stat(c, ALLOC_SLAB); + c = __this_cpu_ptr(s->cpu_slab); + stat(s, ALLOC_SLAB); if (c->page) flush_slab(s, c); slab_lock(new); @@ -1663,8 +1719,8 @@ debug: goto another_slab; c->page->inuse++; - c->page->freelist = object[c->offset]; - c->node = -1; + c->page->freelist = get_freepointer(s, object); + c->node = NUMA_NO_NODE; goto unlock_out; } @@ -1684,42 +1740,34 @@ static __always_inline void *slab_alloc(struct kmem_cache *s, void **object; struct kmem_cache_cpu *c; unsigned long flags; - unsigned int objsize; - - gfpflags &= gfp_allowed_mask; - - lockdep_trace_alloc(gfpflags); - might_sleep_if(gfpflags & __GFP_WAIT); - if (should_failslab(s->objsize, gfpflags)) + if (slab_pre_alloc_hook(s, gfpflags)) return NULL; local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); - objsize = c->objsize; - if (unlikely(!c->freelist || !node_match(c, node))) + c = __this_cpu_ptr(s->cpu_slab); + object = c->freelist; + if (unlikely(!object || !node_match(c, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { - object = c->freelist; - c->freelist = object[c->offset]; - stat(c, ALLOC_FASTPATH); + c->freelist = get_freepointer(s, object); + stat(s, ALLOC_FASTPATH); } local_irq_restore(flags); - if (unlikely((gfpflags & __GFP_ZERO) && object)) - memset(object, 0, objsize); + if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, s->objsize); - kmemcheck_slab_alloc(s, gfpflags, object, c->objsize); - kmemleak_alloc_recursive(object, objsize, 1, s->flags, gfpflags); + slab_post_alloc_hook(s, gfpflags, object); return object; } void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) { - void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags); @@ -1727,12 +1775,22 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags) } EXPORT_SYMBOL(kmem_cache_alloc); -#ifdef CONFIG_KMEMTRACE -void *kmem_cache_alloc_notrace(struct kmem_cache *s, gfp_t gfpflags) +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size) +{ + void *ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, _RET_IP_); + trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags); + return ret; +} +EXPORT_SYMBOL(kmem_cache_alloc_trace); + +void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { - return slab_alloc(s, gfpflags, -1, _RET_IP_); + void *ret = kmalloc_order(size, flags, order); + trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_notrace); +EXPORT_SYMBOL(kmalloc_order_trace); #endif #ifdef CONFIG_NUMA @@ -1746,16 +1804,20 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node) return ret; } EXPORT_SYMBOL(kmem_cache_alloc_node); -#endif -#ifdef CONFIG_KMEMTRACE -void *kmem_cache_alloc_node_notrace(struct kmem_cache *s, +#ifdef CONFIG_TRACING +void *kmem_cache_alloc_node_trace(struct kmem_cache *s, gfp_t gfpflags, - int node) + int node, size_t size) { - return slab_alloc(s, gfpflags, node, _RET_IP_); + void *ret = slab_alloc(s, gfpflags, node, _RET_IP_); + + trace_kmalloc_node(_RET_IP_, ret, + size, s->size, gfpflags, node); + return ret; } -EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); +EXPORT_SYMBOL(kmem_cache_alloc_node_trace); +#endif #endif /* @@ -1767,26 +1829,25 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_notrace); * handling required then we can return immediately. */ static void __slab_free(struct kmem_cache *s, struct page *page, - void *x, unsigned long addr, unsigned int offset) + void *x, unsigned long addr) { void *prior; void **object = (void *)x; - struct kmem_cache_cpu *c; - c = get_cpu_slab(s, raw_smp_processor_id()); - stat(c, FREE_SLOWPATH); + stat(s, FREE_SLOWPATH); slab_lock(page); - if (unlikely(SLABDEBUG && PageSlubDebug(page))) + if (kmem_cache_debug(s)) goto debug; checks_ok: - prior = object[offset] = page->freelist; + prior = page->freelist; + set_freepointer(s, object, prior); page->freelist = object; page->inuse--; if (unlikely(PageSlubFrozen(page))) { - stat(c, FREE_FROZEN); + stat(s, FREE_FROZEN); goto out_unlock; } @@ -1799,7 +1860,7 @@ checks_ok: */ if (unlikely(!prior)) { add_partial(get_node(s, page_to_nid(page)), page, 1); - stat(c, FREE_ADD_PARTIAL); + stat(s, FREE_ADD_PARTIAL); } out_unlock: @@ -1812,10 +1873,10 @@ slab_empty: * Slab still on the partial list. */ remove_partial(s, page); - stat(c, FREE_REMOVE_PARTIAL); + stat(s, FREE_REMOVE_PARTIAL); } slab_unlock(page); - stat(c, FREE_SLAB); + stat(s, FREE_SLAB); discard_slab(s, page); return; @@ -1843,19 +1904,19 @@ static __always_inline void slab_free(struct kmem_cache *s, struct kmem_cache_cpu *c; unsigned long flags; - kmemleak_free_recursive(x, s->flags); + slab_free_hook(s, x); + local_irq_save(flags); - c = get_cpu_slab(s, smp_processor_id()); - kmemcheck_slab_free(s, object, c->objsize); - debug_check_no_locks_freed(object, c->objsize); - if (!(s->flags & SLAB_DEBUG_OBJECTS)) - debug_check_no_obj_freed(object, c->objsize); - if (likely(page == c->page && c->node >= 0)) { - object[c->offset] = c->freelist; + c = __this_cpu_ptr(s->cpu_slab); + + slab_free_hook_irq(s, x); + + if (likely(page == c->page && c->node != NUMA_NO_NODE)) { + set_freepointer(s, object, c->freelist); c->freelist = object; - stat(c, FREE_FASTPATH); + stat(s, FREE_FASTPATH); } else - __slab_free(s, page, x, addr, c->offset); + __slab_free(s, page, x, addr); local_irq_restore(flags); } @@ -1872,17 +1933,6 @@ void kmem_cache_free(struct kmem_cache *s, void *x) } EXPORT_SYMBOL(kmem_cache_free); -/* Figure out on which slab page the object resides */ -static struct page *get_object_page(const void *x) -{ - struct page *page = virt_to_head_page(x); - - if (!PageSlab(page)) - return NULL; - - return page; -} - /* * Object placement in a slab is made very easy because we always start at * offset 0. If we tune the size of the object to the alignment then we can @@ -1996,7 +2046,7 @@ static inline int calculate_order(int size) return order; fraction /= 2; } - min_objects --; + min_objects--; } /* @@ -2042,19 +2092,6 @@ static unsigned long calculate_alignment(unsigned long flags, return ALIGN(align, sizeof(void *)); } -static void init_kmem_cache_cpu(struct kmem_cache *s, - struct kmem_cache_cpu *c) -{ - c->page = NULL; - c->freelist = NULL; - c->node = 0; - c->offset = s->offset / sizeof(void *); - c->objsize = s->objsize; -#ifdef CONFIG_SLUB_STATS - memset(c->stat, 0, NR_SLUB_STAT_ITEMS * sizeof(unsigned)); -#endif -} - static void init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) { @@ -2068,132 +2105,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s) #endif } -#ifdef CONFIG_SMP -/* - * Per cpu array for per cpu structures. - * - * The per cpu array places all kmem_cache_cpu structures from one processor - * close together meaning that it becomes possible that multiple per cpu - * structures are contained in one cacheline. This may be particularly - * beneficial for the kmalloc caches. - * - * A desktop system typically has around 60-80 slabs. With 100 here we are - * likely able to get per cpu structures for all caches from the array defined - * here. We must be able to cover all kmalloc caches during bootstrap. - * - * If the per cpu array is exhausted then fall back to kmalloc - * of individual cachelines. No sharing is possible then. - */ -#define NR_KMEM_CACHE_CPU 100 - -static DEFINE_PER_CPU(struct kmem_cache_cpu, - kmem_cache_cpu)[NR_KMEM_CACHE_CPU]; - -static DEFINE_PER_CPU(struct kmem_cache_cpu *, kmem_cache_cpu_free); -static DECLARE_BITMAP(kmem_cach_cpu_free_init_once, CONFIG_NR_CPUS); - -static struct kmem_cache_cpu *alloc_kmem_cache_cpu(struct kmem_cache *s, - int cpu, gfp_t flags) +static inline int alloc_kmem_cache_cpus(struct kmem_cache *s) { - struct kmem_cache_cpu *c = per_cpu(kmem_cache_cpu_free, cpu); + BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE < + SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu)); - if (c) - per_cpu(kmem_cache_cpu_free, cpu) = - (void *)c->freelist; - else { - /* Table overflow: So allocate ourselves */ - c = kmalloc_node( - ALIGN(sizeof(struct kmem_cache_cpu), cache_line_size()), - flags, cpu_to_node(cpu)); - if (!c) - return NULL; - } + s->cpu_slab = alloc_percpu(struct kmem_cache_cpu); - init_kmem_cache_cpu(s, c); - return c; -} - -static void free_kmem_cache_cpu(struct kmem_cache_cpu *c, int cpu) -{ - if (c < per_cpu(kmem_cache_cpu, cpu) || - c >= per_cpu(kmem_cache_cpu, cpu) + NR_KMEM_CACHE_CPU) { - kfree(c); - return; - } - c->freelist = (void *)per_cpu(kmem_cache_cpu_free, cpu); - per_cpu(kmem_cache_cpu_free, cpu) = c; + return s->cpu_slab != NULL; } -static void free_kmem_cache_cpus(struct kmem_cache *s) -{ - int cpu; +static struct kmem_cache *kmem_cache_node; - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c) { - s->cpu_slab[cpu] = NULL; - free_kmem_cache_cpu(c, cpu); - } - } -} - -static int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - int cpu; - - for_each_online_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - - if (c) - continue; - - c = alloc_kmem_cache_cpu(s, cpu, flags); - if (!c) { - free_kmem_cache_cpus(s); - return 0; - } - s->cpu_slab[cpu] = c; - } - return 1; -} - -/* - * Initialize the per cpu array. - */ -static void init_alloc_cpu_cpu(int cpu) -{ - int i; - - if (cpumask_test_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once))) - return; - - for (i = NR_KMEM_CACHE_CPU - 1; i >= 0; i--) - free_kmem_cache_cpu(&per_cpu(kmem_cache_cpu, cpu)[i], cpu); - - cpumask_set_cpu(cpu, to_cpumask(kmem_cach_cpu_free_init_once)); -} - -static void __init init_alloc_cpu(void) -{ - int cpu; - - for_each_online_cpu(cpu) - init_alloc_cpu_cpu(cpu); - } - -#else -static inline void free_kmem_cache_cpus(struct kmem_cache *s) {} -static inline void init_alloc_cpu(void) {} - -static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) -{ - init_kmem_cache_cpu(s, &s->cpu_slab); - return 1; -} -#endif - -#ifdef CONFIG_NUMA /* * No kmalloc_node yet so do it by hand. We know that this is the first * slab on the node for this slabcache. There are no concurrent accesses @@ -2203,15 +2126,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags) * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ -static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) +static void early_kmem_cache_node_alloc(int node) { struct page *page; struct kmem_cache_node *n; unsigned long flags; - BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node)); + BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node)); - page = new_slab(kmalloc_caches, gfpflags, node); + page = new_slab(kmem_cache_node, GFP_NOWAIT, node); BUG_ON(!page); if (page_to_nid(page) != node) { @@ -2223,15 +2146,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node) n = page->freelist; BUG_ON(!n); - page->freelist = get_freepointer(kmalloc_caches, n); + page->freelist = get_freepointer(kmem_cache_node, n); page->inuse++; - kmalloc_caches->node[node] = n; + kmem_cache_node->node[node] = n; #ifdef CONFIG_SLUB_DEBUG - init_object(kmalloc_caches, n, 1); - init_tracking(kmalloc_caches, n); + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); #endif - init_kmem_cache_node(n, kmalloc_caches); - inc_slabs_node(kmalloc_caches, node, page->objects); + init_kmem_cache_node(n, kmem_cache_node); + inc_slabs_node(kmem_cache_node, node, page->objects); /* * lockdep requires consistent irq usage for each lock @@ -2249,57 +2172,38 @@ static void free_kmem_cache_nodes(struct kmem_cache *s) for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = s->node[node]; - if (n && n != &s->local_node) - kmem_cache_free(kmalloc_caches, n); + + if (n) + kmem_cache_free(kmem_cache_node, n); + s->node[node] = NULL; } } -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) +static int init_kmem_cache_nodes(struct kmem_cache *s) { int node; - int local_node; - - if (slab_state >= UP) - local_node = page_to_nid(virt_to_page(s)); - else - local_node = 0; for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n; - if (local_node == node) - n = &s->local_node; - else { - if (slab_state == DOWN) { - early_kmem_cache_node_alloc(gfpflags, node); - continue; - } - n = kmem_cache_alloc_node(kmalloc_caches, - gfpflags, node); - - if (!n) { - free_kmem_cache_nodes(s); - return 0; - } + if (slab_state == DOWN) { + early_kmem_cache_node_alloc(node); + continue; + } + n = kmem_cache_alloc_node(kmem_cache_node, + GFP_KERNEL, node); + if (!n) { + free_kmem_cache_nodes(s); + return 0; } + s->node[node] = n; init_kmem_cache_node(n, s); } return 1; } -#else -static void free_kmem_cache_nodes(struct kmem_cache *s) -{ -} - -static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags) -{ - init_kmem_cache_node(&s->local_node, s); - return 1; -} -#endif static void set_min_partial(struct kmem_cache *s, unsigned long min) { @@ -2395,6 +2299,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * on bootup. */ align = calculate_alignment(flags, align, s->objsize); + s->align = align; /* * SLUB stores one object immediately after another beginning from @@ -2433,7 +2338,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) } -static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, +static int kmem_cache_open(struct kmem_cache *s, const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) @@ -2447,6 +2352,18 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, if (!calculate_sizes(s, -1)) goto error; + if (disable_higher_order_debug) { + /* + * Disable debugging flags that store metadata if the min slab + * order increased. + */ + if (get_order(s->size) > get_order(s->objsize)) { + s->flags &= ~DEBUG_METADATA_FLAGS; + s->offset = 0; + if (!calculate_sizes(s, -1)) + goto error; + } + } /* * The larger the object size is, the more pages we want on the partial @@ -2457,11 +2374,12 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags, #ifdef CONFIG_NUMA s->remote_node_defrag_ratio = 1000; #endif - if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA)) + if (!init_kmem_cache_nodes(s)) goto error; - if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA)) + if (alloc_kmem_cache_cpus(s)) return 1; + free_kmem_cache_nodes(s); error: if (flags & SLAB_PANIC) @@ -2473,32 +2391,6 @@ error: } /* - * Check if a given pointer is valid - */ -int kmem_ptr_validate(struct kmem_cache *s, const void *object) -{ - struct page *page; - - page = get_object_page(object); - - if (!page || s != page->slab) - /* No slab or wrong slab */ - return 0; - - if (!check_valid_pointer(s, page, object)) - return 0; - - /* - * We could also check if the object is on the slabs freelist. - * But this would be too expensive and it seems that the main - * purpose of kmem_ptr_valid() is to check if the object belongs - * to a certain slab. - */ - return 1; -} -EXPORT_SYMBOL(kmem_ptr_validate); - -/* * Determine the size of a slab object */ unsigned int kmem_cache_size(struct kmem_cache *s) @@ -2519,9 +2411,10 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG void *addr = page_address(page); void *p; - DECLARE_BITMAP(map, page->objects); - - bitmap_zero(map, page->objects); + unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) * + sizeof(long), GFP_ATOMIC); + if (!map) + return; slab_err(s, page, "%s", text); slab_lock(page); for_each_free_object(p, s, page->freelist) @@ -2536,6 +2429,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, } } slab_unlock(page); + kfree(map); #endif } @@ -2550,9 +2444,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry_safe(page, h, &n->partial, lru) { if (!page->inuse) { - list_del(&page->lru); + __remove_partial(n, page); discard_slab(s, page); - n->nr_partial--; } else { list_slab_objects(s, page, "Objects remaining on kmem_cache_close()"); @@ -2569,9 +2462,8 @@ static inline int kmem_cache_close(struct kmem_cache *s) int node; flush_all(s); - + free_percpu(s->cpu_slab); /* Attempt to free all objects */ - free_kmem_cache_cpus(s); for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -2593,15 +2485,16 @@ void kmem_cache_destroy(struct kmem_cache *s) s->refcount--; if (!s->refcount) { list_del(&s->list); - up_write(&slub_lock); if (kmem_cache_close(s)) { printk(KERN_ERR "SLUB %s: %s called for cache that " "still has objects.\n", s->name, __func__); dump_stack(); } + if (s->flags & SLAB_DESTROY_BY_RCU) + rcu_barrier(); sysfs_slab_remove(s); - } else - up_write(&slub_lock); + } + up_write(&slub_lock); } EXPORT_SYMBOL(kmem_cache_destroy); @@ -2609,9 +2502,15 @@ EXPORT_SYMBOL(kmem_cache_destroy); * Kmalloc subsystem *******************************************************************/ -struct kmem_cache kmalloc_caches[SLUB_PAGE_SHIFT] __cacheline_aligned; +struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT]; EXPORT_SYMBOL(kmalloc_caches); +static struct kmem_cache *kmem_cache; + +#ifdef CONFIG_ZONE_DMA +static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT]; +#endif + static int __init setup_slub_min_order(char *str) { get_option(&str, &slub_min_order); @@ -2648,108 +2547,29 @@ static int __init setup_slub_nomerge(char *str) __setup("slub_nomerge", setup_slub_nomerge); -static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s, - const char *name, int size, gfp_t gfp_flags) +static struct kmem_cache *__init create_kmalloc_cache(const char *name, + int size, unsigned int flags) { - unsigned int flags = 0; + struct kmem_cache *s; - if (gfp_flags & SLUB_DMA) - flags = SLAB_CACHE_DMA; + s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); /* * This function is called with IRQs disabled during early-boot on * single CPU so there's no need to take slub_lock here. */ - if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN, + if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN, flags, NULL)) goto panic; list_add(&s->list, &slab_caches); - - if (sysfs_slab_add(s)) - goto panic; return s; panic: panic("Creation of kmalloc slab %s size=%d failed.\n", name, size); + return NULL; } -#ifdef CONFIG_ZONE_DMA -static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT]; - -static void sysfs_add_func(struct work_struct *w) -{ - struct kmem_cache *s; - - down_write(&slub_lock); - list_for_each_entry(s, &slab_caches, list) { - if (s->flags & __SYSFS_ADD_DEFERRED) { - s->flags &= ~__SYSFS_ADD_DEFERRED; - sysfs_slab_add(s); - } - } - up_write(&slub_lock); -} - -static DECLARE_WORK(sysfs_add_work, sysfs_add_func); - -static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags) -{ - struct kmem_cache *s; - char *text; - size_t realsize; - unsigned long slabflags; - - s = kmalloc_caches_dma[index]; - if (s) - return s; - - /* Dynamically create dma cache */ - if (flags & __GFP_WAIT) - down_write(&slub_lock); - else { - if (!down_write_trylock(&slub_lock)) - goto out; - } - - if (kmalloc_caches_dma[index]) - goto unlock_out; - - realsize = kmalloc_caches[index].objsize; - text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d", - (unsigned int)realsize); - s = kmalloc(kmem_size, flags & ~SLUB_DMA); - - /* - * Must defer sysfs creation to a workqueue because we don't know - * what context we are called from. Before sysfs comes up, we don't - * need to do anything because our sysfs initcall will start by - * adding all existing slabs to sysfs. - */ - slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK; - if (slab_state >= SYSFS) - slabflags |= __SYSFS_ADD_DEFERRED; - - if (!s || !text || !kmem_cache_open(s, flags, text, - realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) { - kfree(s); - kfree(text); - goto unlock_out; - } - - list_add(&s->list, &slab_caches); - kmalloc_caches_dma[index] = s; - - if (slab_state >= SYSFS) - schedule_work(&sysfs_add_work); - -unlock_out: - up_write(&slub_lock); -out: - return kmalloc_caches_dma[index]; -} -#endif - /* * Conversion table for small slabs sizes / 8 to the index in the * kmalloc array. This is necessary for slabs < 192 since we have non power @@ -2783,6 +2603,11 @@ static s8 size_index[24] = { 2 /* 192 */ }; +static inline int size_index_elem(size_t bytes) +{ + return (bytes - 1) / 8; +} + static struct kmem_cache *get_slab(size_t size, gfp_t flags) { int index; @@ -2791,16 +2616,16 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags) if (!size) return ZERO_SIZE_PTR; - index = size_index[(size - 1) / 8]; + index = size_index[size_index_elem(size)]; } else index = fls(size - 1); #ifdef CONFIG_ZONE_DMA if (unlikely((flags & SLUB_DMA))) - return dma_kmalloc_cache(index, flags); + return kmalloc_dma_caches[index]; #endif - return &kmalloc_caches[index]; + return kmalloc_caches[index]; } void *__kmalloc(size_t size, gfp_t flags) @@ -2816,7 +2641,7 @@ void *__kmalloc(size_t size, gfp_t flags) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, flags, -1, _RET_IP_); + ret = slab_alloc(s, flags, NUMA_NO_NODE, _RET_IP_); trace_kmalloc(_RET_IP_, ret, size, s->size, flags); @@ -2824,19 +2649,21 @@ void *__kmalloc(size_t size, gfp_t flags) } EXPORT_SYMBOL(__kmalloc); +#ifdef CONFIG_NUMA static void *kmalloc_large_node(size_t size, gfp_t flags, int node) { struct page *page; + void *ptr = NULL; flags |= __GFP_COMP | __GFP_NOTRACK; page = alloc_pages_node(node, flags, get_order(size)); if (page) - return page_address(page); - else - return NULL; + ptr = page_address(page); + + kmemleak_alloc(ptr, size, 1, flags); + return ptr; } -#ifdef CONFIG_NUMA void *__kmalloc_node(size_t size, gfp_t flags, int node) { struct kmem_cache *s; @@ -2918,6 +2745,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); + kmemleak_free(x); put_page(page); return; } @@ -2975,8 +2803,7 @@ int kmem_cache_shrink(struct kmem_cache *s) * may have freed the last object and be * waiting to release the slab. */ - list_del(&page->lru); - n->nr_partial--; + __remove_partial(n, page); slab_unlock(page); discard_slab(s, page); } else { @@ -3000,7 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s) } EXPORT_SYMBOL(kmem_cache_shrink); -#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG) +#if defined(CONFIG_MEMORY_HOTPLUG) static int slab_mem_going_offline_callback(void *arg) { struct kmem_cache *s; @@ -3036,13 +2863,13 @@ static void slab_mem_offline_callback(void *arg) /* * if n->nr_slabs > 0, slabs still exist on the node * that is going down. We were unable to free them, - * and offline_pages() function shoudn't call this + * and offline_pages() function shouldn't call this * callback. So, we must fail. */ BUG_ON(slabs_node(s, offline_node)); s->node[offline_node] = NULL; - kmem_cache_free(kmalloc_caches, n); + kmem_cache_free(kmem_cache_node, n); } } up_read(&slub_lock); @@ -3075,7 +2902,7 @@ static int slab_mem_going_online_callback(void *arg) * since memory is not yet available from the node that * is brought up. */ - n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL); + n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL); if (!n) { ret = -ENOMEM; goto out; @@ -3121,46 +2948,92 @@ static int slab_memory_callback(struct notifier_block *self, * Basic setup of slabs *******************************************************************/ +/* + * Used for early kmem_cache structures that were allocated using + * the page allocator + */ + +static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s) +{ + int node; + + list_add(&s->list, &slab_caches); + s->refcount = -1; + + for_each_node_state(node, N_NORMAL_MEMORY) { + struct kmem_cache_node *n = get_node(s, node); + struct page *p; + + if (n) { + list_for_each_entry(p, &n->partial, lru) + p->slab = s; + +#ifdef CONFIG_SLAB_DEBUG + list_for_each_entry(p, &n->full, lru) + p->slab = s; +#endif + } + } +} + void __init kmem_cache_init(void) { int i; int caches = 0; + struct kmem_cache *temp_kmem_cache; + int order; + struct kmem_cache *temp_kmem_cache_node; + unsigned long kmalloc_size; - init_alloc_cpu(); + kmem_size = offsetof(struct kmem_cache, node) + + nr_node_ids * sizeof(struct kmem_cache_node *); + + /* Allocate two kmem_caches from the page allocator */ + kmalloc_size = ALIGN(kmem_size, cache_line_size()); + order = get_order(2 * kmalloc_size); + kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order); -#ifdef CONFIG_NUMA /* * Must first have the slab cache available for the allocations of the * struct kmem_cache_node's. There is special bootstrap code in * kmem_cache_open for slab_state == DOWN. */ - create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node", - sizeof(struct kmem_cache_node), GFP_NOWAIT); - kmalloc_caches[0].refcount = -1; - caches++; + kmem_cache_node = (void *)kmem_cache + kmalloc_size; + + kmem_cache_open(kmem_cache_node, "kmem_cache_node", + sizeof(struct kmem_cache_node), + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI); -#endif /* Able to allocate the per node structures */ slab_state = PARTIAL; - /* Caches that are not of the two-to-the-power-of size */ - if (KMALLOC_MIN_SIZE <= 64) { - create_kmalloc_cache(&kmalloc_caches[1], - "kmalloc-96", 96, GFP_NOWAIT); - caches++; - create_kmalloc_cache(&kmalloc_caches[2], - "kmalloc-192", 192, GFP_NOWAIT); - caches++; - } + temp_kmem_cache = kmem_cache; + kmem_cache_open(kmem_cache, "kmem_cache", kmem_size, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache, temp_kmem_cache, kmem_size); - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { - create_kmalloc_cache(&kmalloc_caches[i], - "kmalloc", 1 << i, GFP_NOWAIT); - caches++; - } + /* + * Allocate kmem_cache_node properly from the kmem_cache slab. + * kmem_cache_node is separately allocated so no need to + * update any list pointers. + */ + temp_kmem_cache_node = kmem_cache_node; + kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); + + kmem_cache_bootstrap_fixup(kmem_cache_node); + + caches++; + kmem_cache_bootstrap_fixup(kmem_cache); + caches++; + /* Free temporary boot structure */ + free_pages((unsigned long)temp_kmem_cache, order); + + /* Now we can use the kmem_cache to allocate kmalloc slabs */ /* * Patch up the size_index table if we have strange large alignment @@ -3176,34 +3049,84 @@ void __init kmem_cache_init(void) BUILD_BUG_ON(KMALLOC_MIN_SIZE > 256 || (KMALLOC_MIN_SIZE & (KMALLOC_MIN_SIZE - 1))); - for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) - size_index[(i - 1) / 8] = KMALLOC_SHIFT_LOW; + for (i = 8; i < KMALLOC_MIN_SIZE; i += 8) { + int elem = size_index_elem(i); + if (elem >= ARRAY_SIZE(size_index)) + break; + size_index[elem] = KMALLOC_SHIFT_LOW; + } - if (KMALLOC_MIN_SIZE == 128) { + if (KMALLOC_MIN_SIZE == 64) { + /* + * The 96 byte size cache is not used if the alignment + * is 64 byte. + */ + for (i = 64 + 8; i <= 96; i += 8) + size_index[size_index_elem(i)] = 7; + } else if (KMALLOC_MIN_SIZE == 128) { /* * The 192 byte sized cache is not used if the alignment * is 128 byte. Redirect kmalloc to use the 256 byte cache * instead. */ for (i = 128 + 8; i <= 192; i += 8) - size_index[(i - 1) / 8] = 8; + size_index[size_index_elem(i)] = 8; + } + + /* Caches that are not of the two-to-the-power-of size */ + if (KMALLOC_MIN_SIZE <= 32) { + kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0); + caches++; + } + + if (KMALLOC_MIN_SIZE <= 64) { + kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0); + caches++; + } + + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0); + caches++; } slab_state = UP; /* Provide the correct kmalloc names now that the caches are up */ - for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) - kmalloc_caches[i]. name = - kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); + if (KMALLOC_MIN_SIZE <= 32) { + kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT); + BUG_ON(!kmalloc_caches[1]->name); + } + + if (KMALLOC_MIN_SIZE <= 64) { + kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT); + BUG_ON(!kmalloc_caches[2]->name); + } + + for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) { + char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i); + + BUG_ON(!s); + kmalloc_caches[i]->name = s; + } #ifdef CONFIG_SMP register_cpu_notifier(&slab_notifier); - kmem_size = offsetof(struct kmem_cache, cpu_slab) + - nr_cpu_ids * sizeof(struct kmem_cache_cpu *); -#else - kmem_size = sizeof(struct kmem_cache); #endif +#ifdef CONFIG_ZONE_DMA + for (i = 0; i < SLUB_PAGE_SHIFT; i++) { + struct kmem_cache *s = kmalloc_caches[i]; + + if (s && s->size) { + char *name = kasprintf(GFP_NOWAIT, + "dma-kmalloc-%d", s->objsize); + + BUG_ON(!name); + kmalloc_dma_caches[i] = create_kmalloc_cache(name, + s->objsize, SLAB_CACHE_DMA); + } + } +#endif printk(KERN_INFO "SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d," " CPUs=%d, Nodes=%d\n", @@ -3281,58 +3204,54 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, size_t align, unsigned long flags, void (*ctor)(void *)) { struct kmem_cache *s; + char *n; + + if (WARN_ON(!name)) + return NULL; down_write(&slub_lock); s = find_mergeable(size, align, flags, name, ctor); if (s) { - int cpu; - s->refcount++; /* * Adjust the object sizes so that we clear * the complete object on kzalloc. */ s->objsize = max(s->objsize, (int)size); - - /* - * And then we need to update the object size in the - * per cpu structures - */ - for_each_online_cpu(cpu) - get_cpu_slab(s, cpu)->objsize = s->objsize; - s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *))); - up_write(&slub_lock); if (sysfs_slab_alias(s, name)) { - down_write(&slub_lock); s->refcount--; - up_write(&slub_lock); goto err; } + up_write(&slub_lock); return s; } + n = kstrdup(name, GFP_KERNEL); + if (!n) + goto err; + s = kmalloc(kmem_size, GFP_KERNEL); if (s) { - if (kmem_cache_open(s, GFP_KERNEL, name, + if (kmem_cache_open(s, n, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); - up_write(&slub_lock); if (sysfs_slab_add(s)) { - down_write(&slub_lock); list_del(&s->list); - up_write(&slub_lock); + kfree(n); kfree(s); goto err; } + up_write(&slub_lock); return s; } + kfree(n); kfree(s); } +err: up_write(&slub_lock); -err: if (flags & SLAB_PANIC) panic("Cannot create slabcache %s\n", name); else @@ -3354,29 +3273,15 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, unsigned long flags; switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - init_alloc_cpu_cpu(cpu); - down_read(&slub_lock); - list_for_each_entry(s, &slab_caches, list) - s->cpu_slab[cpu] = alloc_kmem_cache_cpu(s, cpu, - GFP_KERNEL); - up_read(&slub_lock); - break; - case CPU_UP_CANCELED: case CPU_UP_CANCELED_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: down_read(&slub_lock); list_for_each_entry(s, &slab_caches, list) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); - local_irq_save(flags); __flush_cpu_slab(s, cpu); local_irq_restore(flags); - free_kmem_cache_cpu(c, cpu); - s->cpu_slab[cpu] = NULL; } up_read(&slub_lock); break; @@ -3405,7 +3310,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) if (unlikely(ZERO_OR_NULL_PTR(s))) return s; - ret = slab_alloc(s, gfpflags, -1, caller); + ret = slab_alloc(s, gfpflags, NUMA_NO_NODE, caller); /* Honor the call site pointer we recieved. */ trace_kmalloc(caller, ret, size, s->size, gfpflags); @@ -3413,14 +3318,22 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller) return ret; } +#ifdef CONFIG_NUMA void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, int node, unsigned long caller) { struct kmem_cache *s; void *ret; - if (unlikely(size > SLUB_MAX_SIZE)) - return kmalloc_large_node(size, gfpflags, node); + if (unlikely(size > SLUB_MAX_SIZE)) { + ret = kmalloc_large_node(size, gfpflags, node); + + trace_kmalloc_node(caller, ret, + size, PAGE_SIZE << get_order(size), + gfpflags, node); + + return ret; + } s = get_slab(size, gfpflags); @@ -3434,8 +3347,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags, return ret; } +#endif -#ifdef CONFIG_SLUB_DEBUG +#ifdef CONFIG_SYSFS static int count_inuse(struct page *page) { return page->inuse; @@ -3445,7 +3359,9 @@ static int count_total(struct page *page) { return page->objects; } +#endif +#ifdef CONFIG_SLUB_DEBUG static int validate_slab(struct kmem_cache *s, struct page *page, unsigned long *map) { @@ -3461,13 +3377,13 @@ static int validate_slab(struct kmem_cache *s, struct page *page, for_each_free_object(p, s, page->freelist) { set_bit(slab_index(p, s, addr), map); - if (!check_object(s, page, p, 0)) + if (!check_object(s, page, p, SLUB_RED_INACTIVE)) return 0; } for_each_object(p, s, addr, page->objects) if (!test_bit(slab_index(p, s, addr), map)) - if (!check_object(s, page, p, 1)) + if (!check_object(s, page, p, SLUB_RED_ACTIVE)) return 0; return 1; } @@ -3481,16 +3397,6 @@ static void validate_slab_slab(struct kmem_cache *s, struct page *page, } else printk(KERN_INFO "SLUB %s: Skipped busy slab 0x%p\n", s->name, page); - - if (s->flags & DEBUG_DEFAULT_FLAGS) { - if (!PageSlubDebug(page)) - printk(KERN_ERR "SLUB %s: SlubDebug not set " - "on slab 0x%p\n", s->name, page); - } else { - if (PageSlubDebug(page)) - printk(KERN_ERR "SLUB %s: SlubDebug set on " - "slab 0x%p\n", s->name, page); - } } static int validate_slab_node(struct kmem_cache *s, @@ -3546,65 +3452,6 @@ static long validate_slab_cache(struct kmem_cache *s) kfree(map); return count; } - -#ifdef SLUB_RESILIENCY_TEST -static void resiliency_test(void) -{ - u8 *p; - - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); - - p = kzalloc(16, GFP_KERNEL); - p[16] = 0x12; - printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); - - validate_slab_cache(kmalloc_caches + 4); - - /* Hmmm... The next two are dangerous */ - p = kzalloc(32, GFP_KERNEL); - p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - - validate_slab_cache(kmalloc_caches + 5); - p = kzalloc(64, GFP_KERNEL); - p += 64 + (get_cycles() & 0xff) * sizeof(void *); - *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); - validate_slab_cache(kmalloc_caches + 6); - - printk(KERN_ERR "\nB. Corruption after free\n"); - p = kzalloc(128, GFP_KERNEL); - kfree(p); - *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 7); - - p = kzalloc(256, GFP_KERNEL); - kfree(p); - p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); - validate_slab_cache(kmalloc_caches + 8); - - p = kzalloc(512, GFP_KERNEL); - kfree(p); - p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); - validate_slab_cache(kmalloc_caches + 9); -} -#else -static void resiliency_test(void) {}; -#endif - /* * Generate lists of code addresses where slabcache objects are allocated * and freed. @@ -3732,10 +3579,10 @@ static int add_location(struct loc_track *t, struct kmem_cache *s, } static void process_slab(struct loc_track *t, struct kmem_cache *s, - struct page *page, enum track_item alloc) + struct page *page, enum track_item alloc, + unsigned long *map) { void *addr = page_address(page); - DECLARE_BITMAP(map, page->objects); void *p; bitmap_zero(map, page->objects); @@ -3754,11 +3601,14 @@ static int list_locations(struct kmem_cache *s, char *buf, unsigned long i; struct loc_track t = { 0, 0, NULL }; int node; + unsigned long *map = kmalloc(BITS_TO_LONGS(oo_objects(s->max)) * + sizeof(unsigned long), GFP_KERNEL); - if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), - GFP_TEMPORARY)) + if (!map || !alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location), + GFP_TEMPORARY)) { + kfree(map); return sprintf(buf, "Out of memory\n"); - + } /* Push back cpu slabs */ flush_all(s); @@ -3772,9 +3622,9 @@ static int list_locations(struct kmem_cache *s, char *buf, spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, lru) - process_slab(&t, s, page, alloc); + process_slab(&t, s, page, alloc, map); list_for_each_entry(page, &n->full, lru) - process_slab(&t, s, page, alloc); + process_slab(&t, s, page, alloc, map); spin_unlock_irqrestore(&n->list_lock, flags); } @@ -3825,11 +3675,76 @@ static int list_locations(struct kmem_cache *s, char *buf, } free_loc_track(&t); + kfree(map); if (!t.count) len += sprintf(buf, "No data\n"); return len; } +#endif + +#ifdef SLUB_RESILIENCY_TEST +static void resiliency_test(void) +{ + u8 *p; + + BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10); + + printk(KERN_ERR "SLUB resiliency testing\n"); + printk(KERN_ERR "-----------------------\n"); + printk(KERN_ERR "A. Corruption after allocation\n"); + + p = kzalloc(16, GFP_KERNEL); + p[16] = 0x12; + printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer" + " 0x12->0x%p\n\n", p + 16); + + validate_slab_cache(kmalloc_caches[4]); + /* Hmmm... The next two are dangerous */ + p = kzalloc(32, GFP_KERNEL); + p[32 + sizeof(void *)] = 0x34; + printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" + " 0x34 -> -0x%p\n", p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + + validate_slab_cache(kmalloc_caches[5]); + p = kzalloc(64, GFP_KERNEL); + p += 64 + (get_cycles() & 0xff) * sizeof(void *); + *p = 0x56; + printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + printk(KERN_ERR + "If allocated object is overwritten then not detectable\n\n"); + validate_slab_cache(kmalloc_caches[6]); + + printk(KERN_ERR "\nB. Corruption after free\n"); + p = kzalloc(128, GFP_KERNEL); + kfree(p); + *p = 0x78; + printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[7]); + + p = kzalloc(256, GFP_KERNEL); + kfree(p); + p[50] = 0x9a; + printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", + p); + validate_slab_cache(kmalloc_caches[8]); + + p = kzalloc(512, GFP_KERNEL); + kfree(p); + p[512] = 0xab; + printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + validate_slab_cache(kmalloc_caches[9]); +} +#else +#ifdef CONFIG_SYSFS +static void resiliency_test(void) {}; +#endif +#endif + +#ifdef CONFIG_SYSFS enum slab_stat_type { SL_ALL, /* All slabs */ SL_PARTIAL, /* Only partially allocated slabs */ @@ -3862,7 +3777,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, int cpu; for_each_possible_cpu(cpu) { - struct kmem_cache_cpu *c = get_cpu_slab(s, cpu); + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); if (!c || c->node < 0) continue; @@ -3882,6 +3797,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } + down_read(&slub_lock); +#ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3898,7 +3815,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s, nodes[node] += x; } - } else if (flags & SO_PARTIAL) { + } else +#endif + if (flags & SO_PARTIAL) { for_each_node_state(node, N_NORMAL_MEMORY) { struct kmem_cache_node *n = get_node(s, node); @@ -3919,10 +3838,12 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif + up_read(&slub_lock); kfree(nodes); return x + sprintf(buf + x, "\n"); } +#ifdef CONFIG_SLUB_DEBUG static int any_slab_objects(struct kmem_cache *s) { int node; @@ -3938,6 +3859,7 @@ static int any_slab_objects(struct kmem_cache *s) } return 0; } +#endif #define to_slab_attr(n) container_of(n, struct slab_attribute, attr) #define to_slab(n) container_of(n, struct kmem_cache, kobj); @@ -4039,12 +3961,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(aliases); -static ssize_t slabs_show(struct kmem_cache *s, char *buf) -{ - return show_slab_objects(s, buf, SO_ALL); -} -SLAB_ATTR_RO(slabs); - static ssize_t partial_show(struct kmem_cache *s, char *buf) { return show_slab_objects(s, buf, SO_PARTIAL); @@ -4069,6 +3985,48 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(objects_partial); +static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); +} + +static ssize_t reclaim_account_store(struct kmem_cache *s, + const char *buf, size_t length) +{ + s->flags &= ~SLAB_RECLAIM_ACCOUNT; + if (buf[0] == '1') + s->flags |= SLAB_RECLAIM_ACCOUNT; + return length; +} +SLAB_ATTR(reclaim_account); + +static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); +} +SLAB_ATTR_RO(hwcache_align); + +#ifdef CONFIG_ZONE_DMA +static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); +} +SLAB_ATTR_RO(cache_dma); +#endif + +static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); +} +SLAB_ATTR_RO(destroy_by_rcu); + +#ifdef CONFIG_SLUB_DEBUG +static ssize_t slabs_show(struct kmem_cache *s, char *buf) +{ + return show_slab_objects(s, buf, SO_ALL); +} +SLAB_ATTR_RO(slabs); + static ssize_t total_objects_show(struct kmem_cache *s, char *buf) { return show_slab_objects(s, buf, SO_ALL|SO_TOTAL); @@ -4105,41 +4063,6 @@ static ssize_t trace_store(struct kmem_cache *s, const char *buf, } SLAB_ATTR(trace); -static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT)); -} - -static ssize_t reclaim_account_store(struct kmem_cache *s, - const char *buf, size_t length) -{ - s->flags &= ~SLAB_RECLAIM_ACCOUNT; - if (buf[0] == '1') - s->flags |= SLAB_RECLAIM_ACCOUNT; - return length; -} -SLAB_ATTR(reclaim_account); - -static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN)); -} -SLAB_ATTR_RO(hwcache_align); - -#ifdef CONFIG_ZONE_DMA -static ssize_t cache_dma_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA)); -} -SLAB_ATTR_RO(cache_dma); -#endif - -static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU)); -} -SLAB_ATTR_RO(destroy_by_rcu); - static ssize_t red_zone_show(struct kmem_cache *s, char *buf) { return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE)); @@ -4216,6 +4139,40 @@ static ssize_t validate_store(struct kmem_cache *s, } SLAB_ATTR(validate); +static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_ALLOC); +} +SLAB_ATTR_RO(alloc_calls); + +static ssize_t free_calls_show(struct kmem_cache *s, char *buf) +{ + if (!(s->flags & SLAB_STORE_USER)) + return -ENOSYS; + return list_locations(s, buf, TRACK_FREE); +} +SLAB_ATTR_RO(free_calls); +#endif /* CONFIG_SLUB_DEBUG */ + +#ifdef CONFIG_FAILSLAB +static ssize_t failslab_show(struct kmem_cache *s, char *buf) +{ + return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB)); +} + +static ssize_t failslab_store(struct kmem_cache *s, const char *buf, + size_t length) +{ + s->flags &= ~SLAB_FAILSLAB; + if (buf[0] == '1') + s->flags |= SLAB_FAILSLAB; + return length; +} +SLAB_ATTR(failslab); +#endif + static ssize_t shrink_show(struct kmem_cache *s, char *buf) { return 0; @@ -4235,22 +4192,6 @@ static ssize_t shrink_store(struct kmem_cache *s, } SLAB_ATTR(shrink); -static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_ALLOC); -} -SLAB_ATTR_RO(alloc_calls); - -static ssize_t free_calls_show(struct kmem_cache *s, char *buf) -{ - if (!(s->flags & SLAB_STORE_USER)) - return -ENOSYS; - return list_locations(s, buf, TRACK_FREE); -} -SLAB_ATTR_RO(free_calls); - #ifdef CONFIG_NUMA static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf) { @@ -4287,7 +4228,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return -ENOMEM; for_each_online_cpu(cpu) { - unsigned x = get_cpu_slab(s, cpu)->stat[si]; + unsigned x = per_cpu_ptr(s->cpu_slab, cpu)->stat[si]; data[cpu] = x; sum += x; @@ -4305,12 +4246,28 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si) return len + sprintf(buf + len, "\n"); } +static void clear_stat(struct kmem_cache *s, enum stat_item si) +{ + int cpu; + + for_each_online_cpu(cpu) + per_cpu_ptr(s->cpu_slab, cpu)->stat[si] = 0; +} + #define STAT_ATTR(si, text) \ static ssize_t text##_show(struct kmem_cache *s, char *buf) \ { \ return show_stat(s, buf, si); \ } \ -SLAB_ATTR_RO(text); \ +static ssize_t text##_store(struct kmem_cache *s, \ + const char *buf, size_t length) \ +{ \ + if (buf[0] != '0') \ + return -EINVAL; \ + clear_stat(s, si); \ + return length; \ +} \ +SLAB_ATTR(text); \ STAT_ATTR(ALLOC_FASTPATH, alloc_fastpath); STAT_ATTR(ALLOC_SLOWPATH, alloc_slowpath); @@ -4340,25 +4297,27 @@ static struct attribute *slab_attrs[] = { &min_partial_attr.attr, &objects_attr.attr, &objects_partial_attr.attr, - &total_objects_attr.attr, - &slabs_attr.attr, &partial_attr.attr, &cpu_slabs_attr.attr, &ctor_attr.attr, &aliases_attr.attr, &align_attr.attr, - &sanity_checks_attr.attr, - &trace_attr.attr, &hwcache_align_attr.attr, &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, + &shrink_attr.attr, +#ifdef CONFIG_SLUB_DEBUG + &total_objects_attr.attr, + &slabs_attr.attr, + &sanity_checks_attr.attr, + &trace_attr.attr, &red_zone_attr.attr, &poison_attr.attr, &store_user_attr.attr, &validate_attr.attr, - &shrink_attr.attr, &alloc_calls_attr.attr, &free_calls_attr.attr, +#endif #ifdef CONFIG_ZONE_DMA &cache_dma_attr.attr, #endif @@ -4385,6 +4344,10 @@ static struct attribute *slab_attrs[] = { &deactivate_remote_frees_attr.attr, &order_fallback_attr.attr, #endif +#ifdef CONFIG_FAILSLAB + &failslab_attr.attr, +#endif + NULL }; @@ -4434,10 +4397,11 @@ static void kmem_cache_release(struct kobject *kobj) { struct kmem_cache *s = to_slab(kobj); + kfree(s->name); kfree(s); } -static struct sysfs_ops slab_sysfs_ops = { +static const struct sysfs_ops slab_sysfs_ops = { .show = slab_attr_show, .store = slab_attr_store, }; @@ -4456,7 +4420,7 @@ static int uevent_filter(struct kset *kset, struct kobject *kobj) return 0; } -static struct kset_uevent_ops slab_uevent_ops = { +static const struct kset_uevent_ops slab_uevent_ops = { .filter = uevent_filter, }; @@ -4533,8 +4497,11 @@ static int sysfs_slab_add(struct kmem_cache *s) } err = sysfs_create_group(&s->kobj, &slab_attr_group); - if (err) + if (err) { + kobject_del(&s->kobj); + kobject_put(&s->kobj); return err; + } kobject_uevent(&s->kobj, KOBJ_ADD); if (!unmergeable) { /* Setup first alias */ @@ -4546,6 +4513,13 @@ static int sysfs_slab_add(struct kmem_cache *s) static void sysfs_slab_remove(struct kmem_cache *s) { + if (slab_state < SYSFS) + /* + * Sysfs has not been setup yet so no need to remove the + * cache from sysfs. + */ + return; + kobject_uevent(&s->kobj, KOBJ_REMOVE); kobject_del(&s->kobj); kobject_put(&s->kobj); @@ -4591,8 +4565,11 @@ static int __init slab_sysfs_init(void) struct kmem_cache *s; int err; + down_write(&slub_lock); + slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { + up_write(&slub_lock); printk(KERN_ERR "Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -4617,12 +4594,13 @@ static int __init slab_sysfs_init(void) kfree(al); } + up_write(&slub_lock); resiliency_test(); return 0; } __initcall(slab_sysfs_init); -#endif +#endif /* CONFIG_SYSFS */ /* * The /proc/slabinfo ABI @@ -4716,7 +4694,7 @@ static const struct file_operations proc_slabinfo_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); + proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations); return 0; } module_init(slab_proc_init); diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c index a13ea64..29d6cbf 100644 --- a/mm/sparse-vmemmap.c +++ b/mm/sparse-vmemmap.c @@ -22,6 +22,7 @@ #include <linux/bootmem.h> #include <linux/highmem.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/spinlock.h> #include <linux/vmalloc.h> #include <linux/sched.h> @@ -40,16 +41,24 @@ static void * __init_refok __earlyonly_bootmem_alloc(int node, unsigned long align, unsigned long goal) { - return __alloc_bootmem_node(NODE_DATA(node), size, align, goal); + return __alloc_bootmem_node_high(NODE_DATA(node), size, align, goal); } +static void *vmemmap_buf; +static void *vmemmap_buf_end; void * __meminit vmemmap_alloc_block(unsigned long size, int node) { /* If the main allocator is up use that, fallback to bootmem. */ if (slab_is_available()) { - struct page *page = alloc_pages_node(node, + struct page *page; + + if (node_state(node, N_HIGH_MEMORY)) + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, get_order(size)); + else + page = alloc_pages(GFP_KERNEL | __GFP_ZERO, + get_order(size)); if (page) return page_address(page); return NULL; @@ -58,6 +67,24 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node) __pa(MAX_DMA_ADDRESS)); } +/* need to make sure size is all the same during early stage */ +void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node) +{ + void *ptr; + + if (!vmemmap_buf) + return vmemmap_alloc_block(size, node); + + /* take the from buf */ + ptr = (void *)ALIGN((unsigned long)vmemmap_buf, size); + if (ptr + size > vmemmap_buf_end) + return vmemmap_alloc_block(size, node); + + vmemmap_buf = ptr + size; + + return ptr; +} + void __meminit vmemmap_verify(pte_t *pte, int node, unsigned long start, unsigned long end) { @@ -74,7 +101,7 @@ pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node) pte_t *pte = pte_offset_kernel(pmd, addr); if (pte_none(*pte)) { pte_t entry; - void *p = vmemmap_alloc_block(PAGE_SIZE, node); + void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node); if (!p) return NULL; entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL); @@ -157,3 +184,44 @@ struct page * __meminit sparse_mem_map_populate(unsigned long pnum, int nid) return map; } + +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + void *vmemmap_buf_start; + + size = ALIGN(size, PMD_SIZE); + vmemmap_buf_start = __earlyonly_bootmem_alloc(nodeid, size * map_count, + PMD_SIZE, __pa(MAX_DMA_ADDRESS)); + + if (vmemmap_buf_start) { + vmemmap_buf = vmemmap_buf_start; + vmemmap_buf_end = vmemmap_buf_start + size * map_count; + } + + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + + map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } + + if (vmemmap_buf_start) { + /* need to free left buf */ + free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf); + vmemmap_buf = NULL; + vmemmap_buf_end = NULL; + } +} diff --git a/mm/sparse.c b/mm/sparse.c index da432d9..95ac219 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -2,6 +2,7 @@ * sparse memory mappings. */ #include <linux/mm.h> +#include <linux/slab.h> #include <linux/mmzone.h> #include <linux/bootmem.h> #include <linux/highmem.h> @@ -62,9 +63,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) unsigned long array_size = SECTIONS_PER_ROOT * sizeof(struct mem_section); - if (slab_is_available()) - section = kmalloc_node(array_size, GFP_KERNEL, nid); - else + if (slab_is_available()) { + if (node_state(nid, N_HIGH_MEMORY)) + section = kmalloc_node(array_size, GFP_KERNEL, nid); + else + section = kmalloc(array_size, GFP_KERNEL); + } else section = alloc_bootmem_node(NODE_DATA(nid), array_size); if (section) @@ -268,7 +272,8 @@ static unsigned long *__kmalloc_section_usemap(void) #ifdef CONFIG_MEMORY_HOTREMOVE static unsigned long * __init -sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long count) { unsigned long section_nr; @@ -283,7 +288,7 @@ sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) * this problem. */ section_nr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); - return alloc_bootmem_section(usemap_size(), section_nr); + return alloc_bootmem_section(usemap_size() * count, section_nr); } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) @@ -326,7 +331,8 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #else static unsigned long * __init -sparse_early_usemap_alloc_pgdat_section(struct pglist_data *pgdat) +sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, + unsigned long count) { return NULL; } @@ -336,44 +342,117 @@ static void __init check_usemap_section_nr(int nid, unsigned long *usemap) } #endif /* CONFIG_MEMORY_HOTREMOVE */ -static unsigned long *__init sparse_early_usemap_alloc(unsigned long pnum) +static void __init sparse_early_usemaps_alloc_node(unsigned long**usemap_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long usemap_count, int nodeid) { - unsigned long *usemap; - struct mem_section *ms = __nr_to_section(pnum); - int nid = sparse_early_nid(ms); - - usemap = sparse_early_usemap_alloc_pgdat_section(NODE_DATA(nid)); - if (usemap) - return usemap; + void *usemap; + unsigned long pnum; + int size = usemap_size(); - usemap = alloc_bootmem_node(NODE_DATA(nid), usemap_size()); + usemap = sparse_early_usemaps_alloc_pgdat_section(NODE_DATA(nodeid), + usemap_count); if (usemap) { - check_usemap_section_nr(nid, usemap); - return usemap; + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + } + return; } - /* Stupid: suppress gcc warning for SPARSEMEM && !NUMA */ - nid = 0; + usemap = alloc_bootmem_node(NODE_DATA(nodeid), size * usemap_count); + if (usemap) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + usemap_map[pnum] = usemap; + usemap += size; + check_usemap_section_nr(nodeid, usemap_map[pnum]); + } + return; + } printk(KERN_WARNING "%s: allocation failed\n", __func__); - return NULL; } #ifndef CONFIG_SPARSEMEM_VMEMMAP struct page __init *sparse_mem_map_populate(unsigned long pnum, int nid) { struct page *map; + unsigned long size; map = alloc_remap(nid, sizeof(struct page) * PAGES_PER_SECTION); if (map) return map; - map = alloc_bootmem_pages_node(NODE_DATA(nid), - PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION)); + size = PAGE_ALIGN(sizeof(struct page) * PAGES_PER_SECTION); + map = __alloc_bootmem_node_high(NODE_DATA(nid), size, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); return map; } +void __init sparse_mem_maps_populate_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + void *map; + unsigned long pnum; + unsigned long size = sizeof(struct page) * PAGES_PER_SECTION; + + map = alloc_remap(nodeid, size * map_count); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + size = PAGE_ALIGN(size); + map = __alloc_bootmem_node_high(NODE_DATA(nodeid), size * map_count, + PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (map) { + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = map; + map += size; + } + return; + } + + /* fallback */ + for (pnum = pnum_begin; pnum < pnum_end; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + map_map[pnum] = sparse_mem_map_populate(pnum, nodeid); + if (map_map[pnum]) + continue; + ms = __nr_to_section(pnum); + printk(KERN_ERR "%s: sparsemem memory map backing failed " + "some memory will not be available.\n", __func__); + ms->section_mem_map = 0; + } +} #endif /* !CONFIG_SPARSEMEM_VMEMMAP */ +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER +static void __init sparse_early_mem_maps_alloc_node(struct page **map_map, + unsigned long pnum_begin, + unsigned long pnum_end, + unsigned long map_count, int nodeid) +{ + sparse_mem_maps_populate_node(map_map, pnum_begin, pnum_end, + map_count, nodeid); +} +#else static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) { struct page *map; @@ -389,10 +468,12 @@ static struct page __init *sparse_early_mem_map_alloc(unsigned long pnum) ms->section_mem_map = 0; return NULL; } +#endif void __attribute__((weak)) __meminit vmemmap_populate_print_last(void) { } + /* * Allocate the accumulated non-linear sections, allocate a mem_map * for each and record the physical to section mapping. @@ -404,6 +485,14 @@ void __init sparse_init(void) unsigned long *usemap; unsigned long **usemap_map; int size; + int nodeid_begin = 0; + unsigned long pnum_begin = 0; + unsigned long usemap_count; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + unsigned long map_count; + int size2; + struct page **map_map; +#endif /* * map is using big page (aka 2M in x86 64 bit) @@ -422,10 +511,81 @@ void __init sparse_init(void) panic("can not allocate usemap_map\n"); for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; + } + usemap_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + usemap_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, pnum, + usemap_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + usemap_count = 1; + } + /* ok, last chunk */ + sparse_early_usemaps_alloc_node(usemap_map, pnum_begin, NR_MEM_SECTIONS, + usemap_count, nodeid_begin); + +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + size2 = sizeof(struct page *) * NR_MEM_SECTIONS; + map_map = alloc_bootmem(size2); + if (!map_map) + panic("can not allocate map_map\n"); + + for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + if (!present_section_nr(pnum)) continue; - usemap_map[pnum] = sparse_early_usemap_alloc(pnum); + ms = __nr_to_section(pnum); + nodeid_begin = sparse_early_nid(ms); + pnum_begin = pnum; + break; } + map_count = 1; + for (pnum = pnum_begin + 1; pnum < NR_MEM_SECTIONS; pnum++) { + struct mem_section *ms; + int nodeid; + + if (!present_section_nr(pnum)) + continue; + ms = __nr_to_section(pnum); + nodeid = sparse_early_nid(ms); + if (nodeid == nodeid_begin) { + map_count++; + continue; + } + /* ok, we need to take cake of from pnum_begin to pnum - 1*/ + sparse_early_mem_maps_alloc_node(map_map, pnum_begin, pnum, + map_count, nodeid_begin); + /* new start, update count etc*/ + nodeid_begin = nodeid; + pnum_begin = pnum; + map_count = 1; + } + /* ok, last chunk */ + sparse_early_mem_maps_alloc_node(map_map, pnum_begin, NR_MEM_SECTIONS, + map_count, nodeid_begin); +#endif for (pnum = 0; pnum < NR_MEM_SECTIONS; pnum++) { if (!present_section_nr(pnum)) @@ -435,7 +595,11 @@ void __init sparse_init(void) if (!usemap) continue; +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + map = map_map[pnum]; +#else map = sparse_early_mem_map_alloc(pnum); +#endif if (!map) continue; @@ -445,6 +609,9 @@ void __init sparse_init(void) vmemmap_populate_print_last(); +#ifdef CONFIG_SPARSEMEM_ALLOC_MEM_MAP_TOGETHER + free_bootmem(__pa(map_map), size2); +#endif free_bootmem(__pa(usemap_map), size); } @@ -30,6 +30,7 @@ #include <linux/notifier.h> #include <linux/backing-dev.h> #include <linux/memcontrol.h> +#include <linux/gfp.h> #include "internal.h" @@ -55,7 +56,7 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } - free_hot_page(page); + free_hot_cold_page(page, 0); } static void put_compound_page(struct page *page) @@ -118,7 +119,7 @@ static void pagevec_move_tail(struct pagevec *pvec) spin_lock(&zone->lru_lock); } if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_is_file_cache(page); + int lru = page_lru_base_type(page); list_move_tail(&page->lru, &zone->lru[lru].list); pgmoved++; } @@ -181,7 +182,7 @@ void activate_page(struct page *page) spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { int file = page_is_file_cache(page); - int lru = LRU_BASE + file; + int lru = page_lru_base_type(page); del_page_from_lru_list(zone, page, lru); SetPageActive(page); @@ -189,7 +190,7 @@ void activate_page(struct page *page) add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(zone, page, !!file, 1); + update_page_reclaim_stat(zone, page, file, 1); } spin_unlock_irq(&zone->lru_lock); } @@ -223,6 +224,7 @@ void __lru_cache_add(struct page *page, enum lru_list lru) ____pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } +EXPORT_SYMBOL(__lru_cache_add); /** * lru_cache_add_lru - add a page to a page list @@ -376,6 +378,7 @@ void release_pages(struct page **pages, int nr, int cold) pagevec_free(&pages_to_free); } +EXPORT_SYMBOL(release_pages); /* * The pages which we're about to release may be in the deferred lru-addition @@ -496,7 +499,7 @@ EXPORT_SYMBOL(pagevec_lookup_tag); */ void __init swap_setup(void) { - unsigned long megs = num_physpages >> (20 - PAGE_SHIFT); + unsigned long megs = totalram_pages >> (20 - PAGE_SHIFT); #ifdef CONFIG_SWAP bdi_init(swapper_space.backing_dev_info); diff --git a/mm/swap_state.c b/mm/swap_state.c index 42cd38e..e10f583 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -8,6 +8,7 @@ */ #include <linux/module.h> #include <linux/mm.h> +#include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/swapops.h> @@ -34,6 +35,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { + .name = "swap", .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; @@ -66,10 +68,10 @@ void show_swap_cache_info(void) } /* - * add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, + * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +static int __add_to_swap_cache(struct page *page, swp_entry_t entry) { int error; @@ -77,28 +79,43 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) VM_BUG_ON(PageSwapCache(page)); VM_BUG_ON(!PageSwapBacked(page)); + page_cache_get(page); + SetPageSwapCache(page); + set_page_private(page, entry.val); + + spin_lock_irq(&swapper_space.tree_lock); + error = radix_tree_insert(&swapper_space.page_tree, entry.val, page); + if (likely(!error)) { + total_swapcache_pages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + INC_CACHE_INFO(add_total); + } + spin_unlock_irq(&swapper_space.tree_lock); + + if (unlikely(error)) { + /* + * Only the context which have set SWAP_HAS_CACHE flag + * would call add_to_swap_cache(). + * So add_to_swap_cache() doesn't returns -EEXIST. + */ + VM_BUG_ON(error == -EEXIST); + set_page_private(page, 0UL); + ClearPageSwapCache(page); + page_cache_release(page); + } + + return error; +} + + +int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) +{ + int error; + error = radix_tree_preload(gfp_mask); if (!error) { - page_cache_get(page); - SetPageSwapCache(page); - set_page_private(page, entry.val); - - spin_lock_irq(&swapper_space.tree_lock); - error = radix_tree_insert(&swapper_space.page_tree, - entry.val, page); - if (likely(!error)) { - total_swapcache_pages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - INC_CACHE_INFO(add_total); - } - spin_unlock_irq(&swapper_space.tree_lock); + error = __add_to_swap_cache(page, entry); radix_tree_preload_end(); - - if (unlikely(error)) { - set_page_private(page, 0UL); - ClearPageSwapCache(page); - page_cache_release(page); - } } return error; } @@ -136,38 +153,34 @@ int add_to_swap(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageUptodate(page)); - for (;;) { - entry = get_swap_page(); - if (!entry.val) - return 0; + entry = get_swap_page(); + if (!entry.val) + return 0; + /* + * Radix-tree node allocations from PF_MEMALLOC contexts could + * completely exhaust the page allocator. __GFP_NOMEMALLOC + * stops emergency reserves from being allocated. + * + * TODO: this could cause a theoretical memory reclaim + * deadlock in the swap out path. + */ + /* + * Add it to the swap cache and mark it dirty + */ + err = add_to_swap_cache(page, entry, + __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); + + if (!err) { /* Success */ + SetPageDirty(page); + return 1; + } else { /* -ENOMEM radix-tree allocation failure */ /* - * Radix-tree node allocations from PF_MEMALLOC contexts could - * completely exhaust the page allocator. __GFP_NOMEMALLOC - * stops emergency reserves from being allocated. - * - * TODO: this could cause a theoretical memory reclaim - * deadlock in the swap out path. - */ - /* - * Add it to the swap cache and mark it dirty + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. */ - err = add_to_swap_cache(page, entry, - __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN); - - switch (err) { - case 0: /* Success */ - SetPageDirty(page); - return 1; - case -EEXIST: - /* Raced with "speculative" read_swap_cache_async */ - swapcache_free(entry, NULL); - continue; - default: - /* -ENOMEM radix-tree allocation failure */ - swapcache_free(entry, NULL); - return 0; - } + swapcache_free(entry, NULL); + return 0; } } @@ -289,26 +302,31 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, } /* + * call radix_tree_preload() while we can wait. + */ + err = radix_tree_preload(gfp_mask & GFP_KERNEL); + if (err) + break; + + /* * Swap entry may have been freed since our caller observed it. */ err = swapcache_prepare(entry); - if (err == -EEXIST) /* seems racy */ + if (err == -EEXIST) { /* seems racy */ + radix_tree_preload_end(); continue; - if (err) /* swp entry is obsolete ? */ + } + if (err) { /* swp entry is obsolete ? */ + radix_tree_preload_end(); break; + } - /* - * Associate the page with swap entry in the swap cache. - * May fail (-EEXIST) if there is already a page associated - * with this entry in the swap cache: added by a racing - * read_swap_cache_async, or add_to_swap or shmem_writepage - * re-using the just freed swap entry for an existing page. - * May fail (-ENOMEM) if radix-tree node allocation failed. - */ + /* May fail (-ENOMEM) if radix-tree node allocation failed. */ __set_page_locked(new_page); SetPageSwapBacked(new_page); - err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); + err = __add_to_swap_cache(new_page, entry); if (likely(!err)) { + radix_tree_preload_end(); /* * Initiate read into locked page and return. */ @@ -316,8 +334,13 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, swap_readpage(new_page); return new_page; } + radix_tree_preload_end(); ClearPageSwapBacked(new_page); __clear_page_locked(new_page); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ swapcache_free(entry, NULL); } while (err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index d1ade1a..67ddaaf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -22,6 +22,7 @@ #include <linux/seq_file.h> #include <linux/init.h> #include <linux/module.h> +#include <linux/ksm.h> #include <linux/rmap.h> #include <linux/security.h> #include <linux/backing-dev.h> @@ -29,17 +30,22 @@ #include <linux/capability.h> #include <linux/syscalls.h> #include <linux/memcontrol.h> +#include <linux/poll.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> #include <linux/swapops.h> #include <linux/page_cgroup.h> +static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); +static void free_swap_count_continuations(struct swap_info_struct *); +static sector_t map_swap_entry(swp_entry_t, struct block_device**); + static DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; -static int swap_overflow; static int least_priority; static const char Bad_file[] = "Bad swap file entry "; @@ -49,42 +55,24 @@ static const char Unused_offset[] = "Unused swap offset entry "; static struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct swap_info[MAX_SWAPFILES]; +static struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); -/* For reference count accounting in swap_map */ -/* enum for swap_map[] handling. internal use only */ -enum { - SWAP_MAP = 0, /* ops for reference from swap users */ - SWAP_CACHE, /* ops for reference from swap cache */ -}; - -static inline int swap_count(unsigned short ent) -{ - return ent & SWAP_COUNT_MASK; -} +static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); +/* Activity counter to indicate that a swapon or swapoff has occurred */ +static atomic_t proc_poll_event = ATOMIC_INIT(0); -static inline bool swap_has_cache(unsigned short ent) +static inline unsigned char swap_count(unsigned char ent) { - return !!(ent & SWAP_HAS_CACHE); -} - -static inline unsigned short encode_swapmap(int count, bool has_cache) -{ - unsigned short ret = count; - - if (has_cache) - return SWAP_HAS_CACHE | ret; - return ret; + return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ } -/* returnes 1 if swap entry is freed */ +/* returns 1 if swap entry is freed */ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset) { - int type = si - swap_info; - swp_entry_t entry = swp_entry(type, offset); + swp_entry_t entry = swp_entry(si->type, offset); struct page *page; int ret = 0; @@ -120,7 +108,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) down_read(&swap_unplug_sem); entry.val = page_private(page); if (PageSwapCache(page)) { - struct block_device *bdev = swap_info[swp_type(entry)].bdev; + struct block_device *bdev = swap_info[swp_type(entry)]->bdev; struct backing_dev_info *bdi; /* @@ -146,22 +134,28 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page) static int discard_swap(struct swap_info_struct *si) { struct swap_extent *se; + sector_t start_block; + sector_t nr_blocks; int err = 0; - list_for_each_entry(se, &si->extent_list, list) { - sector_t start_block = se->start_block << (PAGE_SHIFT - 9); - sector_t nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); + /* Do not discard the swap header page! */ + se = &si->first_swap_extent; + start_block = (se->start_block + 1) << (PAGE_SHIFT - 9); + nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9); + if (nr_blocks) { + err = blkdev_issue_discard(si->bdev, start_block, + nr_blocks, GFP_KERNEL, 0); + if (err) + return err; + cond_resched(); + } - if (se->start_page == 0) { - /* Do not discard the swap header page! */ - start_block += 1 << (PAGE_SHIFT - 9); - nr_blocks -= 1 << (PAGE_SHIFT - 9); - if (!nr_blocks) - continue; - } + list_for_each_entry(se, &si->first_swap_extent.list, list) { + start_block = se->start_block << (PAGE_SHIFT - 9); + nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9); err = blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_KERNEL); + nr_blocks, GFP_KERNEL, 0); if (err) break; @@ -200,13 +194,11 @@ static void discard_swap_cluster(struct swap_info_struct *si, start_block <<= PAGE_SHIFT - 9; nr_blocks <<= PAGE_SHIFT - 9; if (blkdev_issue_discard(si->bdev, start_block, - nr_blocks, GFP_NOIO)) + nr_blocks, GFP_NOIO, 0)) break; } lh = se->list.next; - if (lh == &si->extent_list) - lh = lh->next; se = list_entry(lh, struct swap_extent, list); } } @@ -221,7 +213,7 @@ static int wait_for_discard(void *word) #define LATENCY_LIMIT 256 static inline unsigned long scan_swap_map(struct swap_info_struct *si, - int cache) + unsigned char usage) { unsigned long offset; unsigned long scan_base; @@ -352,10 +344,7 @@ checks: si->lowest_bit = si->max; si->highest_bit = 0; } - if (cache == SWAP_CACHE) /* at usual swap-out via vmscan.c */ - si->swap_map[offset] = encode_swapmap(0, true); - else /* at suspend */ - si->swap_map[offset] = encode_swapmap(1, false); + si->swap_map[offset] = usage; si->cluster_next = offset + 1; si->flags -= SWP_SCANNING; @@ -465,10 +454,10 @@ swp_entry_t get_swap_page(void) nr_swap_pages--; for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - si = swap_info + type; + si = swap_info[type]; next = si->next; if (next < 0 || - (!wrapped && si->prio != swap_info[next].prio)) { + (!wrapped && si->prio != swap_info[next]->prio)) { next = swap_list.head; wrapped++; } @@ -480,7 +469,7 @@ swp_entry_t get_swap_page(void) swap_list.next = next; /* This is called for allocating swap entry for cache */ - offset = scan_swap_map(si, SWAP_CACHE); + offset = scan_swap_map(si, SWAP_HAS_CACHE); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -501,11 +490,11 @@ swp_entry_t get_swap_page_of_type(int type) pgoff_t offset; spin_lock(&swap_lock); - si = swap_info + type; - if (si->flags & SWP_WRITEOK) { + si = swap_info[type]; + if (si && (si->flags & SWP_WRITEOK)) { nr_swap_pages--; /* This is called for allocating swap entry, not cache */ - offset = scan_swap_map(si, SWAP_MAP); + offset = scan_swap_map(si, 1); if (offset) { spin_unlock(&swap_lock); return swp_entry(type, offset); @@ -516,9 +505,9 @@ swp_entry_t get_swap_page_of_type(int type) return (swp_entry_t) {0}; } -static struct swap_info_struct * swap_info_get(swp_entry_t entry) +static struct swap_info_struct *swap_info_get(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; unsigned long offset, type; if (!entry.val) @@ -526,7 +515,7 @@ static struct swap_info_struct * swap_info_get(swp_entry_t entry) type = swp_type(entry); if (type >= nr_swapfiles) goto bad_nofile; - p = & swap_info[type]; + p = swap_info[type]; if (!(p->flags & SWP_USED)) goto bad_device; offset = swp_offset(entry); @@ -552,41 +541,60 @@ out: return NULL; } -static int swap_entry_free(struct swap_info_struct *p, - swp_entry_t ent, int cache) +static unsigned char swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) { - unsigned long offset = swp_offset(ent); - int count = swap_count(p->swap_map[offset]); - bool has_cache; + unsigned long offset = swp_offset(entry); + unsigned char count; + unsigned char has_cache; - has_cache = swap_has_cache(p->swap_map[offset]); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; - if (cache == SWAP_MAP) { /* dropping usage count of swap */ - if (count < SWAP_MAP_MAX) { - count--; - p->swap_map[offset] = encode_swapmap(count, has_cache); - } - } else { /* dropping swap cache flag */ + if (usage == SWAP_HAS_CACHE) { VM_BUG_ON(!has_cache); - p->swap_map[offset] = encode_swapmap(count, false); - + has_cache = 0; + } else if (count == SWAP_MAP_SHMEM) { + /* + * Or we could insist on shmem.c using a special + * swap_shmem_free() and free_shmem_swap_and_cache()... + */ + count = 0; + } else if ((count & ~COUNT_CONTINUED) <= SWAP_MAP_MAX) { + if (count == COUNT_CONTINUED) { + if (swap_count_continued(p, offset, count)) + count = SWAP_MAP_MAX | COUNT_CONTINUED; + else + count = SWAP_MAP_MAX; + } else + count--; } - /* return code. */ - count = p->swap_map[offset]; + + if (!count) + mem_cgroup_uncharge_swap(entry); + + usage = count | has_cache; + p->swap_map[offset] = usage; + /* free if no reference */ - if (!count) { + if (!usage) { + struct gendisk *disk = p->bdev->bd_disk; if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - if (p->prio > swap_info[swap_list.next].prio) - swap_list.next = p - swap_info; + if (swap_list.next >= 0 && + p->prio > swap_info[swap_list.next]->prio) + swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; + if ((p->flags & SWP_BLKDEV) && + disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, offset); } - if (!swap_count(count)) - mem_cgroup_uncharge_swap(ent); - return count; + + return usage; } /* @@ -595,11 +603,11 @@ static int swap_entry_free(struct swap_info_struct *p, */ void swap_free(swp_entry_t entry) { - struct swap_info_struct * p; + struct swap_info_struct *p; p = swap_info_get(entry); if (p) { - swap_entry_free(p, entry, SWAP_MAP); + swap_entry_free(p, entry, 1); spin_unlock(&swap_lock); } } @@ -610,26 +618,21 @@ void swap_free(swp_entry_t entry) void swapcache_free(swp_entry_t entry, struct page *page) { struct swap_info_struct *p; - int ret; + unsigned char count; p = swap_info_get(entry); if (p) { - ret = swap_entry_free(p, entry, SWAP_CACHE); - if (page) { - bool swapout; - if (ret) - swapout = true; /* the end of swap out */ - else - swapout = false; /* no more swap users! */ - mem_cgroup_uncharge_swapcache(page, entry, swapout); - } + count = swap_entry_free(p, entry, SWAP_HAS_CACHE); + if (page) + mem_cgroup_uncharge_swapcache(page, entry, count != 0); spin_unlock(&swap_lock); } - return; } /* * How many references to page are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. */ static inline int page_swapcount(struct page *page) { @@ -657,6 +660,8 @@ int reuse_swap_page(struct page *page) int count; VM_BUG_ON(!PageLocked(page)); + if (unlikely(PageKsm(page))) + return 0; count = page_mapcount(page); if (count <= 1 && PageSwapCache(page)) { count += page_swapcount(page); @@ -665,7 +670,7 @@ int reuse_swap_page(struct page *page) SetPageDirty(page); } } - return count == 1; + return count <= 1; } /* @@ -683,6 +688,24 @@ int try_to_free_swap(struct page *page) if (page_swapcount(page)) return 0; + /* + * Once hibernation has begun to create its image of memory, + * there's a danger that one of the calls to try_to_free_swap() + * - most probably a call from __try_to_reclaim_swap() while + * hibernation is allocating its own swap pages for the image, + * but conceivably even a call from memory reclaim - will free + * the swap from a page which has already been recorded in the + * image as a clean swapcache page, and then reuse its swap for + * another page of the image. On waking from hibernation, the + * original page might be freed under memory pressure, then + * later read back in from swap, now with the wrong data. + * + * Hibernation clears bits from gfp_allowed_mask to prevent + * memory reclaim from writing to disk, so check that here. + */ + if (!(gfp_allowed_mask & __GFP_IO)) + return 0; + delete_from_swap_cache(page); SetPageDirty(page); return 1; @@ -697,12 +720,12 @@ int free_swap_and_cache(swp_entry_t entry) struct swap_info_struct *p; struct page *page = NULL; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return 1; p = swap_info_get(entry); if (p) { - if (swap_entry_free(p, entry, SWAP_MAP) == SWAP_HAS_CACHE) { + if (swap_entry_free(p, entry, 1) == SWAP_HAS_CACHE) { page = find_get_page(&swapper_space, entry.val); if (page && !trylock_page(page)) { page_cache_release(page); @@ -727,6 +750,37 @@ int free_swap_and_cache(swp_entry_t entry) return p != NULL; } +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +/** + * mem_cgroup_count_swap_user - count the user of a swap entry + * @ent: the swap entry to be checked + * @pagep: the pointer for the swap cache page of the entry to be stored + * + * Returns the number of the user of the swap entry. The number is valid only + * for swaps of anonymous pages. + * If the entry is found on swap cache, the page is stored to pagep with + * refcount of it being incremented. + */ +int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep) +{ + struct page *page; + struct swap_info_struct *p; + int count = 0; + + page = find_get_page(&swapper_space, ent.val); + if (page) + count += page_mapcount(page); + p = swap_info_get(ent); + if (p) { + count += swap_count(p->swap_map[swp_offset(ent)]); + spin_unlock(&swap_lock); + } + + *pagep = page; + return count; +} +#endif + #ifdef CONFIG_HIBERNATION /* * Find the swap type that corresponds to given device (if any). @@ -739,37 +793,35 @@ int free_swap_and_cache(swp_entry_t entry) int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) { struct block_device *bdev = NULL; - int i; + int type; if (device) bdev = bdget(device); spin_lock(&swap_lock); - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *sis = swap_info + i; + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *sis = swap_info[type]; if (!(sis->flags & SWP_WRITEOK)) continue; if (!bdev) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); - return i; + return type; } if (bdev == sis->bdev) { - struct swap_extent *se; + struct swap_extent *se = &sis->first_swap_extent; - se = list_entry(sis->extent_list.next, - struct swap_extent, list); if (se->start_block == offset) { if (bdev_p) - *bdev_p = bdget(sis->bdev->bd_dev); + *bdev_p = bdgrab(sis->bdev); spin_unlock(&swap_lock); bdput(bdev); - return i; + return type; } } } @@ -781,6 +833,21 @@ int swap_type_of(dev_t device, sector_t offset, struct block_device **bdev_p) } /* + * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev + * corresponding to given index in swap_info (swap type). + */ +sector_t swapdev_block(int type, pgoff_t offset) +{ + struct block_device *bdev; + + if ((unsigned int)type >= nr_swapfiles) + return 0; + if (!(swap_info[type]->flags & SWP_WRITEOK)) + return 0; + return map_swap_entry(swp_entry(type, offset), &bdev); +} + +/* * Return either the total number of swap pages of given type, or the number * of free pages of that type (depending on @free) * @@ -790,18 +857,20 @@ unsigned int count_swap_pages(int type, int free) { unsigned int n = 0; - if (type < nr_swapfiles) { - spin_lock(&swap_lock); - if (swap_info[type].flags & SWP_WRITEOK) { - n = swap_info[type].pages; + spin_lock(&swap_lock); + if ((unsigned int)type < nr_swapfiles) { + struct swap_info_struct *sis = swap_info[type]; + + if (sis->flags & SWP_WRITEOK) { + n = sis->pages; if (free) - n -= swap_info[type].inuse_pages; + n -= sis->inuse_pages; } - spin_unlock(&swap_lock); } + spin_unlock(&swap_lock); return n; } -#endif +#endif /* CONFIG_HIBERNATION */ /* * No need to decide whether this PTE shares the swap entry with others, @@ -829,7 +898,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, goto out; } - inc_mm_counter(vma->vm_mm, anon_rss); + dec_mm_counter(vma->vm_mm, MM_SWAPENTS); + inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); set_pte_at(vma->vm_mm, addr, pte, pte_mkold(mk_pte(page, vma->vm_page_prot))); @@ -930,7 +1000,7 @@ static int unuse_vma(struct vm_area_struct *vma, unsigned long addr, end, next; int ret; - if (page->mapping) { + if (page_anon_vma(page)) { addr = page_address_in_vma(page, vma); if (addr == -EFAULT) return 0; @@ -986,7 +1056,7 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, { unsigned int max = si->max; unsigned int i = prev; - int count; + unsigned char count; /* * No need for swap_lock here: we're just looking @@ -1022,16 +1092,14 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, */ static int try_to_unuse(unsigned int type) { - struct swap_info_struct * si = &swap_info[type]; + struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; - unsigned short *swap_map; - unsigned short swcount; + unsigned char *swap_map; + unsigned char swcount; struct page *page; swp_entry_t entry; unsigned int i = 0; int retval = 0; - int reset_overflow = 0; - int shmem; /* * When searching mms for an entry, a good strategy is to @@ -1045,8 +1113,7 @@ static int try_to_unuse(unsigned int type) * together, child after parent. If we race with dup_mmap(), we * prefer to resolve parent before child, lest we miss entries * duplicated after we scanned child: using last mm would invert - * that. Though it's only a serious concern when an overflowed - * swap count is reset from SWAP_MAP_MAX, preventing a rescan. + * that. */ start_mm = &init_mm; atomic_inc(&init_mm.mm_users); @@ -1108,17 +1175,18 @@ static int try_to_unuse(unsigned int type) /* * Remove all references to entry. - * Whenever we reach init_mm, there's no address space - * to search, but use it as a reminder to search shmem. */ - shmem = 0; swcount = *swap_map; - if (swap_count(swcount)) { - if (start_mm == &init_mm) - shmem = shmem_unuse(entry, page); - else - retval = unuse_mm(start_mm, entry, page); + if (swap_count(swcount) == SWAP_MAP_SHMEM) { + retval = shmem_unuse(entry, page); + /* page has already been unlocked and released */ + if (retval < 0) + break; + continue; } + if (swap_count(swcount) && start_mm != &init_mm) + retval = unuse_mm(start_mm, entry, page); + if (swap_count(*swap_map)) { int set_start_mm = (*swap_map >= swcount); struct list_head *p = &start_mm->mmlist; @@ -1129,7 +1197,7 @@ static int try_to_unuse(unsigned int type) atomic_inc(&new_start_mm->mm_users); atomic_inc(&prev_mm->mm_users); spin_lock(&mmlist_lock); - while (swap_count(*swap_map) && !retval && !shmem && + while (swap_count(*swap_map) && !retval && (p = p->next) != &start_mm->mmlist) { mm = list_entry(p, struct mm_struct, mmlist); if (!atomic_inc_not_zero(&mm->mm_users)) @@ -1143,14 +1211,12 @@ static int try_to_unuse(unsigned int type) swcount = *swap_map; if (!swap_count(swcount)) /* any usage ? */ ; - else if (mm == &init_mm) { + else if (mm == &init_mm) set_start_mm = 1; - shmem = shmem_unuse(entry, page); - } else + else retval = unuse_mm(mm, entry, page); - if (set_start_mm && - swap_count(*swap_map) < swcount) { + if (set_start_mm && *swap_map < swcount) { mmput(new_start_mm); atomic_inc(&mm->mm_users); new_start_mm = mm; @@ -1163,13 +1229,6 @@ static int try_to_unuse(unsigned int type) mmput(start_mm); start_mm = new_start_mm; } - if (shmem) { - /* page has already been unlocked and released */ - if (shmem > 0) - continue; - retval = shmem; - break; - } if (retval) { unlock_page(page); page_cache_release(page); @@ -1177,30 +1236,6 @@ static int try_to_unuse(unsigned int type) } /* - * How could swap count reach 0x7ffe ? - * There's no way to repeat a swap page within an mm - * (except in shmem, where it's the shared object which takes - * the reference count)? - * We believe SWAP_MAP_MAX cannot occur.(if occur, unsigned - * short is too small....) - * If that's wrong, then we should worry more about - * exit_mmap() and do_munmap() cases described above: - * we might be resetting SWAP_MAP_MAX too early here. - * We know "Undead"s can happen, they're okay, so don't - * report them; but do report if we reset SWAP_MAP_MAX. - */ - /* We might release the lock_page() in unuse_mm(). */ - if (!PageSwapCache(page) || page_private(page) != entry.val) - goto retry; - - if (swap_count(*swap_map) == SWAP_MAP_MAX) { - spin_lock(&swap_lock); - *swap_map = encode_swapmap(0, true); - spin_unlock(&swap_lock); - reset_overflow = 1; - } - - /* * If a reference remains (rare), we would like to leave * the page in the swap cache; but try_to_unmap could * then re-duplicate the entry once we drop page lock, @@ -1212,6 +1247,12 @@ static int try_to_unuse(unsigned int type) * read from disk into another page. Splitting into two * pages would be incorrect if swap supported "shared * private" pages, but they are handled by tmpfs files. + * + * Given how unuse_vma() targets one particular offset + * in an anon_vma, once the anon_vma has been determined, + * this splitting happens to be just what is needed to + * handle where KSM pages have been swapped out: re-reading + * is unnecessarily slow, but we can fix that later on. */ if (swap_count(*swap_map) && PageDirty(page) && PageSwapCache(page)) { @@ -1241,7 +1282,6 @@ static int try_to_unuse(unsigned int type) * mark page dirty so shrink_page_list will preserve it. */ SetPageDirty(page); -retry: unlock_page(page); page_cache_release(page); @@ -1253,10 +1293,6 @@ retry: } mmput(start_mm); - if (reset_overflow) { - printk(KERN_WARNING "swapoff: cleared swap entry overflow\n"); - swap_overflow = 0; - } return retval; } @@ -1269,10 +1305,10 @@ retry: static void drain_mmlist(void) { struct list_head *p, *next; - unsigned int i; + unsigned int type; - for (i = 0; i < nr_swapfiles; i++) - if (swap_info[i].inuse_pages) + for (type = 0; type < nr_swapfiles; type++) + if (swap_info[type]->inuse_pages) return; spin_lock(&mmlist_lock); list_for_each_safe(p, next, &init_mm.mmlist) @@ -1282,12 +1318,23 @@ static void drain_mmlist(void) /* * Use this swapdev's extent info to locate the (PAGE_SIZE) block which - * corresponds to page offset `offset'. + * corresponds to page offset for the specified swap entry. + * Note that the type of this function is sector_t, but it returns page offset + * into the bdev, not sector offset. */ -sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) +static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) { - struct swap_extent *se = sis->curr_swap_extent; - struct swap_extent *start_se = se; + struct swap_info_struct *sis; + struct swap_extent *start_se; + struct swap_extent *se; + pgoff_t offset; + + sis = swap_info[swp_type(entry)]; + *bdev = sis->bdev; + + offset = swp_offset(entry); + start_se = sis->curr_swap_extent; + se = start_se; for ( ; ; ) { struct list_head *lh; @@ -1297,40 +1344,31 @@ sector_t map_swap_page(struct swap_info_struct *sis, pgoff_t offset) return se->start_block + (offset - se->start_page); } lh = se->list.next; - if (lh == &sis->extent_list) - lh = lh->next; se = list_entry(lh, struct swap_extent, list); sis->curr_swap_extent = se; BUG_ON(se == start_se); /* It *must* be present */ } } -#ifdef CONFIG_HIBERNATION /* - * Get the (PAGE_SIZE) block corresponding to given offset on the swapdev - * corresponding to given index in swap_info (swap type). + * Returns the page offset into bdev for the specified page's swap entry. */ -sector_t swapdev_block(int swap_type, pgoff_t offset) +sector_t map_swap_page(struct page *page, struct block_device **bdev) { - struct swap_info_struct *sis; - - if (swap_type >= nr_swapfiles) - return 0; - - sis = swap_info + swap_type; - return (sis->flags & SWP_WRITEOK) ? map_swap_page(sis, offset) : 0; + swp_entry_t entry; + entry.val = page_private(page); + return map_swap_entry(entry, bdev); } -#endif /* CONFIG_HIBERNATION */ /* * Free all of a swapdev's extent information */ static void destroy_swap_extents(struct swap_info_struct *sis) { - while (!list_empty(&sis->extent_list)) { + while (!list_empty(&sis->first_swap_extent.list)) { struct swap_extent *se; - se = list_entry(sis->extent_list.next, + se = list_entry(sis->first_swap_extent.list.next, struct swap_extent, list); list_del(&se->list); kfree(se); @@ -1351,8 +1389,15 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, struct swap_extent *new_se; struct list_head *lh; - lh = sis->extent_list.prev; /* The highest page extent */ - if (lh != &sis->extent_list) { + if (start_page == 0) { + se = &sis->first_swap_extent; + sis->curr_swap_extent = se; + se->start_page = 0; + se->nr_pages = nr_pages; + se->start_block = start_block; + return 1; + } else { + lh = sis->first_swap_extent.list.prev; /* Highest extent */ se = list_entry(lh, struct swap_extent, list); BUG_ON(se->start_page + se->nr_pages != start_page); if (se->start_block + se->nr_pages == start_block) { @@ -1372,7 +1417,7 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, new_se->nr_pages = nr_pages; new_se->start_block = start_block; - list_add_tail(&new_se->list, &sis->extent_list); + list_add_tail(&new_se->list, &sis->first_swap_extent.list); return 1; } @@ -1424,7 +1469,7 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; - goto done; + goto out; } blkbits = inode->i_blkbits; @@ -1495,25 +1540,22 @@ reprobe: sis->max = page_no; sis->pages = page_no - 1; sis->highest_bit = page_no - 1; -done: - sis->curr_swap_extent = list_entry(sis->extent_list.prev, - struct swap_extent, list); - goto out; +out: + return ret; bad_bmap: printk(KERN_ERR "swapon: swapfile has holes\n"); ret = -EINVAL; -out: - return ret; + goto out; } SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { - struct swap_info_struct * p = NULL; - unsigned short *swap_map; + struct swap_info_struct *p = NULL; + unsigned char *swap_map; struct file *swap_file, *victim; struct address_space *mapping; struct inode *inode; - char * pathname; + char *pathname; int i, type, prev; int err; @@ -1534,8 +1576,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mapping = victim->f_mapping; prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type].next) { - p = swap_info + type; + for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { + p = swap_info[type]; if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) break; @@ -1554,18 +1596,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) { + if (prev < 0) swap_list.head = p->next; - } else { - swap_info[prev].next = p->next; - } + else + swap_info[prev]->next = p->next; if (type == swap_list.next) { /* just pick something that's safe... */ swap_list.next = swap_list.head; } if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i].next) - swap_info[i].prio = p->prio--; + for (i = p->next; i >= 0; i = swap_info[i]->next) + swap_info[i]->prio = p->prio--; least_priority++; } nr_swap_pages -= p->pages; @@ -1573,9 +1614,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->flags &= ~SWP_WRITEOK; spin_unlock(&swap_lock); - current->flags |= PF_SWAPOFF; + current->flags |= PF_OOM_ORIGIN; err = try_to_unuse(type); - current->flags &= ~PF_SWAPOFF; + current->flags &= ~PF_OOM_ORIGIN; if (err) { /* re-insert swap space back into swap_list */ @@ -1583,16 +1624,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->prio < 0) p->prio = --least_priority; prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) break; prev = i; } p->next = i; if (prev < 0) - swap_list.head = swap_list.next = p - swap_info; + swap_list.head = swap_list.next = type; else - swap_info[prev].next = p - swap_info; + swap_info[prev]->next = type; nr_swap_pages += p->pages; total_swap_pages += p->pages; p->flags |= SWP_WRITEOK; @@ -1605,6 +1646,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) up_write(&swap_unplug_sem); destroy_swap_extents(p); + if (p->flags & SWP_CONTINUED) + free_swap_count_continuations(p); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); drain_mmlist(); @@ -1641,6 +1685,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) } filp_close(swap_file, NULL); err = 0; + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); out_dput: filp_close(victim, NULL); @@ -1649,11 +1695,30 @@ out: } #ifdef CONFIG_PROC_FS +struct proc_swaps { + struct seq_file seq; + int event; +}; + +static unsigned swaps_poll(struct file *file, poll_table *wait) +{ + struct proc_swaps *s = file->private_data; + + poll_wait(file, &proc_poll_wait, wait); + + if (s->event != atomic_read(&proc_poll_event)) { + s->event = atomic_read(&proc_poll_event); + return POLLIN | POLLRDNORM | POLLERR | POLLPRI; + } + + return POLLIN | POLLRDNORM; +} + /* iterator */ static void *swap_start(struct seq_file *swap, loff_t *pos) { - struct swap_info_struct *ptr = swap_info; - int i; + struct swap_info_struct *si; + int type; loff_t l = *pos; mutex_lock(&swapon_mutex); @@ -1661,11 +1726,13 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) if (!l) return SEQ_START_TOKEN; - for (i = 0; i < nr_swapfiles; i++, ptr++) { - if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + for (type = 0; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) continue; if (!--l) - return ptr; + return si; } return NULL; @@ -1673,21 +1740,21 @@ static void *swap_start(struct seq_file *swap, loff_t *pos) static void *swap_next(struct seq_file *swap, void *v, loff_t *pos) { - struct swap_info_struct *ptr; - struct swap_info_struct *endptr = swap_info + nr_swapfiles; + struct swap_info_struct *si = v; + int type; if (v == SEQ_START_TOKEN) - ptr = swap_info; - else { - ptr = v; - ptr++; - } + type = 0; + else + type = si->type + 1; - for (; ptr < endptr; ptr++) { - if (!(ptr->flags & SWP_USED) || !ptr->swap_map) + for (; type < nr_swapfiles; type++) { + smp_rmb(); /* read nr_swapfiles before swap_info[type] */ + si = swap_info[type]; + if (!(si->flags & SWP_USED) || !si->swap_map) continue; ++*pos; - return ptr; + return si; } return NULL; @@ -1700,24 +1767,24 @@ static void swap_stop(struct seq_file *swap, void *v) static int swap_show(struct seq_file *swap, void *v) { - struct swap_info_struct *ptr = v; + struct swap_info_struct *si = v; struct file *file; int len; - if (ptr == SEQ_START_TOKEN) { + if (si == SEQ_START_TOKEN) { seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n"); return 0; } - file = ptr->swap_file; + file = si->swap_file; len = seq_path(swap, &file->f_path, " \t\n\\"); seq_printf(swap, "%*s%s\t%u\t%u\t%d\n", len < 40 ? 40 - len : 1, " ", S_ISBLK(file->f_path.dentry->d_inode->i_mode) ? "partition" : "file\t", - ptr->pages << (PAGE_SHIFT - 10), - ptr->inuse_pages << (PAGE_SHIFT - 10), - ptr->prio); + si->pages << (PAGE_SHIFT - 10), + si->inuse_pages << (PAGE_SHIFT - 10), + si->prio); return 0; } @@ -1730,7 +1797,24 @@ static const struct seq_operations swaps_op = { static int swaps_open(struct inode *inode, struct file *file) { - return seq_open(file, &swaps_op); + struct proc_swaps *s; + int ret; + + s = kmalloc(sizeof(struct proc_swaps), GFP_KERNEL); + if (!s) + return -ENOMEM; + + file->private_data = s; + + ret = seq_open(file, &swaps_op); + if (ret) { + kfree(s); + return ret; + } + + s->seq.private = s; + s->event = atomic_read(&proc_poll_event); + return ret; } static const struct file_operations proc_swaps_operations = { @@ -1738,6 +1822,7 @@ static const struct file_operations proc_swaps_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .poll = swaps_poll, }; static int __init procswaps_init(void) @@ -1764,7 +1849,7 @@ late_initcall(max_swapfiles_check); */ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { - struct swap_info_struct * p; + struct swap_info_struct *p; char *name = NULL; struct block_device *bdev = NULL; struct file *swap_file = NULL; @@ -1772,36 +1857,58 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) unsigned int type; int i, prev; int error; - union swap_header *swap_header = NULL; - unsigned int nr_good_pages = 0; + union swap_header *swap_header; + unsigned int nr_good_pages; int nr_extents = 0; sector_t span; - unsigned long maxpages = 1; + unsigned long maxpages; unsigned long swapfilepages; - unsigned short *swap_map = NULL; + unsigned char *swap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; int did_down = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return -ENOMEM; + spin_lock(&swap_lock); - p = swap_info; - for (type = 0 ; type < nr_swapfiles ; type++,p++) - if (!(p->flags & SWP_USED)) + for (type = 0; type < nr_swapfiles; type++) { + if (!(swap_info[type]->flags & SWP_USED)) break; + } error = -EPERM; if (type >= MAX_SWAPFILES) { spin_unlock(&swap_lock); + kfree(p); goto out; } - if (type >= nr_swapfiles) - nr_swapfiles = type+1; - memset(p, 0, sizeof(*p)); - INIT_LIST_HEAD(&p->extent_list); + if (type >= nr_swapfiles) { + p->type = type; + swap_info[type] = p; + /* + * Write swap_info[type] before nr_swapfiles, in case a + * racing procfs swap_start() or swap_next() is reading them. + * (We never shrink nr_swapfiles, we never free this entry.) + */ + smp_wmb(); + nr_swapfiles++; + } else { + kfree(p); + p = swap_info[type]; + /* + * Do not memset this entry: a racing procfs swap_next() + * would be relying on p->type to remain valid. + */ + } + INIT_LIST_HEAD(&p->first_swap_extent.list); p->flags = SWP_USED; p->next = -1; spin_unlock(&swap_lock); + name = getname(specialfile); error = PTR_ERR(name); if (IS_ERR(name)) { @@ -1821,7 +1928,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -EBUSY; for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = &swap_info[i]; + struct swap_info_struct *q = swap_info[i]; if (i == type || !q->swap_file) continue; @@ -1843,6 +1950,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error < 0) goto bad_swap; p->bdev = bdev; + p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; mutex_lock(&inode->i_mutex); @@ -1896,6 +2004,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->lowest_bit = 1; p->cluster_next = 1; + p->cluster_nr = 0; /* * Find out how many pages are allowed for a single swap @@ -1912,9 +2021,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) * swap pte. */ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))) - 1; - if (maxpages > swap_header->info.last_page) - maxpages = swap_header->info.last_page; + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; + if (maxpages > swap_header->info.last_page) { + maxpages = swap_header->info.last_page + 1; + /* p->max is an unsigned int: don't overflow it */ + if ((unsigned int)maxpages == 0) + maxpages = UINT_MAX; + } p->highest_bit = maxpages - 1; error = -EINVAL; @@ -1931,30 +2044,31 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; /* OK, set up the swap map and apply the bad block list */ - swap_map = vmalloc(maxpages * sizeof(short)); + swap_map = vmalloc(maxpages); if (!swap_map) { error = -ENOMEM; goto bad_swap; } - memset(swap_map, 0, maxpages * sizeof(short)); + memset(swap_map, 0, maxpages); + nr_good_pages = maxpages - 1; /* omit header page */ + for (i = 0; i < swap_header->info.nr_badpages; i++) { - int page_nr = swap_header->info.badpages[i]; - if (page_nr <= 0 || page_nr >= swap_header->info.last_page) { + unsigned int page_nr = swap_header->info.badpages[i]; + if (page_nr == 0 || page_nr > swap_header->info.last_page) { error = -EINVAL; goto bad_swap; } - swap_map[page_nr] = SWAP_MAP_BAD; + if (page_nr < maxpages) { + swap_map[page_nr] = SWAP_MAP_BAD; + nr_good_pages--; + } } error = swap_cgroup_swapon(type, maxpages); if (error) goto bad_swap; - nr_good_pages = swap_header->info.last_page - - swap_header->info.nr_badpages - - 1 /* header page */; - if (nr_good_pages) { swap_map[0] = SWAP_MAP_BAD; p->max = maxpages; @@ -1972,12 +2086,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + if (p->bdev) { + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + p->cluster_next = 1 + (random32() % p->highest_bit); + } + if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD)) + p->flags |= SWP_DISCARDABLE; } - if (discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; mutex_lock(&swapon_mutex); spin_lock(&swap_lock); @@ -2000,20 +2116,21 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) /* insert swap space into swap_list: */ prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i].next) { - if (p->prio >= swap_info[i].prio) { + for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { + if (p->prio >= swap_info[i]->prio) break; - } prev = i; } p->next = i; - if (prev < 0) { - swap_list.head = swap_list.next = p - swap_info; - } else { - swap_info[prev].next = p - swap_info; - } + if (prev < 0) + swap_list.head = swap_list.next = type; + else + swap_info[prev]->next = type; spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); + atomic_inc(&proc_poll_event); + wake_up_interruptible(&proc_poll_wait); + error = 0; goto out; bad_swap: @@ -2048,15 +2165,15 @@ out: void si_swapinfo(struct sysinfo *val) { - unsigned int i; + unsigned int type; unsigned long nr_to_be_unused = 0; spin_lock(&swap_lock); - for (i = 0; i < nr_swapfiles; i++) { - if (!(swap_info[i].flags & SWP_USED) || - (swap_info[i].flags & SWP_WRITEOK)) - continue; - nr_to_be_unused += swap_info[i].inuse_pages; + for (type = 0; type < nr_swapfiles; type++) { + struct swap_info_struct *si = swap_info[type]; + + if ((si->flags & SWP_USED) && !(si->flags & SWP_WRITEOK)) + nr_to_be_unused += si->inuse_pages; } val->freeswap = nr_swap_pages + nr_to_be_unused; val->totalswap = total_swap_pages + nr_to_be_unused; @@ -2066,101 +2183,111 @@ void si_swapinfo(struct sysinfo *val) /* * Verify that a swap entry is valid and increment its swap map count. * - * Note: if swap_map[] reaches SWAP_MAP_MAX the entries are treated as - * "permanent", but will be reclaimed by the next swapoff. * Returns error code in following case. * - success -> 0 * - swp_entry is invalid -> EINVAL * - swp_entry is migration entry -> EINVAL * - swap-cache reference is requested but there is already one. -> EEXIST * - swap-cache reference is requested but the entry is not used. -> ENOENT + * - swap-mapped reference requested but needs continued swap count. -> ENOMEM */ -static int __swap_duplicate(swp_entry_t entry, bool cache) +static int __swap_duplicate(swp_entry_t entry, unsigned char usage) { - struct swap_info_struct * p; + struct swap_info_struct *p; unsigned long offset, type; - int result = -EINVAL; - int count; - bool has_cache; + unsigned char count; + unsigned char has_cache; + int err = -EINVAL; - if (is_migration_entry(entry)) - return -EINVAL; + if (non_swap_entry(entry)) + goto out; type = swp_type(entry); if (type >= nr_swapfiles) goto bad_file; - p = type + swap_info; + p = swap_info[type]; offset = swp_offset(entry); spin_lock(&swap_lock); - if (unlikely(offset >= p->max)) goto unlock_out; - count = swap_count(p->swap_map[offset]); - has_cache = swap_has_cache(p->swap_map[offset]); + count = p->swap_map[offset]; + has_cache = count & SWAP_HAS_CACHE; + count &= ~SWAP_HAS_CACHE; + err = 0; - if (cache == SWAP_CACHE) { /* called for swapcache/swapin-readahead */ + if (usage == SWAP_HAS_CACHE) { /* set SWAP_HAS_CACHE if there is no cache and entry is used */ - if (!has_cache && count) { - p->swap_map[offset] = encode_swapmap(count, true); - result = 0; - } else if (has_cache) /* someone added cache */ - result = -EEXIST; - else if (!count) /* no users */ - result = -ENOENT; + if (!has_cache && count) + has_cache = SWAP_HAS_CACHE; + else if (has_cache) /* someone else added cache */ + err = -EEXIST; + else /* no users remaining */ + err = -ENOENT; } else if (count || has_cache) { - if (count < SWAP_MAP_MAX - 1) { - p->swap_map[offset] = encode_swapmap(count + 1, - has_cache); - result = 0; - } else if (count <= SWAP_MAP_MAX) { - if (swap_overflow++ < 5) - printk(KERN_WARNING - "swap_dup: swap entry overflow\n"); - p->swap_map[offset] = encode_swapmap(SWAP_MAP_MAX, - has_cache); - result = 0; - } + + if ((count & ~COUNT_CONTINUED) < SWAP_MAP_MAX) + count += usage; + else if ((count & ~COUNT_CONTINUED) > SWAP_MAP_MAX) + err = -EINVAL; + else if (swap_count_continued(p, offset, count)) + count = COUNT_CONTINUED; + else + err = -ENOMEM; } else - result = -ENOENT; /* unused swap entry */ + err = -ENOENT; /* unused swap entry */ + + p->swap_map[offset] = count | has_cache; + unlock_out: spin_unlock(&swap_lock); out: - return result; + return err; bad_file: printk(KERN_ERR "swap_dup: %s%08lx\n", Bad_file, entry.val); goto out; } + /* - * increase reference count of swap entry by 1. + * Help swapoff by noting that swap entry belongs to shmem/tmpfs + * (in which case its reference count is never incremented). */ -void swap_duplicate(swp_entry_t entry) +void swap_shmem_alloc(swp_entry_t entry) { - __swap_duplicate(entry, SWAP_MAP); + __swap_duplicate(entry, SWAP_MAP_SHMEM); +} + +/* + * Increase reference count of swap entry by 1. + * Returns 0 for success, or -ENOMEM if a swap_count_continuation is required + * but could not be atomically allocated. Returns 0, just as if it succeeded, + * if __swap_duplicate() fails for another reason (-EINVAL or -ENOENT), which + * might occur if a page table entry has got corrupted. + */ +int swap_duplicate(swp_entry_t entry) +{ + int err = 0; + + while (!err && __swap_duplicate(entry, 1) == -ENOMEM) + err = add_swap_count_continuation(entry, GFP_ATOMIC); + return err; } /* * @entry: swap entry for which we allocate swap cache. * - * Called when allocating swap cache for exising swap entry, + * Called when allocating swap cache for existing swap entry, * This can return error codes. Returns 0 at success. * -EBUSY means there is a swap cache. * Note: return code is different from swap_duplicate(). */ int swapcache_prepare(swp_entry_t entry) { - return __swap_duplicate(entry, SWAP_CACHE); -} - - -struct swap_info_struct * -get_swap_info_struct(unsigned type) -{ - return &swap_info[type]; + return __swap_duplicate(entry, SWAP_HAS_CACHE); } /* @@ -2178,7 +2305,7 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) if (!our_page_cluster) /* no readahead */ return 0; - si = &swap_info[swp_type(entry)]; + si = swap_info[swp_type(entry)]; target = swp_offset(entry); base = (target >> our_page_cluster) << our_page_cluster; end = base + (1 << our_page_cluster); @@ -2214,3 +2341,219 @@ int valid_swaphandles(swp_entry_t entry, unsigned long *offset) *offset = ++toff; return nr_pages? ++nr_pages: 0; } + +/* + * add_swap_count_continuation - called when a swap count is duplicated + * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's + * page of the original vmalloc'ed swap_map, to hold the continuation count + * (for that entry and for its neighbouring PAGE_SIZE swap entries). Called + * again when count is duplicated beyond SWAP_MAP_MAX * SWAP_CONT_MAX, etc. + * + * These continuation pages are seldom referenced: the common paths all work + * on the original swap_map, only referring to a continuation page when the + * low "digit" of a count is incremented or decremented through SWAP_MAP_MAX. + * + * add_swap_count_continuation(, GFP_ATOMIC) can be called while holding + * page table locks; if it fails, add_swap_count_continuation(, GFP_KERNEL) + * can be called after dropping locks. + */ +int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) +{ + struct swap_info_struct *si; + struct page *head; + struct page *page; + struct page *list_page; + pgoff_t offset; + unsigned char count; + + /* + * When debugging, it's easier to use __GFP_ZERO here; but it's better + * for latency not to zero a page while GFP_ATOMIC and holding locks. + */ + page = alloc_page(gfp_mask | __GFP_HIGHMEM); + + si = swap_info_get(entry); + if (!si) { + /* + * An acceptable race has occurred since the failing + * __swap_duplicate(): the swap entry has been freed, + * perhaps even the whole swap_map cleared for swapoff. + */ + goto outer; + } + + offset = swp_offset(entry); + count = si->swap_map[offset] & ~SWAP_HAS_CACHE; + + if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) { + /* + * The higher the swap count, the more likely it is that tasks + * will race to add swap count continuation: we need to avoid + * over-provisioning. + */ + goto out; + } + + if (!page) { + spin_unlock(&swap_lock); + return -ENOMEM; + } + + /* + * We are fortunate that although vmalloc_to_page uses pte_offset_map, + * no architecture is using highmem pages for kernel pagetables: so it + * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. + */ + head = vmalloc_to_page(si->swap_map + offset); + offset &= ~PAGE_MASK; + + /* + * Page allocation does not initialize the page's lru field, + * but it does always reset its private field. + */ + if (!page_private(head)) { + BUG_ON(count & COUNT_CONTINUED); + INIT_LIST_HEAD(&head->lru); + set_page_private(head, SWP_CONTINUED); + si->flags |= SWP_CONTINUED; + } + + list_for_each_entry(list_page, &head->lru, lru) { + unsigned char *map; + + /* + * If the previous map said no continuation, but we've found + * a continuation page, free our allocation and use this one. + */ + if (!(count & COUNT_CONTINUED)) + goto out; + + map = kmap_atomic(list_page, KM_USER0) + offset; + count = *map; + kunmap_atomic(map, KM_USER0); + + /* + * If this continuation count now has some space in it, + * free our allocation and use this one. + */ + if ((count & ~COUNT_CONTINUED) != SWAP_CONT_MAX) + goto out; + } + + list_add_tail(&page->lru, &head->lru); + page = NULL; /* now it's attached, don't free it */ +out: + spin_unlock(&swap_lock); +outer: + if (page) + __free_page(page); + return 0; +} + +/* + * swap_count_continued - when the original swap_map count is incremented + * from SWAP_MAP_MAX, check if there is already a continuation page to carry + * into, carry if so, or else fail until a new continuation page is allocated; + * when the original swap_map count is decremented from 0 with continuation, + * borrow from the continuation and report whether it still holds more. + * Called while __swap_duplicate() or swap_entry_free() holds swap_lock. + */ +static bool swap_count_continued(struct swap_info_struct *si, + pgoff_t offset, unsigned char count) +{ + struct page *head; + struct page *page; + unsigned char *map; + + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head) != SWP_CONTINUED) { + BUG_ON(count & COUNT_CONTINUED); + return false; /* need to add count continuation */ + } + + offset &= ~PAGE_MASK; + page = list_entry(head->lru.next, struct page, lru); + map = kmap_atomic(page, KM_USER0) + offset; + + if (count == SWAP_MAP_MAX) /* initial increment from swap_map */ + goto init_map; /* jump over SWAP_CONT_MAX checks */ + + if (count == (SWAP_MAP_MAX | COUNT_CONTINUED)) { /* incrementing */ + /* + * Think of how you add 1 to 999 + */ + while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page, KM_USER0) + offset; + } + if (*map == SWAP_CONT_MAX) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + if (page == head) + return false; /* add count continuation */ + map = kmap_atomic(page, KM_USER0) + offset; +init_map: *map = 0; /* we didn't zero the page */ + } + *map += 1; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page, KM_USER0) + offset; + *map = COUNT_CONTINUED; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + } + return true; /* incremented */ + + } else { /* decrementing */ + /* + * Think of how you subtract 1 from 1000 + */ + BUG_ON(count != COUNT_CONTINUED); + while (*map == COUNT_CONTINUED) { + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.next, struct page, lru); + BUG_ON(page == head); + map = kmap_atomic(page, KM_USER0) + offset; + } + BUG_ON(*map == 0); + *map -= 1; + if (*map == 0) + count = 0; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + while (page != head) { + map = kmap_atomic(page, KM_USER0) + offset; + *map = SWAP_CONT_MAX | count; + count = COUNT_CONTINUED; + kunmap_atomic(map, KM_USER0); + page = list_entry(page->lru.prev, struct page, lru); + } + return count == COUNT_CONTINUED; + } +} + +/* + * free_swap_count_continuations - swapoff free all the continuation pages + * appended to the swap_map, after swap_map is quiesced, before vfree'ing it. + */ +static void free_swap_count_continuations(struct swap_info_struct *si) +{ + pgoff_t offset; + + for (offset = 0; offset < si->max; offset += PAGE_SIZE) { + struct page *head; + head = vmalloc_to_page(si->swap_map + offset); + if (page_private(head)) { + struct list_head *this, *next; + list_for_each_safe(this, next, &head->lru) { + struct page *page; + page = list_entry(this, struct page, lru); + list_del(this); + __free_page(page); + } + } + } +} diff --git a/mm/thrash.c b/mm/thrash.c index c4c5205..2372d4e 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -26,47 +26,45 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; static unsigned int global_faults; -void grab_swap_token(void) +void grab_swap_token(struct mm_struct *mm) { int current_interval; global_faults++; - current_interval = global_faults - current->mm->faultstamp; + current_interval = global_faults - mm->faultstamp; if (!spin_trylock(&swap_token_lock)) return; /* First come first served */ if (swap_token_mm == NULL) { - current->mm->token_priority = current->mm->token_priority + 2; - swap_token_mm = current->mm; + mm->token_priority = mm->token_priority + 2; + swap_token_mm = mm; goto out; } - if (current->mm != swap_token_mm) { - if (current_interval < current->mm->last_interval) - current->mm->token_priority++; + if (mm != swap_token_mm) { + if (current_interval < mm->last_interval) + mm->token_priority++; else { - if (likely(current->mm->token_priority > 0)) - current->mm->token_priority--; + if (likely(mm->token_priority > 0)) + mm->token_priority--; } /* Check if we deserve the token */ - if (current->mm->token_priority > - swap_token_mm->token_priority) { - current->mm->token_priority += 2; - swap_token_mm = current->mm; + if (mm->token_priority > swap_token_mm->token_priority) { + mm->token_priority += 2; + swap_token_mm = mm; } } else { /* Token holder came in again! */ - current->mm->token_priority += 2; + mm->token_priority += 2; } out: - current->mm->faultstamp = global_faults; - current->mm->last_interval = current_interval; + mm->faultstamp = global_faults; + mm->last_interval = current_interval; spin_unlock(&swap_token_lock); -return; } /* Called on process exit. */ diff --git a/mm/truncate.c b/mm/truncate.c index ccc3ecf..3c2d5dd 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/backing-dev.h> +#include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> #include <linux/module.h> @@ -93,11 +94,11 @@ EXPORT_SYMBOL(cancel_dirty_page); * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void +static int truncate_complete_page(struct address_space *mapping, struct page *page) { if (page->mapping != mapping) - return; + return -EIO; if (page_has_private(page)) do_invalidatepage(page, 0); @@ -108,6 +109,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ + return 0; } /* @@ -135,6 +137,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return ret; } +int truncate_inode_page(struct address_space *mapping, struct page *page) +{ + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + return truncate_complete_page(mapping, page); +} + +/* + * Used to get rid of pages on hardware memory corruption. + */ +int generic_error_remove_page(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return -EINVAL; + /* + * Only punch for normal data pages for now. + * Handling other types like directories would need more auditing. + */ + if (!S_ISREG(mapping->host->i_mode)) + return -EIO; + return truncate_inode_page(mapping, page); +} +EXPORT_SYMBOL(generic_error_remove_page); + +/* + * Safely invalidate one page from its pagecache mapping. + * It only drops clean, unused pages. The page must be locked. + * + * Returns 1 if the page is successfully invalidated, otherwise 0. + */ +int invalidate_inode_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + if (!mapping) + return 0; + if (PageDirty(page) || PageWriteback(page)) + return 0; + if (page_mapped(page)) + return 0; + return invalidate_complete_page(mapping, page); +} + /** * truncate_inode_pages - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate @@ -196,12 +243,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } - truncate_complete_page(mapping, page); + truncate_inode_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -231,6 +273,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); break; } + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; @@ -238,18 +281,14 @@ void truncate_inode_pages_range(struct address_space *mapping, break; lock_page(page); wait_on_page_writeback(page); - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } + truncate_inode_page(mapping, page); if (page->index > next) next = page->index; next++; - truncate_complete_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); } } EXPORT_SYMBOL(truncate_inode_pages_range); @@ -291,6 +330,7 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, pagevec_init(&pvec, 0); while (next <= end && pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t index; @@ -311,17 +351,14 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (lock_failed) continue; - if (PageDirty(page) || PageWriteback(page)) - goto unlock; - if (page_mapped(page)) - goto unlock; - ret += invalidate_complete_page(mapping, page); -unlock: + ret += invalidate_inode_page(page); + unlock_page(page); if (next > end) break; } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } return ret; @@ -353,6 +390,10 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (mapping->a_ops->freepage) + mapping->a_ops->freepage(page); + page_cache_release(page); /* pagecache ref */ return 1; failed: @@ -396,6 +437,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, while (next <= end && !wrapped && pagevec_lookup(&pvec, mapping, next, min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; pgoff_t page_index; @@ -445,6 +487,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, unlock_page(page); } pagevec_release(&pvec); + mem_cgroup_uncharge_end(); cond_resched(); } return ret; @@ -458,10 +501,92 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2_range); * Any pages which are found to be mapped into pagetables are unmapped prior to * invalidation. * - * Returns -EIO if any pages could not be invalidated. + * Returns -EBUSY if any pages could not be invalidated. */ int invalidate_inode_pages2(struct address_space *mapping) { return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); + +/** + * truncate_pagecache - unmap and remove pagecache that has been truncated + * @inode: inode + * @old: old file offset + * @new: new file offset + * + * inode's new i_size must already be written before truncate_pagecache + * is called. + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +{ + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); +} +EXPORT_SYMBOL(truncate_pagecache); + +/** + * truncate_setsize - update inode and pagecache for a new file size + * @inode: inode + * @newsize: new file size + * + * truncate_setsize updastes i_size update and performs pagecache + * truncation (if necessary) for a file size updates. It will be + * typically be called from the filesystem's setattr function when + * ATTR_SIZE is passed in. + * + * Must be called with inode_mutex held and after all filesystem + * specific block truncation has been performed. + */ +void truncate_setsize(struct inode *inode, loff_t newsize) +{ + loff_t oldsize; + + oldsize = inode->i_size; + i_size_write(inode, newsize); + + truncate_pagecache(inode, oldsize, newsize); +} +EXPORT_SYMBOL(truncate_setsize); + +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * This function is deprecated and truncate_setsize or truncate_pagecache + * should be used instead, together with filesystem specific block truncation. + */ +int vmtruncate(struct inode *inode, loff_t offset) +{ + int error; + + error = inode_newsize_ok(inode, offset); + if (error) + return error; + + truncate_setsize(inode, offset); + if (inode->i_op->truncate) + inode->i_op->truncate(inode); + return 0; +} +EXPORT_SYMBOL(vmtruncate); @@ -204,15 +204,10 @@ char *strndup_user(const char __user *s, long n) if (length > n) return ERR_PTR(-EINVAL); - p = kmalloc(length, GFP_KERNEL); + p = memdup_user(s, length); - if (!p) - return ERR_PTR(-ENOMEM); - - if (copy_from_user(p, s, length)) { - kfree(p); - return ERR_PTR(-EFAULT); - } + if (IS_ERR(p)) + return p; p[length - 1] = '\0'; @@ -220,7 +215,7 @@ char *strndup_user(const char __user *s, long n) } EXPORT_SYMBOL(strndup_user); -#ifndef HAVE_ARCH_PICK_MMAP_LAYOUT +#if defined(CONFIG_MMU) && !defined(HAVE_ARCH_PICK_MMAP_LAYOUT) void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; @@ -229,6 +224,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm) } #endif +/* + * Like get_user_pages_fast() except its IRQ-safe in that it won't fall + * back to the regular GUP. + * If the architecture not support this fucntion, simply return with no + * page pinned + */ +int __attribute__((weak)) __get_user_pages_fast(unsigned long start, + int nr_pages, int write, struct page **pages) +{ + return 0; +} +EXPORT_SYMBOL_GPL(__get_user_pages_fast); + /** * get_user_pages_fast() - pin user pages in memory * @start: starting user address diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f8189a4..eb5cc7d 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> @@ -25,11 +26,10 @@ #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> - #include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> - +#include <asm/shmparam.h> /*** Page table manipulation functions ***/ @@ -168,11 +168,9 @@ static int vmap_page_range_noflush(unsigned long start, unsigned long end, next = pgd_addr_end(addr, end); err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); if (err) - break; + return err; } while (pgd++, addr = next, addr != end); - if (unlikely(err)) - return err; return nr; } @@ -186,7 +184,7 @@ static int vmap_page_range(unsigned long start, unsigned long end, return ret; } -static inline int is_vmalloc_or_module_addr(const void *x) +int is_vmalloc_or_module_addr(const void *x) { /* * ARM, x86-64 and sparc64 put modules in a special place, @@ -265,6 +263,7 @@ struct vmap_area { static DEFINE_SPINLOCK(vmap_area_lock); static struct rb_root vmap_area_root = RB_ROOT; static LIST_HEAD(vmap_area_list); +static unsigned long vmap_area_pcpu_hole; static struct vmap_area *__find_vmap_area(unsigned long addr) { @@ -292,13 +291,13 @@ static void __insert_vmap_area(struct vmap_area *va) struct rb_node *tmp; while (*p) { - struct vmap_area *tmp; + struct vmap_area *tmp_va; parent = *p; - tmp = rb_entry(parent, struct vmap_area, rb_node); - if (va->va_start < tmp->va_end) + tmp_va = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp_va->va_end) p = &(*p)->rb_left; - else if (va->va_end > tmp->va_start) + else if (va->va_end > tmp_va->va_start) p = &(*p)->rb_right; else BUG(); @@ -431,6 +430,15 @@ static void __free_vmap_area(struct vmap_area *va) RB_CLEAR_NODE(&va->rb_node); list_del_rcu(&va->list); + /* + * Track the highest possible candidate for pcpu area + * allocation. Areas outside of vmalloc area can be returned + * here too, consider only end addresses which fall inside + * vmalloc area proper. + */ + if (va->va_end > VMALLOC_START && va->va_end <= VMALLOC_END) + vmap_area_pcpu_hole = max(vmap_area_pcpu_hole, va->va_end); + call_rcu(&va->rcu_head, rcu_free_va); } @@ -500,6 +508,18 @@ static unsigned long lazy_max_pages(void) static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); +/* for per-CPU blocks */ +static void purge_fragmented_blocks_allcpus(void); + +/* + * called before a call to iounmap() if the caller wants vm_area_struct's + * immediately freed. + */ +void set_iounmap_nonlazy(void) +{ + atomic_set(&vmap_lazy_nr, lazy_max_pages()+1); +} + /* * Purges all lazily-freed vmap areas. * @@ -530,6 +550,9 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } else spin_lock(&purge_lock); + if (sync) + purge_fragmented_blocks_allcpus(); + rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { if (va->flags & VM_LAZY_FREE) { @@ -538,7 +561,6 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, if (va->va_end > *end) *end = va->va_end; nr += (va->va_end - va->va_start) >> PAGE_SHIFT; - unmap_vmap_area(va); list_add_tail(&va->purge_list, &valist); va->flags |= VM_LAZY_FREEING; va->flags &= ~VM_LAZY_FREE; @@ -546,10 +568,8 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, } rcu_read_unlock(); - if (nr) { - BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + if (nr) atomic_sub(nr, &vmap_lazy_nr); - } if (nr || force_flush) flush_tlb_kernel_range(*start, *end); @@ -585,10 +605,11 @@ static void purge_vmap_area_lazy(void) } /* - * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been - * called for the correct range previously. + * Free a vmap area, caller ensuring that the area has been unmapped + * and flush_cache_vunmap had been called for the correct range + * previously. */ -static void free_unmap_vmap_area_noflush(struct vmap_area *va) +static void free_vmap_area_noflush(struct vmap_area *va) { va->flags |= VM_LAZY_FREE; atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); @@ -597,6 +618,16 @@ static void free_unmap_vmap_area_noflush(struct vmap_area *va) } /* + * Free and unmap a vmap area, caller ensuring flush_cache_vunmap had been + * called for the correct range previously. + */ +static void free_unmap_vmap_area_noflush(struct vmap_area *va) +{ + unmap_vmap_area(va); + free_vmap_area_noflush(va); +} + +/* * Free and unmap a vmap area */ static void free_unmap_vmap_area(struct vmap_area *va) @@ -660,8 +691,6 @@ static bool vmap_initialized __read_mostly = false; struct vmap_block_queue { spinlock_t lock; struct list_head free; - struct list_head dirty; - unsigned int nr_dirty; }; struct vmap_block { @@ -671,10 +700,9 @@ struct vmap_block { unsigned long free, dirty; DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); - union { - struct list_head free_list; - struct rcu_head rcu_head; - }; + struct list_head free_list; + struct rcu_head rcu_head; + struct list_head purge; }; /* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ @@ -722,7 +750,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) node, gfp_mask); if (unlikely(IS_ERR(va))) { kfree(vb); - return ERR_PTR(PTR_ERR(va)); + return ERR_CAST(va); } err = radix_tree_preload(gfp_mask); @@ -750,9 +778,9 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vbq = &get_cpu_var(vmap_block_queue); vb->vbq = vbq; spin_lock(&vbq->lock); - list_add(&vb->free_list, &vbq->free); + list_add_rcu(&vb->free_list, &vbq->free); spin_unlock(&vbq->lock); - put_cpu_var(vmap_cpu_blocks); + put_cpu_var(vmap_block_queue); return vb; } @@ -769,24 +797,71 @@ static void free_vmap_block(struct vmap_block *vb) struct vmap_block *tmp; unsigned long vb_idx; - BUG_ON(!list_empty(&vb->free_list)); - vb_idx = addr_to_vb_idx(vb->va->va_start); spin_lock(&vmap_block_tree_lock); tmp = radix_tree_delete(&vmap_block_tree, vb_idx); spin_unlock(&vmap_block_tree_lock); BUG_ON(tmp != vb); - free_unmap_vmap_area_noflush(vb->va); + free_vmap_area_noflush(vb->va); call_rcu(&vb->rcu_head, rcu_free_vb); } +static void purge_fragmented_blocks(int cpu) +{ + LIST_HEAD(purge); + struct vmap_block *vb; + struct vmap_block *n_vb; + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + + if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS)) + continue; + + spin_lock(&vb->lock); + if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { + vb->free = 0; /* prevent further allocs after releasing lock */ + vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ + bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); + spin_unlock(&vb->lock); + list_add_tail(&vb->purge, &purge); + } else + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + + list_for_each_entry_safe(vb, n_vb, &purge, purge) { + list_del(&vb->purge); + free_vmap_block(vb); + } +} + +static void purge_fragmented_blocks_thiscpu(void) +{ + purge_fragmented_blocks(smp_processor_id()); +} + +static void purge_fragmented_blocks_allcpus(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + purge_fragmented_blocks(cpu); +} + static void *vb_alloc(unsigned long size, gfp_t gfp_mask) { struct vmap_block_queue *vbq; struct vmap_block *vb; unsigned long addr = 0; unsigned int order; + int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -799,25 +874,39 @@ again: int i; spin_lock(&vb->lock); + if (vb->free < 1UL << order) + goto next; + i = bitmap_find_free_region(vb->alloc_map, VMAP_BBMAP_BITS, order); - if (i >= 0) { - addr = vb->va->va_start + (i << PAGE_SHIFT); - BUG_ON(addr_to_vb_idx(addr) != - addr_to_vb_idx(vb->va->va_start)); - vb->free -= 1UL << order; - if (vb->free == 0) { - spin_lock(&vbq->lock); - list_del_init(&vb->free_list); - spin_unlock(&vbq->lock); + if (i < 0) { + if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { + /* fragmented and no outstanding allocations */ + BUG_ON(vb->dirty != VMAP_BBMAP_BITS); + purge = 1; } - spin_unlock(&vb->lock); - break; + goto next; + } + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_rcu(&vb->free_list); + spin_unlock(&vbq->lock); } spin_unlock(&vb->lock); + break; +next: + spin_unlock(&vb->lock); } - put_cpu_var(vmap_cpu_blocks); + + if (purge) + purge_fragmented_blocks_thiscpu(); + + put_cpu_var(vmap_block_queue); rcu_read_unlock(); if (!addr) { @@ -852,12 +941,14 @@ static void vb_free(const void *addr, unsigned long size) rcu_read_unlock(); BUG_ON(!vb); + vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + spin_lock(&vb->lock); - bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + BUG_ON(bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order)); vb->dirty += 1UL << order; if (vb->dirty == VMAP_BBMAP_BITS) { - BUG_ON(vb->free || !list_empty(&vb->free_list)); + BUG_ON(vb->free); spin_unlock(&vb->lock); free_vmap_block(vb); } else @@ -904,7 +995,6 @@ void vm_unmap_aliases(void) s = vb->va->va_start + (i << PAGE_SHIFT); e = vb->va->va_start + (j << PAGE_SHIFT); - vunmap_page_range(s, e); flush = 1; if (s < start) @@ -1026,8 +1116,6 @@ void __init vmalloc_init(void) vbq = &per_cpu(vmap_block_queue, i); spin_lock_init(&vbq->lock); INIT_LIST_HEAD(&vbq->free); - INIT_LIST_HEAD(&vbq->dirty); - vbq->nr_dirty = 0; } /* Import existing vmlist entries. */ @@ -1038,6 +1126,9 @@ void __init vmalloc_init(void) va->va_end = va->va_start + tmp->size; __insert_vmap_area(va); } + + vmap_area_pcpu_hole = VMALLOC_END; + vmap_initialized = true; } @@ -1122,14 +1213,34 @@ EXPORT_SYMBOL_GPL(map_vm_area); DEFINE_RWLOCK(vmlist_lock); struct vm_struct *vmlist; +static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, + unsigned long flags, void *caller) +{ + struct vm_struct *tmp, **p; + + vm->flags = flags; + vm->addr = (void *)va->va_start; + vm->size = va->va_end - va->va_start; + vm->caller = caller; + va->private = vm; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= vm->addr) + break; + } + vm->next = *p; + *p = vm; + write_unlock(&vmlist_lock); +} + static struct vm_struct *__get_vm_area_node(unsigned long size, - unsigned long flags, unsigned long start, unsigned long end, - int node, gfp_t gfp_mask, void *caller) + unsigned long align, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, void *caller) { static struct vmap_area *va; struct vm_struct *area; - struct vm_struct *tmp, **p; - unsigned long align = 1; BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { @@ -1147,7 +1258,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, if (unlikely(!size)) return NULL; - area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); + area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); if (unlikely(!area)) return NULL; @@ -1162,32 +1273,14 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - area->flags = flags; - area->addr = (void *)va->va_start; - area->size = size; - area->pages = NULL; - area->nr_pages = 0; - area->phys_addr = 0; - area->caller = caller; - va->private = area; - va->flags |= VM_VM_AREA; - - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { - if (tmp->addr >= area->addr) - break; - } - area->next = *p; - *p = area; - write_unlock(&vmlist_lock); - + insert_vmalloc_vm(area, va, flags, caller); return area; } struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); @@ -1196,7 +1289,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, caller); } @@ -1211,22 +1304,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, caller); } struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, - gfp_mask, __builtin_return_address(0)); + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + node, gfp_mask, __builtin_return_address(0)); } static struct vm_struct *find_vm_area(const void *addr) @@ -1256,17 +1349,21 @@ struct vm_struct *remove_vm_area(const void *addr) if (va && va->flags & VM_VM_AREA) { struct vm_struct *vm = va->private; struct vm_struct *tmp, **p; - - vmap_debug_free_range(va->va_start, va->va_end); - free_unmap_vmap_area(va); - vm->size -= PAGE_SIZE; - + /* + * remove from list and disallow access to this vm_struct + * before unmap. (address range confliction is maintained by + * vmap.) + */ write_lock(&vmlist_lock); for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) ; *p = tmp->next; write_unlock(&vmlist_lock); + vmap_debug_free_range(va->va_start, va->va_end); + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + return vm; } return NULL; @@ -1368,7 +1465,7 @@ void *vmap(struct page **pages, unsigned int count, might_sleep(); - if (count > num_physpages) + if (count > totalram_pages) return NULL; area = get_vm_area_caller((count << PAGE_SHIFT), flags, @@ -1385,13 +1482,15 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, int node, void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) { struct page **pages; unsigned int nr_pages, array_size, i; + gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT; array_size = (nr_pages * sizeof(struct page *)); @@ -1399,13 +1498,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, + pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { - pages = kmalloc_node(array_size, - (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO, - node); + pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; area->caller = caller; @@ -1458,6 +1555,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size + * @align: desired alignment * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1467,19 +1565,20 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) { struct vm_struct *area; void *addr; unsigned long real_size = size; size = PAGE_ALIGN(size); - if (!size || (size >> PAGE_SHIFT) > num_physpages) + if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, - node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, + VMALLOC_END, node, gfp_mask, caller); if (!area) return NULL; @@ -1498,11 +1597,18 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, gfp_mask, prot, -1, + return __vmalloc_node(size, 1, gfp_mask, prot, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); +static inline void *__vmalloc_node_flags(unsigned long size, + int node, gfp_t flags) +{ + return __vmalloc_node(size, 1, flags, PAGE_KERNEL, + node, __builtin_return_address(0)); +} + /** * vmalloc - allocate virtually contiguous memory * @size: allocation size @@ -1514,12 +1620,28 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, - -1, __builtin_return_address(0)); + return __vmalloc_node_flags(size, -1, GFP_KERNEL | __GFP_HIGHMEM); } EXPORT_SYMBOL(vmalloc); /** + * vzalloc - allocate virtually contiguous memory with zero fill + * @size: allocation size + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc() instead. + */ +void *vzalloc(unsigned long size) +{ + return __vmalloc_node_flags(size, -1, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc); + +/** * vmalloc_user - allocate zeroed virtually contiguous memory for userspace * @size: allocation size * @@ -1531,7 +1653,8 @@ void *vmalloc_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + ret = __vmalloc_node(size, SHMLBA, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); @@ -1554,11 +1677,30 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); +/** + * vzalloc_node - allocate memory on a specific node with zero fill + * @size: allocation size + * @node: numa node + * + * Allocate enough pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * The memory allocated is set to zero. + * + * For tight control over page level allocator and protection flags + * use __vmalloc_node() instead. + */ +void *vzalloc_node(unsigned long size, int node) +{ + return __vmalloc_node_flags(size, node, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); +} +EXPORT_SYMBOL(vzalloc_node); + #ifndef PAGE_KERNEL_EXEC # define PAGE_KERNEL_EXEC PAGE_KERNEL #endif @@ -1577,7 +1719,7 @@ EXPORT_SYMBOL(vmalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, -1, __builtin_return_address(0)); } @@ -1598,7 +1740,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -1615,7 +1757,7 @@ void *vmalloc_32_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); @@ -1625,10 +1767,120 @@ void *vmalloc_32_user(unsigned long size) } EXPORT_SYMBOL(vmalloc_32_user); +/* + * small helper routine , copy contents to buf from addr. + * If the page is not present, fill zero. + */ + +static int aligned_vread(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p, KM_USER0); + memcpy(buf, map + offset, length); + kunmap_atomic(map, KM_USER0); + } else + memset(buf, 0, length); + + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +static int aligned_vwrite(char *buf, char *addr, unsigned long count) +{ + struct page *p; + int copied = 0; + + while (count) { + unsigned long offset, length; + + offset = (unsigned long)addr & ~PAGE_MASK; + length = PAGE_SIZE - offset; + if (length > count) + length = count; + p = vmalloc_to_page(addr); + /* + * To do safe access to this _mapped_ area, we need + * lock. But adding lock here means that we need to add + * overhead of vmalloc()/vfree() calles for this _debug_ + * interface, rarely used. Instead of that, we'll use + * kmap() and get small overhead in this access function. + */ + if (p) { + /* + * we can expect USER0 is not used (see vread/vwrite's + * function description) + */ + void *map = kmap_atomic(p, KM_USER0); + memcpy(map + offset, buf, length); + kunmap_atomic(map, KM_USER0); + } + addr += length; + buf += length; + copied += length; + count -= length; + } + return copied; +} + +/** + * vread() - read vmalloc area in a safe way. + * @buf: buffer for reading data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be increased. + * (same number to @count). Returns 0 if [addr...addr+count) doesn't + * includes any intersect with alive vmalloc area. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from that area to a given buffer. If the given memory range + * of [addr...addr+count) includes some valid address, data is copied to + * proper area of @buf. If there are memory holes, they'll be zero-filled. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. + * @buf should be kernel's buffer. Because this function uses KM_USER0, + * the caller should guarantee KM_USER0 is not used. + * + * Note: In usual ops, vread() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + */ + long vread(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; char *vaddr, *buf_start = buf; + unsigned long buflen = count; unsigned long n; /* Don't allow overflow */ @@ -1636,7 +1888,7 @@ long vread(char *buf, char *addr, unsigned long count) count = -(unsigned long) addr; read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { + for (tmp = vmlist; count && tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; if (addr >= vaddr + tmp->size - PAGE_SIZE) continue; @@ -1649,32 +1901,72 @@ long vread(char *buf, char *addr, unsigned long count) count--; } n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *buf = *addr; - buf++; - addr++; - count--; - } while (--n > 0); + if (n > count) + n = count; + if (!(tmp->flags & VM_IOREMAP)) + aligned_vread(buf, addr, n); + else /* IOREMAP area is treated as memory hole */ + memset(buf, 0, n); + buf += n; + addr += n; + count -= n; } finished: read_unlock(&vmlist_lock); - return buf - buf_start; + + if (buf == buf_start) + return 0; + /* zero-fill memory holes */ + if (buf != buf_start + buflen) + memset(buf, 0, buflen - (buf - buf_start)); + + return buflen; } +/** + * vwrite() - write vmalloc area in a safe way. + * @buf: buffer for source data + * @addr: vm address. + * @count: number of bytes to be read. + * + * Returns # of bytes which addr and buf should be incresed. + * (same number to @count). + * If [addr...addr+count) doesn't includes any intersect with valid + * vmalloc area, returns 0. + * + * This function checks that addr is a valid vmalloc'ed area, and + * copy data from a buffer to the given addr. If specified range of + * [addr...addr+count) includes some valid address, data is copied from + * proper area of @buf. If there are memory holes, no copy to hole. + * IOREMAP area is treated as memory hole and no copy is done. + * + * If [addr...addr+count) doesn't includes any intersects with alive + * vm_struct area, returns 0. + * @buf should be kernel's buffer. Because this function uses KM_USER0, + * the caller should guarantee KM_USER0 is not used. + * + * Note: In usual ops, vwrite() is never necessary because the caller + * should know vmalloc() area is valid and can use memcpy(). + * This is for routines which have to access vmalloc area without + * any informaion, as /dev/kmem. + * + * The caller should guarantee KM_USER1 is not used. + */ + long vwrite(char *buf, char *addr, unsigned long count) { struct vm_struct *tmp; - char *vaddr, *buf_start = buf; - unsigned long n; + char *vaddr; + unsigned long n, buflen; + int copied = 0; /* Don't allow overflow */ if ((unsigned long) addr + count < count) count = -(unsigned long) addr; + buflen = count; read_lock(&vmlist_lock); - for (tmp = vmlist; tmp; tmp = tmp->next) { + for (tmp = vmlist; count && tmp; tmp = tmp->next) { vaddr = (char *) tmp->addr; if (addr >= vaddr + tmp->size - PAGE_SIZE) continue; @@ -1686,18 +1978,21 @@ long vwrite(char *buf, char *addr, unsigned long count) count--; } n = vaddr + tmp->size - PAGE_SIZE - addr; - do { - if (count == 0) - goto finished; - *addr = *buf; - buf++; - addr++; - count--; - } while (--n > 0); + if (n > count) + n = count; + if (!(tmp->flags & VM_IOREMAP)) { + aligned_vwrite(buf, addr, n); + copied++; + } + buf += n; + addr += n; + count -= n; } finished: read_unlock(&vmlist_lock); - return buf - buf_start; + if (!copied) + return 0; + return buflen; } /** @@ -1818,9 +2113,292 @@ void free_vm_area(struct vm_struct *area) } EXPORT_SYMBOL_GPL(free_vm_area); +#ifdef CONFIG_SMP +static struct vmap_area *node_to_va(struct rb_node *n) +{ + return n ? rb_entry(n, struct vmap_area, rb_node) : NULL; +} + +/** + * pvm_find_next_prev - find the next and prev vmap_area surrounding @end + * @end: target address + * @pnext: out arg for the next vmap_area + * @pprev: out arg for the previous vmap_area + * + * Returns: %true if either or both of next and prev are found, + * %false if no vmap_area exists + * + * Find vmap_areas end addresses of which enclose @end. ie. if not + * NULL, *pnext->va_end > @end and *pprev->va_end <= @end. + */ +static bool pvm_find_next_prev(unsigned long end, + struct vmap_area **pnext, + struct vmap_area **pprev) +{ + struct rb_node *n = vmap_area_root.rb_node; + struct vmap_area *va = NULL; + + while (n) { + va = rb_entry(n, struct vmap_area, rb_node); + if (end < va->va_end) + n = n->rb_left; + else if (end > va->va_end) + n = n->rb_right; + else + break; + } + + if (!va) + return false; + + if (va->va_end > end) { + *pnext = va; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } else { + *pprev = va; + *pnext = node_to_va(rb_next(&(*pprev)->rb_node)); + } + return true; +} + +/** + * pvm_determine_end - find the highest aligned address between two vmap_areas + * @pnext: in/out arg for the next vmap_area + * @pprev: in/out arg for the previous vmap_area + * @align: alignment + * + * Returns: determined end address + * + * Find the highest aligned address between *@pnext and *@pprev below + * VMALLOC_END. *@pnext and *@pprev are adjusted so that the aligned + * down address is between the end addresses of the two vmap_areas. + * + * Please note that the address returned by this function may fall + * inside *@pnext vmap_area. The caller is responsible for checking + * that. + */ +static unsigned long pvm_determine_end(struct vmap_area **pnext, + struct vmap_area **pprev, + unsigned long align) +{ + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + unsigned long addr; + + if (*pnext) + addr = min((*pnext)->va_start & ~(align - 1), vmalloc_end); + else + addr = vmalloc_end; + + while (*pprev && (*pprev)->va_end > addr) { + *pnext = *pprev; + *pprev = node_to_va(rb_prev(&(*pnext)->rb_node)); + } + + return addr; +} + +/** + * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator + * @offsets: array containing offset of each area + * @sizes: array containing size of each area + * @nr_vms: the number of areas to allocate + * @align: alignment, all entries in @offsets and @sizes must be aligned to this + * @gfp_mask: allocation mask + * + * Returns: kmalloc'd vm_struct pointer array pointing to allocated + * vm_structs on success, %NULL on failure + * + * Percpu allocator wants to use congruent vm areas so that it can + * maintain the offsets among percpu areas. This function allocates + * congruent vmalloc areas for it. These areas tend to be scattered + * pretty far, distance between two areas easily going up to + * gigabytes. To avoid interacting with regular vmallocs, these areas + * are allocated from top. + * + * Despite its complicated look, this allocator is rather simple. It + * does everything top-down and scans areas from the end looking for + * matching slot. While scanning, if any of the areas overlaps with + * existing vmap_area, the base address is pulled down to fit the + * area. Scanning is repeated till all the areas fit and then all + * necessary data structres are inserted and the result is returned. + */ +struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, + const size_t *sizes, int nr_vms, + size_t align, gfp_t gfp_mask) +{ + const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); + const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); + struct vmap_area **vas, *prev, *next; + struct vm_struct **vms; + int area, area2, last_area, term_area; + unsigned long base, start, end, last_end; + bool purged = false; + + gfp_mask &= GFP_RECLAIM_MASK; + + /* verify parameters and allocate data structures */ + BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); + for (last_area = 0, area = 0; area < nr_vms; area++) { + start = offsets[area]; + end = start + sizes[area]; + + /* is everything aligned properly? */ + BUG_ON(!IS_ALIGNED(offsets[area], align)); + BUG_ON(!IS_ALIGNED(sizes[area], align)); + + /* detect the area with the highest address */ + if (start > offsets[last_area]) + last_area = area; + + for (area2 = 0; area2 < nr_vms; area2++) { + unsigned long start2 = offsets[area2]; + unsigned long end2 = start2 + sizes[area2]; + + if (area2 == area) + continue; + + BUG_ON(start2 >= start && start2 < end); + BUG_ON(end2 <= end && end2 > start); + } + } + last_end = offsets[last_area] + sizes[last_area]; + + if (vmalloc_end - vmalloc_start < last_end) { + WARN_ON(true); + return NULL; + } + + vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); + vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + if (!vas || !vms) + goto err_free; + + for (area = 0; area < nr_vms; area++) { + vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); + vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + if (!vas[area] || !vms[area]) + goto err_free; + } +retry: + spin_lock(&vmap_area_lock); + + /* start scanning - we scan from the top, begin with the last area */ + area = term_area = last_area; + start = offsets[area]; + end = start + sizes[area]; + + if (!pvm_find_next_prev(vmap_area_pcpu_hole, &next, &prev)) { + base = vmalloc_end - last_end; + goto found; + } + base = pvm_determine_end(&next, &prev, align) - end; + + while (true) { + BUG_ON(next && next->va_end <= base + end); + BUG_ON(prev && prev->va_end > base + end); + + /* + * base might have underflowed, add last_end before + * comparing. + */ + if (base + last_end < vmalloc_start + last_end) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = true; + goto retry; + } + goto err_free; + } + + /* + * If next overlaps, move base downwards so that it's + * right below next and then recheck. + */ + if (next && next->va_start < base + end) { + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * If prev overlaps, shift down next and prev and move + * base so that it's right below new next and then + * recheck. + */ + if (prev && prev->va_end > base + start) { + next = prev; + prev = node_to_va(rb_prev(&next->rb_node)); + base = pvm_determine_end(&next, &prev, align) - end; + term_area = area; + continue; + } + + /* + * This area fits, move on to the previous one. If + * the previous one is the terminal one, we're done. + */ + area = (area + nr_vms - 1) % nr_vms; + if (area == term_area) + break; + start = offsets[area]; + end = start + sizes[area]; + pvm_find_next_prev(base + end, &next, &prev); + } +found: + /* we've found a fitting base, insert all va's */ + for (area = 0; area < nr_vms; area++) { + struct vmap_area *va = vas[area]; + + va->va_start = base + offsets[area]; + va->va_end = va->va_start + sizes[area]; + __insert_vmap_area(va); + } + + vmap_area_pcpu_hole = base + offsets[last_area]; + + spin_unlock(&vmap_area_lock); + + /* insert all vm's */ + for (area = 0; area < nr_vms; area++) + insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); + + kfree(vas); + return vms; + +err_free: + for (area = 0; area < nr_vms; area++) { + if (vas) + kfree(vas[area]); + if (vms) + kfree(vms[area]); + } + kfree(vas); + kfree(vms); + return NULL; +} + +/** + * pcpu_free_vm_areas - free vmalloc areas for percpu allocator + * @vms: vm_struct pointer array returned by pcpu_get_vm_areas() + * @nr_vms: the number of allocated areas + * + * Free vm_structs and the array allocated by pcpu_get_vm_areas(). + */ +void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms) +{ + int i; + + for (i = 0; i < nr_vms; i++) + free_vm_area(vms[i]); + kfree(vms); +} +#endif /* CONFIG_SMP */ #ifdef CONFIG_PROC_FS static void *s_start(struct seq_file *m, loff_t *pos) + __acquires(&vmlist_lock) { loff_t n = *pos; struct vm_struct *v; @@ -1847,6 +2425,7 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos) } static void s_stop(struct seq_file *m, void *p) + __releases(&vmlist_lock) { read_unlock(&vmlist_lock); } @@ -1889,7 +2468,7 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " pages=%d", v->nr_pages); if (v->phys_addr) - seq_printf(m, " phys=%lx", v->phys_addr); + seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); if (v->flags & VM_IOREMAP) seq_printf(m, " ioremap"); @@ -1923,8 +2502,11 @@ static int vmalloc_open(struct inode *inode, struct file *file) unsigned int *ptr = NULL; int ret; - if (NUMA_BUILD) + if (NUMA_BUILD) { ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); + if (ptr == NULL) + return -ENOMEM; + } ret = seq_open(file, &vmalloc_op); if (!ret) { struct seq_file *m = file->private_data; diff --git a/mm/vmscan.c b/mm/vmscan.c index e8fa2d9..9ca587c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -13,7 +13,7 @@ #include <linux/mm.h> #include <linux/module.h> -#include <linux/slab.h> +#include <linux/gfp.h> #include <linux/kernel_stat.h> #include <linux/swap.h> #include <linux/pagemap.h> @@ -48,6 +48,15 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/vmscan.h> + +enum lumpy_mode { + LUMPY_MODE_NONE, + LUMPY_MODE_ASYNC, + LUMPY_MODE_SYNC, +}; + struct scan_control { /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; @@ -55,6 +64,11 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + unsigned long hibernation_mode; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -66,18 +80,16 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for - * suspend, we effectively ignore SWAP_CLUSTER_MAX. - * In this context, it doesn't matter that we scan the - * whole list at once. */ - int swap_cluster_max; - int swappiness; - int all_unreclaimable; - int order; + /* + * Intend to reclaim enough continuous memory rather than reclaim + * enough amount of memory. i.e, mode for high order allocation. + */ + enum lumpy_mode lumpy_reclaim_mode; + /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -86,12 +98,6 @@ struct scan_control { * are scanned. */ nodemask_t *nodemask; - - /* Pluggable isolate pages callback */ - unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, - unsigned long *scanned, int order, int mode, - struct zone *z, struct mem_cgroup *mem_cont, - int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -148,8 +154,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, return &zone->reclaim_stat; } -static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, - enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct zone *zone, + struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); @@ -216,8 +222,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; - unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask); + unsigned long max_pass; + max_pass = (*shrinker->shrink)(shrinker, 0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; delta *= max_pass; do_div(delta, lru_pages + 1); @@ -245,8 +252,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, int shrink_ret; int nr_before; - nr_before = (*shrinker->shrink)(0, gfp_mask); - shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask); + nr_before = (*shrinker->shrink)(shrinker, 0, gfp_mask); + shrink_ret = (*shrinker->shrink)(shrinker, this_scan, + gfp_mask); if (shrink_ret == -1) break; if (shrink_ret < nr_before) @@ -263,33 +271,48 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } -/* Called without lock on whether page is mapped, so answer is unstable */ -static inline int page_mapping_inuse(struct page *page) +static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, + bool sync) { - struct address_space *mapping; + enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; - /* Page is in somebody's page tables. */ - if (page_mapped(page)) - return 1; - - /* Be more reluctant to reclaim swapcache than pagecache */ - if (PageSwapCache(page)) - return 1; + /* + * Some reclaim have alredy been failed. No worth to try synchronous + * lumpy reclaim. + */ + if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + return; - mapping = page_mapping(page); - if (!mapping) - return 0; + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + sc->lumpy_reclaim_mode = mode; + else if (sc->order && priority < DEF_PRIORITY - 2) + sc->lumpy_reclaim_mode = mode; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; +} - /* File is mmap'd by somebody? */ - return mapping_mapped(mapping); +static void disable_lumpy_reclaim_mode(struct scan_control *sc) +{ + sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; } static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!page_has_private(page) == 2; + /* + * A freeable page cache page is referenced only by the caller + * that isolated the page, the page cache radix tree and + * optional buffer heads at page->private. + */ + return page_count(page) - page_has_private(page) == 2; } -static int may_write_to_queue(struct backing_dev_info *bdi) +static int may_write_to_queue(struct backing_dev_info *bdi, + struct scan_control *sc) { if (current->flags & PF_SWAPWRITE) return 1; @@ -297,6 +320,10 @@ static int may_write_to_queue(struct backing_dev_info *bdi) return 1; if (bdi == current->backing_dev_info) return 1; + + /* lumpy reclaim for hugepage often need a lot of write */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + return 1; return 0; } @@ -315,18 +342,12 @@ static int may_write_to_queue(struct backing_dev_info *bdi) static void handle_write_error(struct address_space *mapping, struct page *page, int error) { - lock_page(page); + lock_page_nosync(page); if (page_mapping(page) == mapping) mapping_set_error(mapping, error); unlock_page(page); } -/* Request for sync pageout. */ -enum pageout_io { - PAGEOUT_IO_ASYNC, - PAGEOUT_IO_SYNC, -}; - /* possible outcome of pageout() */ typedef enum { /* failed to write page out, page is locked */ @@ -344,7 +365,7 @@ typedef enum { * Calls ->writepage(). */ static pageout_t pageout(struct page *page, struct address_space *mapping, - enum pageout_io sync_writeback) + struct scan_control *sc) { /* * If the page is dirty, only perform writeback if that write @@ -353,7 +374,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in generic_file_write() against + * If this process is currently in __generic_file_aio_write() against * this page's queue, we can perform writeback even if that * will block. * @@ -361,7 +382,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. - * See swapfile.c:page_queue_congested(). */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; @@ -381,7 +401,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, } if (mapping->a_ops->writepage == NULL) return PAGE_ACTIVATE; - if (!may_write_to_queue(mapping->backing_dev_info)) + if (!may_write_to_queue(mapping->backing_dev_info, sc)) return PAGE_KEEP; if (clear_page_dirty_for_io(page)) { @@ -391,7 +411,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, .nr_to_write = SWAP_CLUSTER_MAX, .range_start = 0, .range_end = LLONG_MAX, - .nonblocking = 1, .for_reclaim = 1, }; @@ -409,13 +428,16 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * direct reclaiming a large contiguous area and the * first attempt to free a range of pages fails. */ - if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC) + if (PageWriteback(page) && + sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) wait_on_page_writeback(page); if (!PageWriteback(page)) { /* synchronous write or broken a_ops? */ ClearPageReclaim(page); } + trace_mm_vmscan_writepage(page, + trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -472,9 +494,16 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) spin_unlock_irq(&mapping->tree_lock); swapcache_free(swap, page); } else { + void (*freepage)(struct page *); + + freepage = mapping->a_ops->freepage; + __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); + + if (freepage != NULL) + freepage(page); } return 1; @@ -531,7 +560,7 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_is_file_cache(page); + lru = active + page_lru_base_type(page); lru_cache_add_lru(page, lru); } else { /* @@ -540,6 +569,16 @@ redo: */ lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); + /* + * When racing with an mlock clearing (page is + * unlocked), make sure that if the other thread does + * not observe our setting of PG_lru and fails + * isolation, we see PG_mlocked cleared below and move + * the page back to the evictable list. + * + * The other side is TestClearPageMlocked(). + */ + smp_mb(); } /* @@ -566,27 +605,104 @@ redo: put_page(page); /* drop ref from isolate */ } +enum page_references { + PAGEREF_RECLAIM, + PAGEREF_RECLAIM_CLEAN, + PAGEREF_KEEP, + PAGEREF_ACTIVATE, +}; + +static enum page_references page_check_references(struct page *page, + struct scan_control *sc) +{ + int referenced_ptes, referenced_page; + unsigned long vm_flags; + + referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags); + referenced_page = TestClearPageReferenced(page); + + /* Lumpy reclaim - ignore references */ + if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) + return PAGEREF_RECLAIM; + + /* + * Mlock lost the isolation race with us. Let try_to_unmap() + * move the page to the unevictable list. + */ + if (vm_flags & VM_LOCKED) + return PAGEREF_RECLAIM; + + if (referenced_ptes) { + if (PageAnon(page)) + return PAGEREF_ACTIVATE; + /* + * All mapped pages start out with page table + * references from the instantiating fault, so we need + * to look twice if a mapped file page is used more + * than once. + * + * Mark it and spare it for another trip around the + * inactive list. Another page table reference will + * lead to its activation. + * + * Note: the mark is set for activated pages as well + * so that recently deactivated but used pages are + * quickly recovered. + */ + SetPageReferenced(page); + + if (referenced_page) + return PAGEREF_ACTIVATE; + + return PAGEREF_KEEP; + } + + /* Reclaim if clean, defer dirty pages to writeback */ + if (referenced_page && !PageSwapBacked(page)) + return PAGEREF_RECLAIM_CLEAN; + + return PAGEREF_RECLAIM; +} + +static noinline_for_stack void free_page_list(struct list_head *free_pages) +{ + struct pagevec freed_pvec; + struct page *page, *tmp; + + pagevec_init(&freed_pvec, 1); + + list_for_each_entry_safe(page, tmp, free_pages, lru) { + list_del(&page->lru); + if (!pagevec_add(&freed_pvec, page)) { + __pagevec_free(&freed_pvec); + pagevec_reinit(&freed_pvec); + } + } + + pagevec_free(&freed_pvec); +} + /* * shrink_page_list() returns the number of reclaimed pages */ static unsigned long shrink_page_list(struct list_head *page_list, - struct scan_control *sc, - enum pageout_io sync_writeback) + struct zone *zone, + struct scan_control *sc) { LIST_HEAD(ret_pages); - struct pagevec freed_pvec; + LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; - unsigned long vm_flags; cond_resched(); - pagevec_init(&freed_pvec, 1); while (!list_empty(page_list)) { + enum page_references references; struct address_space *mapping; struct page *page; int may_enter_fs; - int referenced; cond_resched(); @@ -597,6 +713,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep; VM_BUG_ON(PageActive(page)); + VM_BUG_ON(page_zone(page) != zone); sc->nr_scanned++; @@ -622,18 +739,25 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs) + if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + may_enter_fs) wait_on_page_writeback(page); - else - goto keep_locked; + else { + unlock_page(page); + goto keep_lumpy; + } } - referenced = page_referenced(page, 1, - sc->mem_cgroup, &vm_flags); - /* In active use or really unfreeable? Activate it. */ - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + references = page_check_references(page, sc); + switch (references) { + case PAGEREF_ACTIVATE: goto activate_locked; + case PAGEREF_KEEP: + goto keep_locked; + case PAGEREF_RECLAIM: + case PAGEREF_RECLAIM_CLEAN: + ; /* try to reclaim the page below */ + } /* * Anonymous process memory has backing store? @@ -654,7 +778,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, 0)) { + switch (try_to_unmap(page, TTU_UNMAP)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -667,7 +791,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced) + nr_dirty++; + + if (references == PAGEREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs) goto keep_locked; @@ -675,14 +801,18 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; /* Page is dirty, try to write it out here */ - switch (pageout(page, mapping, sync_writeback)) { + switch (pageout(page, mapping, sc)) { case PAGE_KEEP: + nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; case PAGE_SUCCESS: - if (PageWriteback(page) || PageDirty(page)) + if (PageWriteback(page)) + goto keep_lumpy; + if (PageDirty(page)) goto keep; + /* * A synchronous write - probably a ramdisk. Go * ahead and try to reclaim the page. @@ -752,10 +882,12 @@ static unsigned long shrink_page_list(struct list_head *page_list, __clear_page_locked(page); free_it: nr_reclaimed++; - if (!pagevec_add(&freed_pvec, page)) { - __pagevec_free(&freed_pvec); - pagevec_reinit(&freed_pvec); - } + + /* + * Is there need to periodically free_page_list? It would + * appear not as the counts should be low + */ + list_add(&page->lru, &free_pages); continue; cull_mlocked: @@ -763,6 +895,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); + disable_lumpy_reclaim_mode(sc); continue; activate_locked: @@ -775,21 +908,28 @@ activate_locked: keep_locked: unlock_page(page); keep: + disable_lumpy_reclaim_mode(sc); +keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } + + /* + * Tag a zone as congested if all the dirty pages encountered were + * backed by a congested BDI. In this case, reclaimers should just + * back off and wait for congestion to clear because further reclaim + * will encounter the same problem + */ + if (nr_dirty == nr_congested && nr_dirty != 0) + zone_set_flag(zone, ZONE_CONGESTED); + + free_page_list(&free_pages); + list_splice(&ret_pages, page_list); - if (pagevec_count(&freed_pvec)) - __pagevec_free(&freed_pvec); count_vm_events(PGACTIVATE, pgactivate); return nr_reclaimed; } -/* LRU Isolation modes. */ -#define ISOLATE_INACTIVE 0 /* Isolate inactive pages. */ -#define ISOLATE_ACTIVE 1 /* Isolate active pages. */ -#define ISOLATE_BOTH 2 /* Isolate both active and inactive pages. */ - /* * Attempt to remove the specified page from its LRU. Only take this page * if it is of the appropriate PageActive status. Pages which are being @@ -816,7 +956,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; - if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) return ret; /* @@ -867,6 +1007,9 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; + unsigned long nr_lumpy_taken = 0; + unsigned long nr_lumpy_dirty = 0; + unsigned long nr_lumpy_failed = 0; unsigned long scan; for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { @@ -929,17 +1072,45 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) - continue; + break; + + /* + * If we don't have enough swap space, reclaiming of + * anon page which don't already have a swap slot is + * pointless. + */ + if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + !PageSwapCache(cursor_page)) + break; + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); - mem_cgroup_del_lru(page); + mem_cgroup_del_lru(cursor_page); nr_taken++; + nr_lumpy_taken++; + if (PageDirty(cursor_page)) + nr_lumpy_dirty++; scan++; + } else { + /* the page is freed already. */ + if (!page_count(cursor_page)) + continue; + break; } } + + /* If we break out of the loop above, lumpy reclaim failed */ + if (pfn < end_pfn) + nr_lumpy_failed++; } *scanned = scan; + + trace_mm_vmscan_lru_isolate(order, + nr_to_scan, scan, + nr_taken, + nr_lumpy_taken, nr_lumpy_dirty, nr_lumpy_failed, + mode); return nr_taken; } @@ -947,7 +1118,6 @@ static unsigned long isolate_pages_global(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, - struct mem_cgroup *mem_cont, int active, int file) { int lru = LRU_BASE; @@ -956,7 +1126,7 @@ static unsigned long isolate_pages_global(unsigned long nr, if (file) lru += LRU_FILE; return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, !!file); + mode, file); } /* @@ -971,13 +1141,14 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { - lru = page_is_file_cache(page); + lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); nr_active++; } - count[lru]++; + if (count) + count[lru]++; } return nr_active; @@ -1029,155 +1200,234 @@ int isolate_lru_page(struct page *page) } /* - * shrink_inactive_list() is a helper for shrink_zone(). It returns the number - * of reclaimed pages + * Are there way too many processes in the direct reclaim path already? */ -static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc, - int priority, int file) +static int too_many_isolated(struct zone *zone, int file, + struct scan_control *sc) { - LIST_HEAD(page_list); + unsigned long inactive, isolated; + + if (current_is_kswapd()) + return 0; + + if (!scanning_global_lru(sc)) + return 0; + + if (file) { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } + + return isolated > inactive; +} + +/* + * TODO: Try merging with migrations version of putback_lru_pages + */ +static noinline_for_stack void +putback_lru_pages(struct zone *zone, struct scan_control *sc, + unsigned long nr_anon, unsigned long nr_file, + struct list_head *page_list) +{ + struct page *page; struct pagevec pvec; - unsigned long nr_scanned = 0; - unsigned long nr_reclaimed = 0; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - int lumpy_reclaim = 0; + + pagevec_init(&pvec, 1); /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. + * Put back any unfreeable pages. + */ + spin_lock(&zone->lru_lock); + while (!list_empty(page_list)) { + int lru; + page = lru_to_page(page_list); + VM_BUG_ON(PageLRU(page)); + list_del(&page->lru); + if (unlikely(!page_evictable(page, NULL))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); + if (is_active_lru(lru)) { + int file = is_file_lru(lru); + reclaim_stat->recent_rotated[file]++; + } + if (!pagevec_add(&pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); +} + +static noinline_for_stack void update_isolated_counts(struct zone *zone, + struct scan_control *sc, + unsigned long *nr_anon, + unsigned long *nr_file, + struct list_head *isolated_list) +{ + unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + + nr_active = clear_active_flags(isolated_list, count); + __count_vm_events(PGDEACTIVATE, nr_active); + + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); + + *nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + *nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, *nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, *nr_file); + + reclaim_stat->recent_scanned[0] += *nr_anon; + reclaim_stat->recent_scanned[1] += *nr_file; +} + +/* + * Returns true if the caller should wait to clean dirty/writeback pages. + * + * If we are direct reclaiming for contiguous pages and we do not reclaim + * everything in the list, try again and wait for writeback IO to complete. + * This will stall high-order allocations noticeably. Only do that when really + * need to free the pages under high memory pressure. + */ +static inline bool should_reclaim_stall(unsigned long nr_taken, + unsigned long nr_freed, + int priority, + struct scan_control *sc) +{ + int lumpy_stall_priority; + + /* kswapd should not stall on sync IO */ + if (current_is_kswapd()) + return false; + + /* Only stall on lumpy reclaim */ + if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + return false; + + /* If we have relaimed everything on the isolated list, no stall */ + if (nr_freed == nr_taken) + return false; + + /* + * For high-order allocations, there are two stall thresholds. + * High-cost allocations stall immediately where as lower + * order allocations such as stacks require the scanning + * priority to be much higher before stalling. */ if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - lumpy_reclaim = 1; - else if (sc->order && priority < DEF_PRIORITY - 2) - lumpy_reclaim = 1; + lumpy_stall_priority = DEF_PRIORITY; + else + lumpy_stall_priority = DEF_PRIORITY / 3; - pagevec_init(&pvec, 1); + return priority <= lumpy_stall_priority; +} - lru_add_drain(); - spin_lock_irq(&zone->lru_lock); - do { - struct page *page; - unsigned long nr_taken; - unsigned long nr_scan; - unsigned long nr_freed; - unsigned long nr_active; - unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; - - nr_taken = sc->isolate_pages(sc->swap_cluster_max, - &page_list, &nr_scan, sc->order, mode, - zone, sc->mem_cgroup, 0, file); - nr_active = clear_active_flags(&page_list, count); - __count_vm_events(PGDEACTIVATE, nr_active); - - __mod_zone_page_state(zone, NR_ACTIVE_FILE, - -count[LRU_ACTIVE_FILE]); - __mod_zone_page_state(zone, NR_INACTIVE_FILE, - -count[LRU_INACTIVE_FILE]); - __mod_zone_page_state(zone, NR_ACTIVE_ANON, - -count[LRU_ACTIVE_ANON]); - __mod_zone_page_state(zone, NR_INACTIVE_ANON, - -count[LRU_INACTIVE_ANON]); - - if (scanning_global_lru(sc)) - zone->pages_scanned += nr_scan; - - reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; - reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; - reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; +/* + * shrink_inactive_list() is a helper for shrink_zone(). It returns the number + * of reclaimed pages + */ +static noinline_for_stack unsigned long +shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, + struct scan_control *sc, int priority, int file) +{ + LIST_HEAD(page_list); + unsigned long nr_scanned; + unsigned long nr_reclaimed = 0; + unsigned long nr_taken; + unsigned long nr_anon; + unsigned long nr_file; - spin_unlock_irq(&zone->lru_lock); + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); - nr_scanned += nr_scan; - nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC); + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + set_lumpy_reclaim_mode(priority, sc, false); + lru_add_drain(); + spin_lock_irq(&zone->lru_lock); + + if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(nr_to_scan, + &page_list, &nr_scanned, sc->order, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, + zone, 0, file); + zone->pages_scanned += nr_scanned; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scanned); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scanned); + } else { + nr_taken = mem_cgroup_isolate_pages(nr_to_scan, + &page_list, &nr_scanned, sc->order, + sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + ISOLATE_INACTIVE : ISOLATE_BOTH, + zone, sc->mem_cgroup, + 0, file); /* - * If we are direct reclaiming for contiguous pages and we do - * not reclaim everything in the list, try again and wait - * for IO to complete. This will stall high-order allocations - * but that should be acceptable to the caller + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. */ - if (nr_freed < nr_taken && !current_is_kswapd() && - lumpy_reclaim) { - congestion_wait(WRITE, HZ/10); + } - /* - * The attempt at page out may have made some - * of the pages active, mark them inactive again. - */ - nr_active = clear_active_flags(&page_list, count); - count_vm_events(PGDEACTIVATE, nr_active); + if (nr_taken == 0) { + spin_unlock_irq(&zone->lru_lock); + return 0; + } - nr_freed += shrink_page_list(&page_list, sc, - PAGEOUT_IO_SYNC); - } + update_isolated_counts(zone, sc, &nr_anon, &nr_file, &page_list); - nr_reclaimed += nr_freed; - local_irq_disable(); - if (current_is_kswapd()) { - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); - __count_vm_events(KSWAPD_STEAL, nr_freed); - } else if (scanning_global_lru(sc)) - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + spin_unlock_irq(&zone->lru_lock); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); + nr_reclaimed = shrink_page_list(&page_list, zone, sc); - if (nr_taken == 0) - goto done; + /* Check if we should syncronously wait for writeback */ + if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { + set_lumpy_reclaim_mode(priority, sc, true); + nr_reclaimed += shrink_page_list(&page_list, zone, sc); + } - spin_lock(&zone->lru_lock); - /* - * Put back any unfreeable pages. - */ - while (!list_empty(&page_list)) { - int lru; - page = lru_to_page(&page_list); - VM_BUG_ON(PageLRU(page)); - list_del(&page->lru); - if (unlikely(!page_evictable(page, NULL))) { - spin_unlock_irq(&zone->lru_lock); - putback_lru_page(page); - spin_lock_irq(&zone->lru_lock); - continue; - } - SetPageLRU(page); - lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); - if (PageActive(page)) { - int file = !!page_is_file_cache(page); - reclaim_stat->recent_rotated[file]++; - } - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - } while (nr_scanned < max_scan); - spin_unlock(&zone->lru_lock); -done: - local_irq_enable(); - pagevec_release(&pvec); - return nr_reclaimed; -} + local_irq_disable(); + if (current_is_kswapd()) + __count_vm_events(KSWAPD_STEAL, nr_reclaimed); + __count_zone_vm_events(PGSTEAL, zone, nr_reclaimed); -/* - * We are about to scan this zone at a certain priority level. If that priority - * level is smaller (ie: more urgent) than the previous priority, then note - * that priority level within the zone. This is done so that when the next - * process comes in to scan this zone, it will immediately start out at this - * priority level rather than having to build up its own scanning priority. - * Here, this priority affects only the reclaim-mapped threshold. - */ -static inline void note_zone_scanning_priority(struct zone *zone, int priority) -{ - if (priority < zone->prev_priority) - zone->prev_priority = priority; + putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list); + + trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, + zone_idx(zone), + nr_scanned, nr_reclaimed, + priority, + trace_shrink_flags(file, sc->lumpy_reclaim_mode)); + return nr_reclaimed; } /* @@ -1210,15 +1460,10 @@ static void move_active_pages_to_lru(struct zone *zone, while (!list_empty(list)) { page = lru_to_page(list); - prefetchw_prev_lru_page(page, list, flags); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - if (!is_active_lru(lru)) - ClearPageActive(page); /* we are de-activating */ - list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); pgmoved++; @@ -1239,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { - unsigned long pgmoved; + unsigned long nr_taken; unsigned long pgscanned; unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ @@ -1247,29 +1492,37 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, LIST_HEAD(l_inactive); struct page *page; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_rotated = 0; lru_add_drain(); spin_lock_irq(&zone->lru_lock); - pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, - ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1, file); - /* - * zone->pages_scanned is used for detect zone's oom - * mem_cgroup remembers nr_scan by itself. - */ if (scanning_global_lru(sc)) { + nr_taken = isolate_pages_global(nr_pages, &l_hold, + &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + 1, file); zone->pages_scanned += pgscanned; + } else { + nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold, + &pgscanned, sc->order, + ISOLATE_ACTIVE, zone, + sc->mem_cgroup, 1, file); + /* + * mem_cgroup_isolate_pages() keeps track of + * scanned pages on its own. + */ } - reclaim_stat->recent_scanned[!!file] += pgmoved; + + reclaim_stat->recent_scanned[file] += nr_taken; __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1280,10 +1533,8 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, continue; } - /* page_referenced clears PageReferenced */ - if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - pgmoved++; + if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + nr_rotated++; /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. So @@ -1293,12 +1544,13 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * IO, plus JVM can create lots of anon VM_EXEC pages, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { list_add(&page->lru, &l_active); continue; } } + ClearPageActive(page); /* we are de-activating */ list_add(&page->lru, &l_inactive); } @@ -1312,16 +1564,17 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * helps balance scan pressure between file and anonymous pages in * get_scan_ratio. */ - reclaim_stat->recent_rotated[!!file] += pgmoved; + reclaim_stat->recent_rotated[file] += nr_rotated; move_active_pages_to_lru(zone, &l_active, LRU_ACTIVE + file * LRU_FILE); move_active_pages_to_lru(zone, &l_inactive, LRU_BASE + file * LRU_FILE); - + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); } +#ifdef CONFIG_SWAP static int inactive_anon_is_low_global(struct zone *zone) { unsigned long active, inactive; @@ -1347,12 +1600,26 @@ static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) { int low; + /* + * If we don't have swap space, anonymous page deactivation + * is pointless. + */ + if (!total_swap_pages) + return 0; + if (scanning_global_lru(sc)) low = inactive_anon_is_low_global(zone); else low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); return low; } +#else +static inline int inactive_anon_is_low(struct zone *zone, + struct scan_control *sc) +{ + return 0; +} +#endif static int inactive_file_is_low_global(struct zone *zone) { @@ -1390,57 +1657,102 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, + int file) +{ + if (file) + return inactive_file_is_low(zone, sc); + else + return inactive_anon_is_low(zone, sc); +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (is_active_lru(lru)) { + if (inactive_list_is_low(zone, sc, file)) + shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } - if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); - return 0; - } return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } /* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= SWAP_CLUSTER_MAX) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} + +/* * Determine how aggressively the anon and file LRU lists should be * scanned. The relative value of each set of LRU lists is determined * by looking at the fraction of the pages scanned we did rotate back * onto the active list instead of evict. * - * percent[0] specifies how much pressure to put on ram/swap backed - * memory, while percent[1] determines pressure on the file LRUs. + * nr[0] = anon pages to scan; nr[1] = file pages to scan */ -static void get_scan_ratio(struct zone *zone, struct scan_control *sc, - unsigned long *percent) +static void get_scan_count(struct zone *zone, struct scan_control *sc, + unsigned long *nr, int priority) { unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + u64 fraction[2], denominator; + enum lru_list l; + int noswap = 0; + + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + fraction[0] = 0; + fraction[1] = 1; + denominator = 1; + goto out; + } - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); /* If we have very few page cache pages, force-scan anon pages. */ if (unlikely(file + free <= high_wmark_pages(zone))) { - percent[0] = 100; - percent[1] = 0; - return; + fraction[0] = 1; + fraction[1] = 0; + denominator = 1; + goto out; } } /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* * OK, so we have swap space and a fair amount of page cache * pages. We use the recently rotated / recently scanned * ratios to determine how valuable each cache is. @@ -1451,28 +1763,18 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ + spin_lock_irq(&zone->lru_lock); if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[0] /= 2; reclaim_stat->recent_rotated[0] /= 2; - spin_unlock_irq(&zone->lru_lock); } if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { - spin_lock_irq(&zone->lru_lock); reclaim_stat->recent_scanned[1] /= 2; reclaim_stat->recent_rotated[1] /= 2; - spin_unlock_irq(&zone->lru_lock); } /* - * With swappiness at 100, anonymous and file have the same priority. - * This scanning priority is essentially the inverse of IO cost. - */ - anon_prio = sc->swappiness; - file_prio = 200 - sc->swappiness; - - /* * The amount of pressure on anon vs file pages is inversely * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. @@ -1482,31 +1784,24 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); fp /= reclaim_stat->recent_rotated[1] + 1; + spin_unlock_irq(&zone->lru_lock); - /* Normalize to percentages */ - percent[0] = 100 * ap / (ap + fp + 1); - percent[1] = 100 - percent[0]; -} - -/* - * Smallish @nr_to_scan's are deposited in @nr_saved_scan, - * until we collected @swap_cluster_max pages to scan. - */ -static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan, - unsigned long swap_cluster_max) -{ - unsigned long nr; - - *nr_saved_scan += nr_to_scan; - nr = *nr_saved_scan; - - if (nr >= swap_cluster_max) - *nr_saved_scan = 0; - else - nr = 0; + fraction[0] = ap; + fraction[1] = fp; + denominator = ap + fp + 1; +out: + for_each_evictable_lru(l) { + int file = is_file_lru(l); + unsigned long scan; - return nr; + scan = zone_nr_lru_pages(zone, sc, l); + if (priority || noswap) { + scan >>= priority; + scan = div64_u64(scan * fraction[file], denominator); + } + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l]); + } } /* @@ -1517,42 +1812,18 @@ static void shrink_zone(int priority, struct zone *zone, { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; - unsigned long swap_cluster_max = sc->swap_cluster_max; - int noswap = 0; - - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - noswap = 1; - percent[0] = 0; - percent[1] = 100; - } else - get_scan_ratio(zone, sc, percent); + unsigned long nr_to_reclaim = sc->nr_to_reclaim; - for_each_evictable_lru(l) { - int file = is_file_lru(l); - unsigned long scan; - - scan = zone_nr_pages(zone, sc, l); - if (priority || noswap) { - scan >>= priority; - scan = (scan * percent[file]) / 100; - } - if (scanning_global_lru(sc)) - nr[l] = nr_scan_try_batch(scan, - &zone->lru[l].nr_saved_scan, - swap_cluster_max); - else - nr[l] = scan; - } + get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], swap_cluster_max); + nr_to_scan = min_t(unsigned long, + nr[l], SWAP_CLUSTER_MAX); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, @@ -1567,8 +1838,7 @@ static void shrink_zone(int priority, struct zone *zone, * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed > swap_cluster_max && - priority < DEF_PRIORITY && !current_is_kswapd()) + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } @@ -1578,7 +1848,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) + if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); @@ -1603,13 +1873,11 @@ static void shrink_zone(int priority, struct zone *zone, static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { - enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); struct zoneref *z; struct zone *zone; - sc->all_unreclaimable = 1; - for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, - sc->nodemask) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { if (!populated_zone(zone)) continue; /* @@ -1619,26 +1887,46 @@ static void shrink_zones(int priority, struct zonelist *zonelist, if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - note_zone_scanning_priority(zone, priority); - - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - sc->all_unreclaimable = 0; - } else { - /* - * Ignore cpuset limitation here. We just want to reduce - * # of used pages by us regardless of memory shortage. - */ - sc->all_unreclaimable = 0; - mem_cgroup_note_reclaim_priority(sc->mem_cgroup, - priority); } shrink_zone(priority, zone, sc); } } +static bool zone_reclaimable(struct zone *zone) +{ + return zone->pages_scanned < zone_reclaimable_pages(zone) * 6; +} + +/* + * As hibernation is going on, kswapd is freezed so that it can't mark + * the zone into all_unreclaimable. It can't handle OOM during hibernation. + * So let's check zone's unreclaimable in direct reclaim as well as kswapd. + */ +static bool all_unreclaimable(struct zonelist *zonelist, + struct scan_control *sc) +{ + struct zoneref *z; + struct zone *zone; + bool all_unreclaimable = true; + + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_zone(sc->gfp_mask), sc->nodemask) { + if (!populated_zone(zone)) + continue; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + if (zone_reclaimable(zone)) { + all_unreclaimable = false; + break; + } + } + + return all_unreclaimable; +} + /* * This is the main entry point to direct page reclaim. * @@ -1647,10 +1935,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed @@ -1659,30 +1947,17 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct scan_control *sc) { int priority; - unsigned long ret = 0; unsigned long total_scanned = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - unsigned long lru_pages = 0; struct zoneref *z; struct zone *zone; - enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + unsigned long writeback_threshold; + get_mems_allowed(); delayacct_freepages_start(); if (scanning_global_lru(sc)) count_vm_event(ALLOCSTALL); - /* - * mem_cgroup will not do shrink_slab. - */ - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - lru_pages += zone_lru_pages(zone); - } - } for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc->nr_scanned = 0; @@ -1694,6 +1969,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * over limit cgroups */ if (scanning_global_lru(sc)) { + unsigned long lru_pages = 0; + for_each_zone_zonelist(zone, z, zonelist, + gfp_zone(sc->gfp_mask)) { + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + + lru_pages += zone_reclaimable_pages(zone); + } + shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -1701,10 +1985,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } } total_scanned += sc->nr_scanned; - if (sc->nr_reclaimed >= sc->swap_cluster_max) { - ret = sc->nr_reclaimed; + if (sc->nr_reclaimed >= sc->nr_to_reclaim) goto out; - } /* * Try to write back as many pages as we just scanned. This @@ -1713,92 +1995,162 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc->swap_cluster_max + - sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; + if (total_scanned > writeback_threshold) { + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); - } - /* top priority shrink_zones still had more to do? don't OOM, then */ - if (!sc->all_unreclaimable && scanning_global_lru(sc)) - ret = sc->nr_reclaimed; -out: - /* - * Now that we've scanned all the zones at this priority level, note - * that level within the zone so that the next thread which performs - * scanning of this zone will immediately start out at this priority - * level. This affects only the decision whether or not to bring - * mapped pages onto the inactive list. - */ - if (priority < 0) - priority = 0; + if (!sc->hibernation_mode && sc->nr_scanned && + priority < DEF_PRIORITY - 2) { + struct zone *preferred_zone; - if (scanning_global_lru(sc)) { - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { - - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - zone->prev_priority = priority; + first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), + NULL, &preferred_zone); + wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } - } else - mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); + } +out: delayacct_freepages_end(); + put_mems_allowed(); - return ret; + if (sc->nr_reclaimed) + return sc->nr_reclaimed; + + /* top priority shrink_zones still had more to do? don't OOM, then */ + if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc)) + return 1; + + return 0; } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { + unsigned long nr_reclaimed; struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, .nodemask = nodemask, }; - return do_try_to_free_pages(zonelist, &sc); + trace_mm_vmscan_direct_reclaim_begin(order, + sc.may_writepage, + gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_direct_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #ifdef CONFIG_CGROUP_MEM_RES_CTLR +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + unsigned int swappiness, + struct zone *zone) +{ + struct scan_control sc = { + .nr_to_reclaim = SWAP_CLUSTER_MAX, + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, + .swappiness = swappiness, + .order = 0, + .mem_cgroup = mem, + }; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + + trace_mm_vmscan_memcg_softlimit_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + + /* + * NOTE: Although we can get the priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * if we don't reclaim here, the shrink_zone from balance_pgdat + * will pick up pages from other mem cgroup's as well. We hack + * the priority and make it zero. + */ + shrink_zone(0, zone, &sc); + + trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); + + return sc.nr_reclaimed; +} + unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, unsigned int swappiness) { + struct zonelist *zonelist; + unsigned long nr_reclaimed; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, - .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; - struct zonelist *zonelist; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; - return do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_begin(0, + sc.may_writepage, + sc.gfp_mask); + + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); + + trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed); + + return nr_reclaimed; } #endif +/* is kswapd sleeping prematurely? */ +static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +{ + int i; + + /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ + if (remaining) + return 1; + + /* If after HZ/10, a zone is below the high mark, it's premature */ + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable) + continue; + + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + 0, 0)) + return 1; + } + + return 0; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). @@ -1831,31 +2183,25 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .gfp_mask = GFP_KERNEL, .may_unmap = 1, .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, + /* + * kswapd doesn't want to be bailed out while reclaim. because + * we want to put equal scanning pressure on each zone. + */ + .nr_to_reclaim = ULONG_MAX, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, - .isolate_pages = isolate_pages_global, }; - /* - * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to - * free_pages == high_wmark_pages(zone). - */ - int temp_priority[MAX_NR_ZONES]; - loop_again: total_scanned = 0; sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); - for (i = 0; i < pgdat->nr_zones; i++) - temp_priority[i] = DEF_PRIORITY; - for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + int has_under_min_watermark_zone = 0; /* The swap token gets in the way of swapout... */ if (!priority) @@ -1873,8 +2219,7 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* @@ -1897,7 +2242,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } /* @@ -1916,16 +2261,17 @@ loop_again: if (!populated_zone(zone)) continue; - if (zone_is_all_unreclaimable(zone) && - priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, - high_wmark_pages(zone), end_zone, 0)) - all_zones_ok = 0; - temp_priority[i] = priority; sc.nr_scanned = 0; - note_zone_scanning_priority(zone, priority); + + /* + * Call soft limit reclaim before calling shrink_zone. + * For now we ignore the return value + */ + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask); + /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. @@ -1938,12 +2284,10 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; - if (zone_is_all_unreclaimable(zone)) + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && zone->pages_scanned >= - (zone_lru_pages(zone) * 6)) - zone_set_flag(zone, - ZONE_ALL_UNRECLAIMABLE); + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and * the reclaim ratio is low, start doing writepage @@ -1952,6 +2296,29 @@ loop_again: if (total_scanned > SWAP_CLUSTER_MAX * 2 && total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) { + all_zones_ok = 0; + /* + * We are still under min water mark. This + * means that we have a GFP_ATOMIC allocation + * failure risk. Hurry up! + */ + if (!zone_watermark_ok(zone, order, + min_wmark_pages(zone), end_zone, 0)) + has_under_min_watermark_zone = 1; + } else { + /* + * If a zone reaches its high watermark, + * consider it to be no longer congested. It's + * possible there are dirty pages backed by + * congested BDIs but as pressure is relieved, + * spectulatively avoid congestion waits + */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } if (all_zones_ok) break; /* kswapd: all done */ @@ -1959,8 +2326,12 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + if (total_scanned && (priority < DEF_PRIORITY - 2)) { + if (has_under_min_watermark_zone) + count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); + else + congestion_wait(BLK_RW_ASYNC, HZ/10); + } /* * We do this so kswapd doesn't build up large priorities for @@ -1972,16 +2343,6 @@ loop_again: break; } out: - /* - * Note within each zone the priority level at which this zone was - * brought into a happy state. So that the next thread which scans this - * zone will start out at that priority level. - */ - for (i = 0; i < pgdat->nr_zones; i++) { - struct zone *zone = pgdat->node_zones + i; - - zone->prev_priority = temp_priority[i]; - } if (!all_zones_ok) { cond_resched(); @@ -2058,6 +2419,7 @@ static int kswapd(void *p) order = 0; for ( ; ; ) { unsigned long new_order; + int ret; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; @@ -2069,17 +2431,46 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current)) - schedule(); + if (!freezing(current) && !kthread_should_stop()) { + long remaining = 0; + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a + * premature sleep. If not, then go fully + * to sleep until explicitly woken up + */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + schedule(); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + } order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); - if (!try_to_freeze()) { - /* We can speed up thawing tasks if we don't call - * balance_pgdat after returning from the refrigerator - */ + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + /* + * We can speed up thawing tasks if we don't call balance_pgdat + * after returning from the refrigerator + */ + if (!ret) { + trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); balance_pgdat(pgdat, order); } } @@ -2101,6 +2492,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; if (!waitqueue_active(&pgdat->kswapd_wait)) @@ -2108,153 +2500,79 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } -unsigned long global_lru_pages(void) -{ - return global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_INACTIVE_FILE); -} - -#ifdef CONFIG_HIBERNATION /* - * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority. - * - * For pass > 3 we also try to shrink the LRU lists that contain a few pages + * The reclaimable count would be mostly accurate. + * The less reclaimable pages may be + * - mlocked pages, which will be moved to unevictable list when encountered + * - mapped pages, which may require several travels to be reclaimed + * - dirty pages, which is not "instantly" reclaimable */ -static void shrink_all_zones(unsigned long nr_pages, int prio, - int pass, struct scan_control *sc) +unsigned long global_reclaimable_pages(void) { - struct zone *zone; - unsigned long nr_reclaimed = 0; + int nr; - for_each_populated_zone(zone) { - enum lru_list l; + nr = global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE); - if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) - continue; + if (nr_swap_pages > 0) + nr += global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON); - for_each_evictable_lru(l) { - enum zone_stat_item ls = NR_LRU_BASE + l; - unsigned long lru_pages = zone_page_state(zone, ls); + return nr; +} - /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && (l == LRU_ACTIVE_ANON || - l == LRU_ACTIVE_FILE)) - continue; +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; - zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { - unsigned long nr_to_scan; - - zone->lru[l].nr_saved_scan = 0; - nr_to_scan = min(nr_pages, lru_pages); - nr_reclaimed += shrink_list(l, nr_to_scan, zone, - sc, prio); - if (nr_reclaimed >= nr_pages) { - sc->nr_reclaimed += nr_reclaimed; - return; - } - } - } - } - sc->nr_reclaimed += nr_reclaimed; + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; } +#ifdef CONFIG_HIBERNATION /* - * Try to free `nr_pages' of memory, system-wide, and return the number of + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_all_memory(unsigned long nr_pages) +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - unsigned long lru_pages, nr_slab; - int pass; struct reclaim_state reclaim_state; struct scan_control sc = { - .gfp_mask = GFP_KERNEL, - .may_unmap = 0, + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .may_swap = 1, + .may_unmap = 1, .may_writepage = 1, - .isolate_pages = isolate_pages_global, - .nr_reclaimed = 0, + .nr_to_reclaim = nr_to_reclaim, + .hibernation_mode = 1, + .swappiness = vm_swappiness, + .order = 0, }; + struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct task_struct *p = current; + unsigned long nr_reclaimed; - current->reclaim_state = &reclaim_state; - - lru_pages = global_lru_pages(); - nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); - /* If slab caches are huge, it's better to hit them first */ - while (nr_slab >= lru_pages) { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); - if (!reclaim_state.reclaimed_slab) - break; - - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - nr_slab -= reclaim_state.reclaimed_slab; - } - - /* - * We try to shrink LRUs in 5 passes: - * 0 = Reclaim from inactive_list only - * 1 = Reclaim from active list but don't reclaim mapped - * 2 = 2nd pass of type 1 - * 3 = Reclaim mapped (normal reclaim) - * 4 = 2nd pass of type 3 - */ - for (pass = 0; pass < 5; pass++) { - int prio; - - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) - sc.may_unmap = 1; - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; - - sc.nr_scanned = 0; - sc.swap_cluster_max = nr_to_scan; - shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (sc.nr_reclaimed >= nr_pages) - goto out; - - reclaim_state.reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); - } - } - - /* - * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be - * something in slab caches - */ - if (!sc.nr_reclaimed) { - do { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - } while (sc.nr_reclaimed < nr_pages && - reclaim_state.reclaimed_slab > 0); - } + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(sc.gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); -out: - current->reclaim_state = NULL; + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; - return sc.nr_reclaimed; + return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ @@ -2304,6 +2622,17 @@ int kswapd_run(int nid) return ret; } +/* + * Called by memory hotplug when all memory in a node is offlined. + */ +void kswapd_stop(int nid) +{ + struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + + if (kswapd) + kthread_stop(kswapd); +} + static int __init kswapd_init(void) { int nid; @@ -2406,16 +2735,14 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), + .nr_to_reclaim = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order, - .isolate_pages = isolate_pages_global, }; - unsigned long slab_reclaimable; + unsigned long nr_slab_pages0, nr_slab_pages1; - disable_swap_token(); cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -2423,6 +2750,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * and RECLAIM_SWAP. */ p->flags |= PF_MEMALLOC | PF_SWAPWRITE; + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -2433,14 +2761,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ priority = ZONE_RECLAIM_PRIORITY; do { - note_zone_scanning_priority(zone, priority); shrink_zone(priority, zone, &sc); priority--; } while (priority >= 0 && sc.nr_reclaimed < nr_pages); } - slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); - if (slab_reclaimable > zone->min_slab_pages) { + nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages0 > zone->min_slab_pages) { /* * shrink_slab() does not currently allow us to determine how * many pages were freed in this zone. So we take the current @@ -2451,21 +2778,32 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Note that shrink_slab will free memory on all zones and may * take a long time. */ - while (shrink_slab(sc.nr_scanned, gfp_mask, order) && - zone_page_state(zone, NR_SLAB_RECLAIMABLE) > - slab_reclaimable - nr_pages) - ; + for (;;) { + unsigned long lru_pages = zone_reclaimable_pages(zone); + + /* No reclaimable slab or very low memory pressure */ + if (!shrink_slab(sc.nr_scanned, gfp_mask, lru_pages)) + break; + + /* Freed enough memory */ + nr_slab_pages1 = zone_page_state(zone, + NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 + nr_pages <= nr_slab_pages0) + break; + } /* * Update nr_reclaimed by the number of slab pages we * reclaimed from this zone. */ - sc.nr_reclaimed += slab_reclaimable - - zone_page_state(zone, NR_SLAB_RECLAIMABLE); + nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE); + if (nr_slab_pages1 < nr_slab_pages0) + sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1; } p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); + lockdep_clear_current_reclaim_state(); return sc.nr_reclaimed >= nr_pages; } @@ -2488,7 +2826,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) return ZONE_RECLAIM_FULL; - if (zone_is_all_unreclaimable(zone)) + if (zone->all_unreclaimable) return ZONE_RECLAIM_FULL; /* @@ -2564,7 +2902,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { - enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); @@ -2707,10 +3045,10 @@ static void scan_all_zones_unevictable_pages(void) unsigned long scan_unevictable_pages; int scan_unevictable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write && *(unsigned long *)table->data) scan_all_zones_unevictable_pages(); @@ -2719,6 +3057,7 @@ int scan_unevictable_handler(struct ctl_table *table, int write, return 0; } +#ifdef CONFIG_NUMA /* * per node 'scan_unevictable_pages' attribute. On demand re-scan of * a specified node's per zone unevictable lists for evictable pages. @@ -2765,4 +3104,4 @@ void scan_unevictable_unregister_node(struct node *node) { sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } - +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 138bed5..312d728 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -12,22 +12,26 @@ #include <linux/mm.h> #include <linux/err.h> #include <linux/module.h> +#include <linux/slab.h> #include <linux/cpu.h> #include <linux/vmstat.h> #include <linux/sched.h> +#include <linux/math64.h> +#include <linux/writeback.h> +#include <linux/compaction.h> #ifdef CONFIG_VM_EVENT_COUNTERS DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}}; EXPORT_PER_CPU_SYMBOL(vm_event_states); -static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) +static void sum_vm_events(unsigned long *ret) { int cpu; int i; memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long)); - for_each_cpu(cpu, cpumask) { + for_each_online_cpu(cpu) { struct vm_event_state *this = &per_cpu(vm_event_states, cpu); for (i = 0; i < NR_VM_EVENT_ITEMS; i++) @@ -43,7 +47,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask) void all_vm_events(unsigned long *ret) { get_online_cpus(); - sum_vm_events(ret, cpu_online_mask); + sum_vm_events(ret); put_online_cpus(); } EXPORT_SYMBOL_GPL(all_vm_events); @@ -136,10 +140,24 @@ static void refresh_zone_stat_thresholds(void) int threshold; for_each_populated_zone(zone) { + unsigned long max_drift, tolerate_drift; + threshold = calculate_threshold(zone); for_each_online_cpu(cpu) - zone_pcp(zone, cpu)->stat_threshold = threshold; + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + + /* + * Only set percpu_drift_mark if there is a danger that + * NR_FREE_PAGES reports the low watermark is ok when in fact + * the min watermark could be breached by an allocation + */ + tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone); + max_drift = num_online_cpus() * threshold; + if (max_drift > tolerate_drift) + zone->percpu_drift_mark = high_wmark_pages(zone) + + max_drift; } } @@ -149,35 +167,24 @@ static void refresh_zone_stat_thresholds(void) void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; long x; + long t; - x = delta + *p; + x = delta + __this_cpu_read(*p); - if (unlikely(x > pcp->stat_threshold || x < -pcp->stat_threshold)) { + t = __this_cpu_read(pcp->stat_threshold); + + if (unlikely(x > t || x < -t)) { zone_page_state_add(x, zone, item); x = 0; } - *p = x; + __this_cpu_write(*p, x); } EXPORT_SYMBOL(__mod_zone_page_state); /* - * For an unknown interrupt state - */ -void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, - int delta) -{ - unsigned long flags; - - local_irq_save(flags); - __mod_zone_page_state(zone, item, delta); - local_irq_restore(flags); -} -EXPORT_SYMBOL(mod_zone_page_state); - -/* * Optimized increment and decrement functions. * * These are only for a single page and therefore can take a struct page * @@ -202,16 +209,17 @@ EXPORT_SYMBOL(mod_zone_page_state); */ void __inc_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; - - (*p)++; + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; - if (unlikely(*p > pcp->stat_threshold)) { - int overstep = pcp->stat_threshold / 2; + v = __this_cpu_inc_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v > t)) { + s8 overstep = t >> 1; - zone_page_state_add(*p + overstep, zone, item); - *p = -overstep; + zone_page_state_add(v + overstep, zone, item); + __this_cpu_write(*p, -overstep); } } @@ -223,16 +231,17 @@ EXPORT_SYMBOL(__inc_zone_page_state); void __dec_zone_state(struct zone *zone, enum zone_stat_item item) { - struct per_cpu_pageset *pcp = zone_pcp(zone, smp_processor_id()); - s8 *p = pcp->vm_stat_diff + item; - - (*p)--; + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + s8 v, t; - if (unlikely(*p < - pcp->stat_threshold)) { - int overstep = pcp->stat_threshold / 2; + v = __this_cpu_dec_return(*p); + t = __this_cpu_read(pcp->stat_threshold); + if (unlikely(v < - t)) { + s8 overstep = t >> 1; - zone_page_state_add(*p - overstep, zone, item); - *p = overstep; + zone_page_state_add(v - overstep, zone, item); + __this_cpu_write(*p, overstep); } } @@ -242,6 +251,92 @@ void __dec_zone_page_state(struct page *page, enum zone_stat_item item) } EXPORT_SYMBOL(__dec_zone_page_state); +#ifdef CONFIG_CMPXCHG_LOCAL +/* + * If we have cmpxchg_local support then we do not need to incur the overhead + * that comes with local_irq_save/restore if we use this_cpu_cmpxchg. + * + * mod_state() modifies the zone counter state through atomic per cpu + * operations. + * + * Overstep mode specifies how overstep should handled: + * 0 No overstepping + * 1 Overstepping half of threshold + * -1 Overstepping minus half of threshold +*/ +static inline void mod_state(struct zone *zone, + enum zone_stat_item item, int delta, int overstep_mode) +{ + struct per_cpu_pageset __percpu *pcp = zone->pageset; + s8 __percpu *p = pcp->vm_stat_diff + item; + long o, n, t, z; + + do { + z = 0; /* overflow to zone counters */ + + /* + * The fetching of the stat_threshold is racy. We may apply + * a counter threshold to the wrong the cpu if we get + * rescheduled while executing here. However, the following + * will apply the threshold again and therefore bring the + * counter under the threshold. + */ + t = this_cpu_read(pcp->stat_threshold); + + o = this_cpu_read(*p); + n = delta + o; + + if (n > t || n < -t) { + int os = overstep_mode * (t >> 1) ; + + /* Overflow must be added to zone counters */ + z = n + os; + n = -os; + } + } while (this_cpu_cmpxchg(*p, o, n) != o); + + if (z) + zone_page_state_add(z, zone, item); +} + +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + mod_state(zone, item, delta, 0); +} +EXPORT_SYMBOL(mod_zone_page_state); + +void inc_zone_state(struct zone *zone, enum zone_stat_item item) +{ + mod_state(zone, item, 1, 1); +} + +void inc_zone_page_state(struct page *page, enum zone_stat_item item) +{ + mod_state(page_zone(page), item, 1, 1); +} +EXPORT_SYMBOL(inc_zone_page_state); + +void dec_zone_page_state(struct page *page, enum zone_stat_item item) +{ + mod_state(page_zone(page), item, -1, -1); +} +EXPORT_SYMBOL(dec_zone_page_state); +#else +/* + * Use interrupt disable to serialize counter updates + */ +void mod_zone_page_state(struct zone *zone, enum zone_stat_item item, + int delta) +{ + unsigned long flags; + + local_irq_save(flags); + __mod_zone_page_state(zone, item, delta); + local_irq_restore(flags); +} +EXPORT_SYMBOL(mod_zone_page_state); + void inc_zone_state(struct zone *zone, enum zone_stat_item item) { unsigned long flags; @@ -272,6 +367,7 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item) local_irq_restore(flags); } EXPORT_SYMBOL(dec_zone_page_state); +#endif /* * Update the zone counters for one cpu. @@ -300,7 +396,7 @@ void refresh_cpu_vm_stats(int cpu) for_each_populated_zone(zone) { struct per_cpu_pageset *p; - p = zone_pcp(zone, cpu); + p = per_cpu_ptr(zone->pageset, cpu); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) if (p->vm_stat_diff[i]) { @@ -376,7 +472,87 @@ void zone_statistics(struct zone *preferred_zone, struct zone *z) } #endif -#ifdef CONFIG_PROC_FS +#ifdef CONFIG_COMPACTION + +struct contig_page_info { + unsigned long free_pages; + unsigned long free_blocks_total; + unsigned long free_blocks_suitable; +}; + +/* + * Calculate the number of free pages in a zone, how many contiguous + * pages are free and how many are large enough to satisfy an allocation of + * the target size. Note that this function makes no attempt to estimate + * how many suitable free blocks there *might* be if MOVABLE pages were + * migrated. Calculating that is possible, but expensive and can be + * figured out from userspace + */ +static void fill_contig_page_info(struct zone *zone, + unsigned int suitable_order, + struct contig_page_info *info) +{ + unsigned int order; + + info->free_pages = 0; + info->free_blocks_total = 0; + info->free_blocks_suitable = 0; + + for (order = 0; order < MAX_ORDER; order++) { + unsigned long blocks; + + /* Count number of free blocks */ + blocks = zone->free_area[order].nr_free; + info->free_blocks_total += blocks; + + /* Count free base pages */ + info->free_pages += blocks << order; + + /* Count the suitable free blocks */ + if (order >= suitable_order) + info->free_blocks_suitable += blocks << + (order - suitable_order); + } +} + +/* + * A fragmentation index only makes sense if an allocation of a requested + * size would fail. If that is true, the fragmentation index indicates + * whether external fragmentation or a lack of memory was the problem. + * The value can be used to determine if page reclaim or compaction + * should be used + */ +static int __fragmentation_index(unsigned int order, struct contig_page_info *info) +{ + unsigned long requested = 1UL << order; + + if (!info->free_blocks_total) + return 0; + + /* Fragmentation index only makes sense when a request would fail */ + if (info->free_blocks_suitable) + return -1000; + + /* + * Index is between 0 and 1 so return within 3 decimal places + * + * 0 => allocation would fail due to lack of memory + * 1 => allocation would fail due to fragmentation + */ + return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total); +} + +/* Same as __fragmentation index but allocs contig_page_info on stack */ +int fragmentation_index(struct zone *zone, unsigned int order) +{ + struct contig_page_info info; + + fill_contig_page_info(zone, order, &info); + return __fragmentation_index(order, &info); +} +#endif + +#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION) #include <linux/proc_fs.h> #include <linux/seq_file.h> @@ -429,7 +605,9 @@ static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat, spin_unlock_irqrestore(&zone->lock, flags); } } +#endif +#ifdef CONFIG_PROC_FS static void frag_show_print(struct seq_file *m, pg_data_t *pgdat, struct zone *zone) { @@ -639,10 +817,16 @@ static const char * const vmstat_text[] = { "nr_slab_reclaimable", "nr_slab_unreclaimable", "nr_page_table_pages", + "nr_kernel_stack", "nr_unstable", "nr_bounce", "nr_vmscan_write", "nr_writeback_temp", + "nr_isolated_anon", + "nr_isolated_file", + "nr_shmem", + "nr_dirtied", + "nr_written", #ifdef CONFIG_NUMA "numa_hit", @@ -652,6 +836,8 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_dirty_threshold", + "nr_dirty_background_threshold", #ifdef CONFIG_VM_EVENT_COUNTERS "pgpgin", @@ -680,10 +866,23 @@ static const char * const vmstat_text[] = { "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", + "kswapd_low_wmark_hit_quickly", + "kswapd_high_wmark_hit_quickly", + "kswapd_skip_congestion_wait", "pageoutrun", "allocstall", "pgrotated", + +#ifdef CONFIG_COMPACTION + "compact_blocks_moved", + "compact_pages_moved", + "compact_pagemigrate_failed", + "compact_stall", + "compact_fail", + "compact_success", +#endif + #ifdef CONFIG_HUGETLB_PAGE "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", @@ -712,7 +911,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_page_state(zone, NR_FREE_PAGES), + zone_nr_free_pages(zone), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), @@ -735,7 +934,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, for_each_online_cpu(i) { struct per_cpu_pageset *pageset; - pageset = zone_pcp(zone, i); + pageset = per_cpu_ptr(zone->pageset, i); seq_printf(m, "\n cpu: %i" "\n count: %i" @@ -752,11 +951,9 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, } seq_printf(m, "\n all_unreclaimable: %u" - "\n prev_priority: %i" "\n start_pfn: %lu" "\n inactive_ratio: %u", - zone_is_all_unreclaimable(zone), - zone->prev_priority, + zone->all_unreclaimable, zone->zone_start_pfn, zone->inactive_ratio); seq_putc(m, '\n'); @@ -792,36 +989,44 @@ static const struct file_operations proc_zoneinfo_file_operations = { .release = seq_release, }; +enum writeback_stat_item { + NR_DIRTY_THRESHOLD, + NR_DIRTY_BG_THRESHOLD, + NR_VM_WRITEBACK_STAT_ITEMS, +}; + static void *vmstat_start(struct seq_file *m, loff_t *pos) { unsigned long *v; -#ifdef CONFIG_VM_EVENT_COUNTERS - unsigned long *e; -#endif - int i; + int i, stat_items_size; if (*pos >= ARRAY_SIZE(vmstat_text)) return NULL; + stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) + + NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long); #ifdef CONFIG_VM_EVENT_COUNTERS - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) - + sizeof(struct vm_event_state), GFP_KERNEL); -#else - v = kmalloc(NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long), - GFP_KERNEL); + stat_items_size += sizeof(struct vm_event_state); #endif + + v = kmalloc(stat_items_size, GFP_KERNEL); m->private = v; if (!v) return ERR_PTR(-ENOMEM); for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) v[i] = global_page_state(i); + v += NR_VM_ZONE_STAT_ITEMS; + + global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD, + v + NR_DIRTY_THRESHOLD); + v += NR_VM_WRITEBACK_STAT_ITEMS; + #ifdef CONFIG_VM_EVENT_COUNTERS - e = v + NR_VM_ZONE_STAT_ITEMS; - all_vm_events(e); - e[PGPGIN] /= 2; /* sectors -> kbytes */ - e[PGPGOUT] /= 2; + all_vm_events(v); + v[PGPGIN] /= 2; /* sectors -> kbytes */ + v[PGPGOUT] /= 2; #endif - return v + *pos; + return (unsigned long *)m->private + *pos; } static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) @@ -880,11 +1085,10 @@ static void vmstat_update(struct work_struct *w) static void __cpuinit start_cpu_timer(int cpu) { - struct delayed_work *vmstat_work = &per_cpu(vmstat_work, cpu); + struct delayed_work *work = &per_cpu(vmstat_work, cpu); - INIT_DELAYED_WORK_DEFERRABLE(vmstat_work, vmstat_update); - schedule_delayed_work_on(cpu, vmstat_work, - __round_jiffies_relative(HZ, cpu)); + INIT_DELAYED_WORK_DEFERRABLE(work, vmstat_update); + schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu)); } /* @@ -900,11 +1104,13 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: + refresh_zone_stat_thresholds(); start_cpu_timer(cpu); + node_set_state(cpu_to_node(cpu), N_CPU); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - cancel_rearming_delayed_work(&per_cpu(vmstat_work, cpu)); + cancel_delayed_work_sync(&per_cpu(vmstat_work, cpu)); per_cpu(vmstat_work, cpu).work.func = NULL; break; case CPU_DOWN_FAILED: @@ -945,3 +1151,162 @@ static int __init setup_vmstat(void) return 0; } module_init(setup_vmstat) + +#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION) +#include <linux/debugfs.h> + +static struct dentry *extfrag_debug_root; + +/* + * Return an index indicating how much of the available free memory is + * unusable for an allocation of the requested size. + */ +static int unusable_free_index(unsigned int order, + struct contig_page_info *info) +{ + /* No free memory is interpreted as all free memory is unusable */ + if (info->free_pages == 0) + return 1000; + + /* + * Index should be a value between 0 and 1. Return a value to 3 + * decimal places. + * + * 0 => no fragmentation + * 1 => high fragmentation + */ + return div_u64((info->free_pages - (info->free_blocks_suitable << order)) * 1000ULL, info->free_pages); + +} + +static void unusable_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = unusable_free_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display unusable free space index + * + * The unusable free space index measures how much of the available free + * memory cannot be used to satisfy an allocation of a given size and is a + * value between 0 and 1. The higher the value, the more of free memory is + * unusable and by implication, the worse the external fragmentation is. This + * can be expressed as a percentage by multiplying by 100. + */ +static int unusable_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + /* check memoryless node */ + if (!node_state(pgdat->node_id, N_HIGH_MEMORY)) + return 0; + + walk_zones_in_node(m, pgdat, unusable_show_print); + + return 0; +} + +static const struct seq_operations unusable_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = unusable_show, +}; + +static int unusable_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &unusable_op); +} + +static const struct file_operations unusable_file_ops = { + .open = unusable_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static void extfrag_show_print(struct seq_file *m, + pg_data_t *pgdat, struct zone *zone) +{ + unsigned int order; + int index; + + /* Alloc on stack as interrupts are disabled for zone walk */ + struct contig_page_info info; + + seq_printf(m, "Node %d, zone %8s ", + pgdat->node_id, + zone->name); + for (order = 0; order < MAX_ORDER; ++order) { + fill_contig_page_info(zone, order, &info); + index = __fragmentation_index(order, &info); + seq_printf(m, "%d.%03d ", index / 1000, index % 1000); + } + + seq_putc(m, '\n'); +} + +/* + * Display fragmentation index for orders that allocations would fail for + */ +static int extfrag_show(struct seq_file *m, void *arg) +{ + pg_data_t *pgdat = (pg_data_t *)arg; + + walk_zones_in_node(m, pgdat, extfrag_show_print); + + return 0; +} + +static const struct seq_operations extfrag_op = { + .start = frag_start, + .next = frag_next, + .stop = frag_stop, + .show = extfrag_show, +}; + +static int extfrag_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &extfrag_op); +} + +static const struct file_operations extfrag_file_ops = { + .open = extfrag_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +static int __init extfrag_debug_init(void) +{ + extfrag_debug_root = debugfs_create_dir("extfrag", NULL); + if (!extfrag_debug_root) + return -ENOMEM; + + if (!debugfs_create_file("unusable_index", 0444, + extfrag_debug_root, NULL, &unusable_file_ops)) + return -ENOMEM; + + if (!debugfs_create_file("extfrag_index", 0444, + extfrag_debug_root, NULL, &extfrag_file_ops)) + return -ENOMEM; + + return 0; +} + +module_init(extfrag_debug_init); +#endif |