summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig1
-rw-r--r--mm/Kconfig.debug9
-rw-r--r--mm/Makefile2
-rw-r--r--mm/debug.c1
-rw-r--r--mm/filemap.c30
-rw-r--r--mm/filemap_xip.c1
-rw-r--r--mm/fremap.c283
-rw-r--r--mm/gup.c6
-rw-r--r--mm/hugetlb.c2
-rw-r--r--mm/interval_tree.c34
-rw-r--r--mm/ksm.c4
-rw-r--r--mm/madvise.c13
-rw-r--r--mm/memcontrol.c210
-rw-r--r--mm/memory.c299
-rw-r--r--mm/migrate.c32
-rw-r--r--mm/mincore.c9
-rw-r--r--mm/mmap.c106
-rw-r--r--mm/mprotect.c2
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/msync.c5
-rw-r--r--mm/nommu.c9
-rw-r--r--mm/page-writeback.c43
-rw-r--r--mm/page_alloc.c90
-rw-r--r--mm/pagewalk.c5
-rw-r--r--mm/rmap.c267
-rw-r--r--mm/shmem.c3
-rw-r--r--mm/slab.h4
-rw-r--r--mm/slab_common.c151
-rw-r--r--mm/slub.c37
-rw-r--r--mm/swap.c4
-rw-r--r--mm/vmscan.c26
-rw-r--r--mm/vmstat.c124
32 files changed, 567 insertions, 1247 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 1d1ae6b..4395b12 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -325,6 +325,7 @@ config VIRT_TO_BUS
config MMU_NOTIFIER
bool
+ select SRCU
config KSM
bool "Enable KSM for page merging"
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 56badfc..957d3da 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
depends on !KMEMCHECK
select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
- select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
-config WANT_PAGE_DEBUG_FLAGS
- bool
-
config PAGE_POISONING
bool
- select WANT_PAGE_DEBUG_FLAGS
-
-config PAGE_GUARD
- bool
- select WANT_PAGE_DEBUG_FLAGS
diff --git a/mm/Makefile b/mm/Makefile
index 4bf586e..3548460 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -3,7 +3,7 @@
#
mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \
+mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o
diff --git a/mm/debug.c b/mm/debug.c
index 0e58f32..d69cb5a 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = {
{VM_ACCOUNT, "account" },
{VM_NORESERVE, "noreserve" },
{VM_HUGETLB, "hugetlb" },
- {VM_NONLINEAR, "nonlinear" },
#if defined(CONFIG_X86)
{VM_PAT, "pat" },
#elif defined(CONFIG_PPC)
diff --git a/mm/filemap.c b/mm/filemap.c
index bd8543c..bf7a271 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1046,8 +1046,7 @@ EXPORT_SYMBOL(find_lock_entry);
* @mapping: the address_space to search
* @offset: the page index
* @fgp_flags: PCG flags
- * @cache_gfp_mask: gfp mask to use for the page cache data page allocation
- * @radix_gfp_mask: gfp mask to use for radix tree node allocation
+ * @gfp_mask: gfp mask to use for the page cache data page allocation
*
* Looks up the page cache slot at @mapping & @offset.
*
@@ -1056,11 +1055,9 @@ EXPORT_SYMBOL(find_lock_entry);
* FGP_ACCESSED: the page will be marked accessed
* FGP_LOCK: Page is return locked
* FGP_CREAT: If page is not present then a new page is allocated using
- * @cache_gfp_mask and added to the page cache and the VM's LRU
- * list. If radix tree nodes are allocated during page cache
- * insertion then @radix_gfp_mask is used. The page is returned
- * locked and with an increased refcount. Otherwise, %NULL is
- * returned.
+ * @gfp_mask and added to the page cache and the VM's LRU
+ * list. The page is returned locked and with an increased
+ * refcount. Otherwise, %NULL is returned.
*
* If FGP_LOCK or FGP_CREAT are specified then the function may sleep even
* if the GFP flags specified for FGP_CREAT are atomic.
@@ -1068,7 +1065,7 @@ EXPORT_SYMBOL(find_lock_entry);
* If there is a page cache page, it is returned with an increased refcount.
*/
struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset,
- int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask)
+ int fgp_flags, gfp_t gfp_mask)
{
struct page *page;
@@ -1105,13 +1102,11 @@ no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
int err;
if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping))
- cache_gfp_mask |= __GFP_WRITE;
- if (fgp_flags & FGP_NOFS) {
- cache_gfp_mask &= ~__GFP_FS;
- radix_gfp_mask &= ~__GFP_FS;
- }
+ gfp_mask |= __GFP_WRITE;
+ if (fgp_flags & FGP_NOFS)
+ gfp_mask &= ~__GFP_FS;
- page = __page_cache_alloc(cache_gfp_mask);
+ page = __page_cache_alloc(gfp_mask);
if (!page)
return NULL;
@@ -1122,7 +1117,8 @@ no_page:
if (fgp_flags & FGP_ACCESSED)
__SetPageReferenced(page);
- err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask);
+ err = add_to_page_cache_lru(page, mapping, offset,
+ gfp_mask & GFP_RECLAIM_MASK);
if (unlikely(err)) {
page_cache_release(page);
page = NULL;
@@ -2091,7 +2087,6 @@ const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
/* This is used for a general mmap of a disk file */
@@ -2443,8 +2438,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping,
fgp_flags |= FGP_NOFS;
page = pagecache_get_page(mapping, index, fgp_flags,
- mapping_gfp_mask(mapping),
- GFP_KERNEL);
+ mapping_gfp_mask(mapping));
if (page)
wait_for_stable_page(page);
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index 0d105ae..70c09da 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -301,7 +301,6 @@ out:
static const struct vm_operations_struct xip_file_vm_ops = {
.fault = xip_file_fault,
.page_mkwrite = filemap_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
diff --git a/mm/fremap.c b/mm/fremap.c
deleted file mode 100644
index 2805d71..0000000
--- a/mm/fremap.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * linux/mm/fremap.c
- *
- * Explicit pagetable population and nonlinear (random) mappings support.
- *
- * started by Ingo Molnar, Copyright (C) 2002, 2003
- */
-#include <linux/export.h>
-#include <linux/backing-dev.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/pagemap.h>
-#include <linux/swapops.h>
-#include <linux/rmap.h>
-#include <linux/syscalls.h>
-#include <linux/mmu_notifier.h>
-
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-#include "internal.h"
-
-static int mm_counter(struct page *page)
-{
- return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
-}
-
-static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- pte_t pte = *ptep;
- struct page *page;
- swp_entry_t entry;
-
- if (pte_present(pte)) {
- flush_cache_page(vma, addr, pte_pfn(pte));
- pte = ptep_clear_flush_notify(vma, addr, ptep);
- page = vm_normal_page(vma, addr, pte);
- if (page) {
- if (pte_dirty(pte))
- set_page_dirty(page);
- update_hiwater_rss(mm);
- dec_mm_counter(mm, mm_counter(page));
- page_remove_rmap(page);
- page_cache_release(page);
- }
- } else { /* zap_pte() is not called when pte_none() */
- if (!pte_file(pte)) {
- update_hiwater_rss(mm);
- entry = pte_to_swp_entry(pte);
- if (non_swap_entry(entry)) {
- if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
- dec_mm_counter(mm, mm_counter(page));
- }
- } else {
- free_swap_and_cache(entry);
- dec_mm_counter(mm, MM_SWAPENTS);
- }
- }
- pte_clear_not_present_full(mm, addr, ptep, 0);
- }
-}
-
-/*
- * Install a file pte to a given virtual memory address, release any
- * previously existing mapping.
- */
-static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pgoff, pgprot_t prot)
-{
- int err = -ENOMEM;
- pte_t *pte, ptfile;
- spinlock_t *ptl;
-
- pte = get_locked_pte(mm, addr, &ptl);
- if (!pte)
- goto out;
-
- ptfile = pgoff_to_pte(pgoff);
-
- if (!pte_none(*pte))
- zap_pte(mm, vma, addr, pte);
-
- set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
- /*
- * We don't need to run update_mmu_cache() here because the "file pte"
- * being installed by install_file_pte() is not a real pte - it's a
- * non-present entry (like a swap entry), noting what file offset should
- * be mapped there when there's a fault (in a non-linear vma where
- * that's not obvious).
- */
- pte_unmap_unlock(pte, ptl);
- err = 0;
-out:
- return err;
-}
-
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
- unsigned long size, pgoff_t pgoff)
-{
- struct mm_struct *mm = vma->vm_mm;
- int err;
-
- do {
- err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
- if (err)
- return err;
-
- size -= PAGE_SIZE;
- addr += PAGE_SIZE;
- pgoff++;
- } while (size);
-
- return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
-/**
- * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
- * @start: start of the remapped virtual memory range
- * @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range (see NOTE)
- * @pgoff: to-be-mapped page of the backing store file
- * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
- *
- * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
- * (shared backing store file).
- *
- * This syscall works purely via pagetables, so it's the most efficient
- * way to map the same (large) file into a given virtual window. Unlike
- * mmap()/mremap() it does not create any new vmas. The new mappings are
- * also safe across swapout.
- *
- * NOTE: the @prot parameter right now is ignored (but must be zero),
- * and the vma's default protection is used. Arbitrary protections
- * might be implemented in the future.
- */
-SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
- unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
-{
- struct mm_struct *mm = current->mm;
- struct address_space *mapping;
- struct vm_area_struct *vma;
- int err = -EINVAL;
- int has_write_lock = 0;
- vm_flags_t vm_flags = 0;
-
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
- "See Documentation/vm/remap_file_pages.txt.\n",
- current->comm, current->pid);
-
- if (prot)
- return err;
- /*
- * Sanitize the syscall parameters:
- */
- start = start & PAGE_MASK;
- size = size & PAGE_MASK;
-
- /* Does the address range wrap, or is the span zero-sized? */
- if (start + size <= start)
- return err;
-
- /* Does pgoff wrap? */
- if (pgoff + (size >> PAGE_SHIFT) < pgoff)
- return err;
-
- /* Can we represent this offset inside this architecture's pte's? */
-#if PTE_FILE_MAX_BITS < BITS_PER_LONG
- if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
- return err;
-#endif
-
- /* We need down_write() to change vma->vm_flags. */
- down_read(&mm->mmap_sem);
- retry:
- vma = find_vma(mm, start);
-
- /*
- * Make sure the vma is shared, that it supports prefaulting,
- * and that the remapped range is valid and fully within
- * the single existing vma.
- */
- if (!vma || !(vma->vm_flags & VM_SHARED))
- goto out;
-
- if (!vma->vm_ops || !vma->vm_ops->remap_pages)
- goto out;
-
- if (start < vma->vm_start || start + size > vma->vm_end)
- goto out;
-
- /* Must set VM_NONLINEAR before any pages are populated. */
- if (!(vma->vm_flags & VM_NONLINEAR)) {
- /*
- * vm_private_data is used as a swapout cursor
- * in a VM_NONLINEAR vma.
- */
- if (vma->vm_private_data)
- goto out;
-
- /* Don't need a nonlinear mapping, exit success */
- if (pgoff == linear_page_index(vma, start)) {
- err = 0;
- goto out;
- }
-
- if (!has_write_lock) {
-get_write_lock:
- up_read(&mm->mmap_sem);
- down_write(&mm->mmap_sem);
- has_write_lock = 1;
- goto retry;
- }
- mapping = vma->vm_file->f_mapping;
- /*
- * page_mkclean doesn't work on nonlinear vmas, so if
- * dirty pages need to be accounted, emulate with linear
- * vmas.
- */
- if (mapping_cap_account_dirty(mapping)) {
- unsigned long addr;
- struct file *file = get_file(vma->vm_file);
- /* mmap_region may free vma; grab the info now */
- vm_flags = vma->vm_flags;
-
- addr = mmap_region(file, start, size, vm_flags, pgoff);
- fput(file);
- if (IS_ERR_VALUE(addr)) {
- err = addr;
- } else {
- BUG_ON(addr != start);
- err = 0;
- }
- goto out_freed;
- }
- i_mmap_lock_write(mapping);
- flush_dcache_mmap_lock(mapping);
- vma->vm_flags |= VM_NONLINEAR;
- vma_interval_tree_remove(vma, &mapping->i_mmap);
- vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
- flush_dcache_mmap_unlock(mapping);
- i_mmap_unlock_write(mapping);
- }
-
- if (vma->vm_flags & VM_LOCKED) {
- /*
- * drop PG_Mlocked flag for over-mapped range
- */
- if (!has_write_lock)
- goto get_write_lock;
- vm_flags = vma->vm_flags;
- munlock_vma_pages_range(vma, start, start + size);
- vma->vm_flags = vm_flags;
- }
-
- mmu_notifier_invalidate_range_start(mm, start, start + size);
- err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
- mmu_notifier_invalidate_range_end(mm, start, start + size);
-
- /*
- * We can't clear VM_NONLINEAR because we'd have to do
- * it after ->populate completes, and that would prevent
- * downgrading the lock. (Locks can't be upgraded).
- */
-
-out:
- if (vma)
- vm_flags = vma->vm_flags;
-out_freed:
- if (likely(!has_write_lock))
- up_read(&mm->mmap_sem);
- else
- up_write(&mm->mmap_sem);
- if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
- mm_populate(start, size);
-
- return err;
-}
diff --git a/mm/gup.c b/mm/gup.c
index a900759..12bc2bc 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -55,7 +55,7 @@ retry:
*/
if (likely(!(flags & FOLL_MIGRATION)))
goto no_page;
- if (pte_none(pte) || pte_file(pte))
+ if (pte_none(pte))
goto no_page;
entry = pte_to_swp_entry(pte);
if (!is_migration_entry(entry))
@@ -296,7 +296,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
return -ENOMEM;
if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
return *flags & FOLL_HWPOISON ? -EHWPOISON : -EFAULT;
- if (ret & VM_FAULT_SIGBUS)
+ if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
return -EFAULT;
BUG();
}
@@ -571,7 +571,7 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return -ENOMEM;
if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE))
return -EHWPOISON;
- if (ret & VM_FAULT_SIGBUS)
+ if (ret & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV))
return -EFAULT;
BUG();
}
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85032de..be0e5d0 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,7 @@
#include <linux/node.h>
#include "internal.h"
-unsigned long hugepages_treat_as_movable;
+int hugepages_treat_as_movable;
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 8da581f..f2c2492 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}
-INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
- unsigned long, shared.linear.rb_subtree_last,
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
+ unsigned long, shared.rb_subtree_last,
vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
/* Insert node immediately after prev in the interval tree */
@@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
- if (!prev->shared.linear.rb.rb_right) {
+ if (!prev->shared.rb.rb_right) {
parent = prev;
- link = &prev->shared.linear.rb.rb_right;
+ link = &prev->shared.rb.rb_right;
} else {
- parent = rb_entry(prev->shared.linear.rb.rb_right,
- struct vm_area_struct, shared.linear.rb);
- if (parent->shared.linear.rb_subtree_last < last)
- parent->shared.linear.rb_subtree_last = last;
- while (parent->shared.linear.rb.rb_left) {
- parent = rb_entry(parent->shared.linear.rb.rb_left,
- struct vm_area_struct, shared.linear.rb);
- if (parent->shared.linear.rb_subtree_last < last)
- parent->shared.linear.rb_subtree_last = last;
+ parent = rb_entry(prev->shared.rb.rb_right,
+ struct vm_area_struct, shared.rb);
+ if (parent->shared.rb_subtree_last < last)
+ parent->shared.rb_subtree_last = last;
+ while (parent->shared.rb.rb_left) {
+ parent = rb_entry(parent->shared.rb.rb_left,
+ struct vm_area_struct, shared.rb);
+ if (parent->shared.rb_subtree_last < last)
+ parent->shared.rb_subtree_last = last;
}
- link = &parent->shared.linear.rb.rb_left;
+ link = &parent->shared.rb.rb_left;
}
- node->shared.linear.rb_subtree_last = last;
- rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
- rb_insert_augmented(&node->shared.linear.rb, root,
+ node->shared.rb_subtree_last = last;
+ rb_link_node(&node->shared.rb, &parent->shared.rb, link);
+ rb_insert_augmented(&node->shared.rb, root,
&vma_interval_tree_augment);
}
diff --git a/mm/ksm.c b/mm/ksm.c
index d247efa..4162dce 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -376,7 +376,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
else
ret = VM_FAULT_WRITE;
put_page(page);
- } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_OOM)));
+ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM)));
/*
* We must loop because handle_mm_fault() may back out if there's
* any difficulty e.g. if pte accessed bit gets updated concurrently.
@@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
*/
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
+ VM_HUGETLB | VM_MIXEDMAP))
return 0; /* just ignore the advice */
#ifdef VM_SAO
diff --git a/mm/madvise.c b/mm/madvise.c
index a271adc..d79fb5e 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
pte = *(orig_pte + ((index - start) / PAGE_SIZE));
pte_unmap_unlock(orig_pte, ptl);
- if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+ if (pte_present(pte) || pte_none(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
@@ -278,14 +278,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
return -EINVAL;
- if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
- struct zap_details details = {
- .nonlinear_vma = vma,
- .last_index = ULONG_MAX,
- };
- zap_page_range(vma, start, end - start, &details);
- } else
- zap_page_range(vma, start, end - start, NULL);
+ zap_page_range(vma, start, end - start, NULL);
return 0;
}
@@ -303,7 +296,7 @@ static long madvise_remove(struct vm_area_struct *vma,
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
- if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+ if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
return -EINVAL;
f = vma->vm_file;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index ef91e85..f3f8a4f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -343,9 +343,6 @@ struct mem_cgroup {
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
- /* analogous to slab_common's slab_caches list, but per-memcg;
- * protected by memcg_slab_mutex */
- struct list_head memcg_slab_caches;
/* Index in the kmem_cache->memcg_params->memcg_caches array */
int kmemcg_id;
#endif
@@ -1477,9 +1474,9 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
pr_info("Task in ");
pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
- pr_info(" killed as a result of limit of ");
+ pr_cont(" killed as a result of limit of ");
pr_cont_cgroup_path(memcg->css.cgroup);
- pr_info("\n");
+ pr_cont("\n");
rcu_read_unlock();
@@ -2476,27 +2473,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
}
#ifdef CONFIG_MEMCG_KMEM
-/*
- * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
- * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
- */
-static DEFINE_MUTEX(memcg_slab_mutex);
-
-/*
- * This is a bit cumbersome, but it is rarely used and avoids a backpointer
- * in the memcg_cache_params struct.
- */
-static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
-{
- struct kmem_cache *cachep;
-
- VM_BUG_ON(p->is_root_cache);
- cachep = p->root_cache;
- return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
-}
-
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned long nr_pages)
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned long nr_pages)
{
struct page_counter *counter;
int ret = 0;
@@ -2533,8 +2511,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
return ret;
}
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
- unsigned long nr_pages)
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
{
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_swap_account)
@@ -2579,10 +2556,7 @@ static int memcg_alloc_cache_id(void)
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- mutex_lock(&memcg_slab_mutex);
err = memcg_update_all_caches(size);
- mutex_unlock(&memcg_slab_mutex);
-
if (err) {
ida_simple_remove(&kmem_limited_groups, id);
return err;
@@ -2605,123 +2579,20 @@ void memcg_update_array_size(int num)
memcg_limited_groups_array_size = num;
}
-static void memcg_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache)
-{
- static char memcg_name_buf[NAME_MAX + 1]; /* protected by
- memcg_slab_mutex */
- struct kmem_cache *cachep;
- int id;
-
- lockdep_assert_held(&memcg_slab_mutex);
-
- id = memcg_cache_id(memcg);
-
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can try to
- * create the same cache, but only one of them may succeed.
- */
- if (cache_from_memcg_idx(root_cache, id))
- return;
-
- cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
- cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
- /*
- * If we could not create a memcg cache, do not complain, because
- * that's not critical at all as we can always proceed with the root
- * cache.
- */
- if (!cachep)
- return;
-
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-
- /*
- * Since readers won't lock (see cache_from_memcg_idx()), we need a
- * barrier here to ensure nobody will see the kmem_cache partially
- * initialized.
- */
- smp_wmb();
-
- BUG_ON(root_cache->memcg_params->memcg_caches[id]);
- root_cache->memcg_params->memcg_caches[id] = cachep;
-}
-
-static void memcg_unregister_cache(struct kmem_cache *cachep)
-{
- struct kmem_cache *root_cache;
- struct mem_cgroup *memcg;
- int id;
-
- lockdep_assert_held(&memcg_slab_mutex);
-
- BUG_ON(is_root_cache(cachep));
-
- root_cache = cachep->memcg_params->root_cache;
- memcg = cachep->memcg_params->memcg;
- id = memcg_cache_id(memcg);
-
- BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
- root_cache->memcg_params->memcg_caches[id] = NULL;
-
- list_del(&cachep->memcg_params->list);
-
- kmem_cache_destroy(cachep);
-}
-
-int __memcg_cleanup_cache_params(struct kmem_cache *s)
-{
- struct kmem_cache *c;
- int i, failed = 0;
-
- mutex_lock(&memcg_slab_mutex);
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
-
- memcg_unregister_cache(c);
-
- if (cache_from_memcg_idx(s, i))
- failed++;
- }
- mutex_unlock(&memcg_slab_mutex);
- return failed;
-}
-
-static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
- struct kmem_cache *cachep;
- struct memcg_cache_params *params, *tmp;
-
- if (!memcg_kmem_is_active(memcg))
- return;
-
- mutex_lock(&memcg_slab_mutex);
- list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
- cachep = memcg_params_to_cache(params);
- memcg_unregister_cache(cachep);
- }
- mutex_unlock(&memcg_slab_mutex);
-}
-
-struct memcg_register_cache_work {
+struct memcg_kmem_cache_create_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
struct work_struct work;
};
-static void memcg_register_cache_func(struct work_struct *w)
+static void memcg_kmem_cache_create_func(struct work_struct *w)
{
- struct memcg_register_cache_work *cw =
- container_of(w, struct memcg_register_cache_work, work);
+ struct memcg_kmem_cache_create_work *cw =
+ container_of(w, struct memcg_kmem_cache_create_work, work);
struct mem_cgroup *memcg = cw->memcg;
struct kmem_cache *cachep = cw->cachep;
- mutex_lock(&memcg_slab_mutex);
- memcg_register_cache(memcg, cachep);
- mutex_unlock(&memcg_slab_mutex);
+ memcg_create_kmem_cache(memcg, cachep);
css_put(&memcg->css);
kfree(cw);
@@ -2730,10 +2601,10 @@ static void memcg_register_cache_func(struct work_struct *w)
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
- struct memcg_register_cache_work *cw;
+ struct memcg_kmem_cache_create_work *cw;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
if (!cw)
@@ -2743,18 +2614,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
cw->memcg = memcg;
cw->cachep = cachep;
+ INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
- INIT_WORK(&cw->work, memcg_register_cache_func);
schedule_work(&cw->work);
}
-static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
/*
* We need to stop accounting when we kmalloc, because if the
* corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_register_cache will recurse.
+ * in __memcg_schedule_kmem_cache_create will recurse.
*
* However, it is better to enclose the whole function. Depending on
* the debugging options enabled, INIT_WORK(), for instance, can
@@ -2763,24 +2634,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
* the safest choice is to do it like this, wrapping the whole function.
*/
current->memcg_kmem_skip_account = 1;
- __memcg_schedule_register_cache(memcg, cachep);
+ __memcg_schedule_kmem_cache_create(memcg, cachep);
current->memcg_kmem_skip_account = 0;
}
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
-{
- unsigned int nr_pages = 1 << order;
-
- return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-}
-
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
-{
- unsigned int nr_pages = 1 << order;
-
- memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-}
-
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
@@ -2825,7 +2682,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
* could happen with the slab_mutex held. So it's better to
* defer everything.
*/
- memcg_schedule_register_cache(memcg, cachep);
+ memcg_schedule_kmem_cache_create(memcg, cachep);
out:
css_put(&memcg->css);
return cachep;
@@ -3043,18 +2900,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
mem_cgroup_swap_statistics(to, true);
- /*
- * This function is only called from task migration context now.
- * It postpones page_counter and refcount handling till the end
- * of task migration(mem_cgroup_clear_mc()) for performance
- * improvement. But we cannot postpone css_get(to) because if
- * the process that has been moved to @to does swap-in, the
- * refcount of @to might be decreased to 0.
- *
- * We are in attach() phase, so the cgroup is guaranteed to be
- * alive, so we can just call css_get().
- */
- css_get(&to->css);
return 0;
}
return -EINVAL;
@@ -4166,7 +4011,7 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
- memcg_unregister_all_caches(memcg);
+ memcg_destroy_kmem_caches(memcg);
mem_cgroup_sockets_destroy(memcg);
}
#else
@@ -4679,6 +4524,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent_css == NULL) {
root_mem_cgroup = memcg;
page_counter_init(&memcg->memory, NULL);
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
}
@@ -4693,7 +4539,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&memcg->event_list_lock);
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
#endif
return &memcg->css;
@@ -4724,6 +4569,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent->use_hierarchy) {
page_counter_init(&memcg->memory, &parent->memory);
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4733,6 +4579,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
*/
} else {
page_counter_init(&memcg->memory, NULL);
+ memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
/*
@@ -4807,7 +4654,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
- memcg->soft_limit = 0;
+ memcg->soft_limit = PAGE_COUNTER_MAX;
}
#ifdef CONFIG_MMU
@@ -4935,10 +4782,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
return NULL;
mapping = vma->vm_file->f_mapping;
- if (pte_none(ptent))
- pgoff = linear_page_index(vma, addr);
- else /* pte_file(ptent) is true */
- pgoff = pte_to_pgoff(ptent);
+ pgoff = linear_page_index(vma, addr);
/* page is moved even if it's not RSS of this task(page-faulted). */
#ifdef CONFIG_SWAP
@@ -4970,7 +4814,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, addr, ptent, &ent);
- else if (pte_none(ptent) || pte_file(ptent))
+ else if (pte_none(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
@@ -5782,7 +5626,7 @@ void mem_cgroup_uncharge_list(struct list_head *page_list)
* mem_cgroup_migrate - migrate a charge to another page
* @oldpage: currently charged page
* @newpage: page to transfer the charge to
- * @lrucare: both pages might be on the LRU already
+ * @lrucare: either or both pages might be on the LRU already
*
* Migrate the charge from @oldpage to @newpage.
*
diff --git a/mm/memory.c b/mm/memory.c
index 649e7d44..d63849b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -235,6 +235,9 @@ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long
static void tlb_flush_mmu_tlbonly(struct mmu_gather *tlb)
{
+ if (!tlb->end)
+ return;
+
tlb_flush(tlb);
mmu_notifier_invalidate_range(tlb->mm, tlb->start, tlb->end);
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
@@ -247,7 +250,7 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
{
struct mmu_gather_batch *batch;
- for (batch = &tlb->local; batch; batch = batch->next) {
+ for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
free_pages_and_swap_cache(batch->pages, batch->nr);
batch->nr = 0;
}
@@ -256,9 +259,6 @@ static void tlb_flush_mmu_free(struct mmu_gather *tlb)
void tlb_flush_mmu(struct mmu_gather *tlb)
{
- if (!tlb->end)
- return;
-
tlb_flush_mmu_tlbonly(tlb);
tlb_flush_mmu_free(tlb);
}
@@ -754,6 +754,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (HAVE_PTE_SPECIAL) {
if (likely(!pte_special(pte)))
goto check_pfn;
+ if (vma->vm_ops && vma->vm_ops->find_special_page)
+ return vma->vm_ops->find_special_page(vma, addr);
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
@@ -811,42 +813,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* pte contains position in swap or file, so copy. */
if (unlikely(!pte_present(pte))) {
- if (!pte_file(pte)) {
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (likely(!non_swap_entry(entry))) {
- if (swap_duplicate(entry) < 0)
- return entry.val;
-
- /* make sure dst_mm is on swapoff's mmlist. */
- if (unlikely(list_empty(&dst_mm->mmlist))) {
- spin_lock(&mmlist_lock);
- if (list_empty(&dst_mm->mmlist))
- list_add(&dst_mm->mmlist,
- &src_mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- rss[MM_SWAPENTS]++;
- } else if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
-
- if (PageAnon(page))
- rss[MM_ANONPAGES]++;
- else
- rss[MM_FILEPAGES]++;
-
- if (is_write_migration_entry(entry) &&
- is_cow_mapping(vm_flags)) {
- /*
- * COW mappings require pages in both
- * parent and child to be set to read.
- */
- make_migration_entry_read(&entry);
- pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
- pte = pte_swp_mksoft_dirty(pte);
- set_pte_at(src_mm, addr, src_pte, pte);
- }
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (likely(!non_swap_entry(entry))) {
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
+ /* make sure dst_mm is on swapoff's mmlist. */
+ if (unlikely(list_empty(&dst_mm->mmlist))) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ rss[MM_SWAPENTS]++;
+ } else if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
+
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]++;
+ else
+ rss[MM_FILEPAGES]++;
+
+ if (is_write_migration_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*src_pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
}
}
goto out_set_pte;
@@ -1020,11 +1020,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
- VM_PFNMAP | VM_MIXEDMAP))) {
- if (!vma->anon_vma)
- return 0;
- }
+ if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+ !vma->anon_vma)
+ return 0;
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1082,6 +1080,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
+ swp_entry_t entry;
again:
init_rss_vec(rss);
@@ -1107,28 +1106,12 @@ again:
if (details->check_mapping &&
details->check_mapping != page->mapping)
continue;
- /*
- * Each page->index must be checked when
- * invalidating or truncating nonlinear.
- */
- if (details->nonlinear_vma &&
- (page->index < details->first_index ||
- page->index > details->last_index))
- continue;
}
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
- if (unlikely(details) && details->nonlinear_vma
- && linear_page_index(details->nonlinear_vma,
- addr) != page->index) {
- pte_t ptfile = pgoff_to_pte(page->index);
- if (pte_soft_dirty(ptent))
- ptfile = pte_file_mksoft_dirty(ptfile);
- set_pte_at(mm, addr, pte, ptfile);
- }
if (PageAnon(page))
rss[MM_ANONPAGES]--;
else {
@@ -1151,33 +1134,25 @@ again:
}
continue;
}
- /*
- * If details->check_mapping, we leave swap entries;
- * if details->nonlinear_vma, we leave file entries.
- */
+ /* If details->check_mapping, we leave swap entries. */
if (unlikely(details))
continue;
- if (pte_file(ptent)) {
- if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
- print_bad_pte(vma, addr, ptent, NULL);
- } else {
- swp_entry_t entry = pte_to_swp_entry(ptent);
- if (!non_swap_entry(entry))
- rss[MM_SWAPENTS]--;
- else if (is_migration_entry(entry)) {
- struct page *page;
+ entry = pte_to_swp_entry(ptent);
+ if (!non_swap_entry(entry))
+ rss[MM_SWAPENTS]--;
+ else if (is_migration_entry(entry)) {
+ struct page *page;
- page = migration_entry_to_page(entry);
+ page = migration_entry_to_page(entry);
- if (PageAnon(page))
- rss[MM_ANONPAGES]--;
- else
- rss[MM_FILEPAGES]--;
- }
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]--;
+ else
+ rss[MM_FILEPAGES]--;
}
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1277,7 +1252,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;
- if (details && !details->check_mapping && !details->nonlinear_vma)
+ if (details && !details->check_mapping)
details = NULL;
BUG_ON(addr >= end);
@@ -1371,7 +1346,7 @@ void unmap_vmas(struct mmu_gather *tlb,
* @vma: vm_area_struct holding the applicable pages
* @start: starting address of pages to zap
* @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
*
* Caller must protect the VMA list
*/
@@ -1397,7 +1372,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
* @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
*
* The range must fit into one VMA.
*/
@@ -1922,12 +1897,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
EXPORT_SYMBOL_GPL(apply_to_page_range);
/*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically. Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
+ * handle_pte_fault chooses page fault handler according to an entry which was
+ * read non-atomically. Before making any commitment, on those architectures
+ * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
+ * parts, do_swap_page must check under lock before unmapping the pte and
+ * proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
*/
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
@@ -2033,7 +2007,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t entry;
int ret = 0;
int page_mkwrite = 0;
- struct page *dirty_page = NULL;
+ bool dirty_shared = false;
unsigned long mmun_start = 0; /* For mmu_notifiers */
unsigned long mmun_end = 0; /* For mmu_notifiers */
struct mem_cgroup *memcg;
@@ -2084,6 +2058,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
+ page_cache_get(old_page);
/*
* Only catch write-faults on shared writable pages,
* read-only shared pages can get COWed by
@@ -2091,7 +2066,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- page_cache_get(old_page);
+
pte_unmap_unlock(page_table, ptl);
tmp = do_page_mkwrite(vma, old_page, address);
if (unlikely(!tmp || (tmp &
@@ -2111,11 +2086,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(old_page);
goto unlock;
}
-
page_mkwrite = 1;
}
- dirty_page = old_page;
- get_page(dirty_page);
+
+ dirty_shared = true;
reuse:
/*
@@ -2134,38 +2108,29 @@ reuse:
pte_unmap_unlock(page_table, ptl);
ret |= VM_FAULT_WRITE;
- if (!dirty_page)
- return ret;
+ if (dirty_shared) {
+ struct address_space *mapping;
+ int dirtied;
- /*
- * Yes, Virginia, this is actually required to prevent a race
- * with clear_page_dirty_for_io() from clearing the page dirty
- * bit after it clear all dirty ptes, but before a racing
- * do_wp_page installs a dirty pte.
- *
- * do_shared_fault is protected similarly.
- */
- if (!page_mkwrite) {
- wait_on_page_locked(dirty_page);
- set_page_dirty_balance(dirty_page);
- /* file_update_time outside page_lock */
- if (vma->vm_file)
- file_update_time(vma->vm_file);
- }
- put_page(dirty_page);
- if (page_mkwrite) {
- struct address_space *mapping = dirty_page->mapping;
-
- set_page_dirty(dirty_page);
- unlock_page(dirty_page);
- page_cache_release(dirty_page);
- if (mapping) {
+ if (!page_mkwrite)
+ lock_page(old_page);
+
+ dirtied = set_page_dirty(old_page);
+ VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
+ mapping = old_page->mapping;
+ unlock_page(old_page);
+ page_cache_release(old_page);
+
+ if ((dirtied || page_mkwrite) && mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
+
+ if (!page_mkwrite)
+ file_update_time(vma->vm_file);
}
return ret;
@@ -2324,25 +2289,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
}
}
-static inline void unmap_mapping_range_list(struct list_head *head,
- struct zap_details *details)
-{
- struct vm_area_struct *vma;
-
- /*
- * In nonlinear VMAs there is no correspondence between virtual address
- * offset and file offset. So we must perform an exhaustive search
- * across *all* the pages in each nonlinear VMA, not just the pages
- * whose virtual address lies outside the file truncation point.
- */
- list_for_each_entry(vma, head, shared.nonlinear) {
- details->nonlinear_vma = vma;
- unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
- }
-}
-
/**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified
+ * address_space corresponding to the specified page range in the underlying
+ * file.
+ *
* @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
@@ -2371,19 +2322,16 @@ void unmap_mapping_range(struct address_space *mapping,
}
details.check_mapping = even_cows? NULL: mapping;
- details.nonlinear_vma = NULL;
details.first_index = hba;
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
- i_mmap_lock_read(mapping);
+ i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
- if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
- unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
- i_mmap_unlock_read(mapping);
+ i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -2593,7 +2541,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
if (prev && prev->vm_end == address)
return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
- expand_downwards(vma, address - PAGE_SIZE);
+ return expand_downwards(vma, address - PAGE_SIZE);
}
if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
struct vm_area_struct *next = vma->vm_next;
@@ -2602,7 +2550,7 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo
if (next && next->vm_start == address + PAGE_SIZE)
return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
- expand_upwards(vma, address + PAGE_SIZE);
+ return expand_upwards(vma, address + PAGE_SIZE);
}
return 0;
}
@@ -2625,7 +2573,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Check if we need to add a guard page to the stack */
if (check_stack_guard_page(vma, address) < 0)
- return VM_FAULT_SIGBUS;
+ return VM_FAULT_SIGSEGV;
/* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(mm)) {
@@ -2743,8 +2691,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
entry = mk_pte(page, vma->vm_page_prot);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
- entry = pte_mksoft_dirty(entry);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
@@ -2879,8 +2825,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
- if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
- fault_around_bytes >> PAGE_SHIFT > 1) {
+ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
do_fault_around(vma, address, pte, pgoff, flags);
if (!pte_same(*pte, orig_pte))
@@ -3012,8 +2957,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
balance_dirty_pages_ratelimited(mapping);
}
- /* file_update_time outside page_lock */
- if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+ if (!vma->vm_ops->page_mkwrite)
file_update_time(vma->vm_file);
return ret;
@@ -3025,7 +2969,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
@@ -3042,46 +2986,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
-/*
- * Fault of a previously existing named mapping. Repopulate the pte
- * from the encoded file_pte if possible. This enables swappable
- * nonlinear vmas.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
- * The mmap_sem may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
- */
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
-{
- pgoff_t pgoff;
-
- flags |= FAULT_FLAG_NONLINEAR;
-
- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
- return 0;
-
- if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
- /*
- * Page table corrupted: show pte and kill process.
- */
- print_bad_pte(vma, address, orig_pte, NULL);
- return VM_FAULT_SIGBUS;
- }
-
- pgoff = pte_to_pgoff(orig_pte);
- if (!(flags & FAULT_FLAG_WRITE))
- return do_read_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
-}
-
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid,
int *flags)
@@ -3209,15 +3113,12 @@ static int handle_pte_fault(struct mm_struct *mm,
if (pte_none(entry)) {
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
- return do_linear_fault(mm, vma, address,
- pte, pmd, flags, entry);
+ return do_fault(mm, vma, address, pte,
+ pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
- if (pte_file(entry))
- return do_nonlinear_fault(mm, vma, address,
- pte, pmd, flags, entry);
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 344cdf6..6e284bc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -179,37 +179,6 @@ out:
}
/*
- * Congratulations to trinity for discovering this bug.
- * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
- * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
- * replace the specified range by file ptes throughout (maybe populated after).
- * If page migration finds a page within that range, while it's still located
- * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
- * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
- * But if the migrating page is in a part of the vma outside the range to be
- * remapped, then it will not be cleared, and remove_migration_ptes() needs to
- * deal with it. Fortunately, this part of the vma is of course still linear,
- * so we just need to use linear location on the nonlinear list.
- */
-static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
- struct address_space *mapping, void *arg)
-{
- struct vm_area_struct *vma;
- /* hugetlbfs does not support remap_pages, so no huge pgoff worries */
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- unsigned long addr;
-
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
- addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (addr >= vma->vm_start && addr < vma->vm_end)
- remove_migration_pte(page, vma, addr, arg);
- }
- return SWAP_AGAIN;
-}
-
-/*
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
@@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new)
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
.arg = old,
- .file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
};
rmap_walk(new, &rwc);
diff --git a/mm/mincore.c b/mm/mincore.c
index c8c528b..46527c0 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
pte_t pte = *ptep;
- pgoff_t pgoff;
next = addr + PAGE_SIZE;
if (pte_none(pte))
mincore_unmapped_range(vma, addr, next, vec);
else if (pte_present(pte))
*vec = 1;
- else if (pte_file(pte)) {
- pgoff = pte_to_pgoff(pte);
- *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
- } else { /* pte is a swap entry */
+ else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
@@ -145,9 +141,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*vec = 1;
} else {
#ifdef CONFIG_SWAP
- pgoff = entry.val;
*vec = mincore_page(swap_address_space(entry),
- pgoff);
+ entry.val);
#else
WARN_ON(1);
*vec = 1;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7b36aa7..14d8466 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
mapping_unmap_writable(mapping);
flush_dcache_mmap_lock(mapping);
- if (unlikely(vma->vm_flags & VM_NONLINEAR))
- list_del_init(&vma->shared.nonlinear);
- else
- vma_interval_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
@@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
- if (unlikely(vma->vm_flags & VM_NONLINEAR))
- vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
- else
- vma_interval_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
@@ -778,23 +772,22 @@ again: remove_next = 1 + (end > next->vm_end);
if (exporter && exporter->anon_vma && !importer->anon_vma) {
int error;
+ importer->anon_vma = exporter->anon_vma;
error = anon_vma_clone(importer, exporter);
- if (error)
+ if (error) {
+ importer->anon_vma = NULL;
return error;
- importer->anon_vma = exporter->anon_vma;
+ }
}
}
if (file) {
mapping = file->f_mapping;
- if (!(vma->vm_flags & VM_NONLINEAR)) {
- root = &mapping->i_mmap;
- uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
- if (adjust_next)
- uprobe_munmap(next, next->vm_start,
- next->vm_end);
- }
+ if (adjust_next)
+ uprobe_munmap(next, next->vm_start, next->vm_end);
i_mmap_lock_write(mapping);
if (insert) {
@@ -2099,14 +2092,17 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
{
struct mm_struct *mm = vma->vm_mm;
struct rlimit *rlim = current->signal->rlim;
- unsigned long new_start;
+ unsigned long new_start, actual_size;
/* address space limit tests */
if (!may_expand_vm(mm, grow))
return -ENOMEM;
/* Stack limit test */
- if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
+ actual_size = size;
+ if (size && (vma->vm_flags & (VM_GROWSUP | VM_GROWSDOWN)))
+ actual_size -= PAGE_SIZE;
+ if (actual_size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur))
return -ENOMEM;
/* mlock limit tests */
@@ -2629,6 +2625,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
return vm_munmap(addr, len);
}
+
+/*
+ * Emulation of deprecated remap_file_pages() syscall.
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+ unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long populate = 0;
+ unsigned long ret = -EINVAL;
+ struct file *file;
+
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
+ "See Documentation/vm/remap_file_pages.txt.\n",
+ current->comm, current->pid);
+
+ if (prot)
+ return ret;
+ start = start & PAGE_MASK;
+ size = size & PAGE_MASK;
+
+ if (start + size <= start)
+ return ret;
+
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return ret;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+
+ if (!vma || !(vma->vm_flags & VM_SHARED))
+ goto out;
+
+ if (start < vma->vm_start || start + size > vma->vm_end)
+ goto out;
+
+ if (pgoff == linear_page_index(vma, start)) {
+ ret = 0;
+ goto out;
+ }
+
+ prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+ prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+ prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+ flags &= MAP_NONBLOCK;
+ flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+ if (vma->vm_flags & VM_LOCKED) {
+ flags |= MAP_LOCKED;
+ /* drop PG_Mlocked flag for over-mapped range */
+ munlock_vma_pages_range(vma, start, start + size);
+ }
+
+ file = get_file(vma->vm_file);
+ ret = do_mmap_pgoff(vma->vm_file, start, size,
+ prot, flags, pgoff, &populate);
+ fput(file);
+out:
+ up_write(&mm->mmap_sem);
+ if (populate)
+ mm_populate(ret, populate);
+ if (!IS_ERR_VALUE(ret))
+ ret = 0;
+ return ret;
+}
+
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_VM
@@ -3103,8 +3168,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*
* mmap_sem in write mode is required in order to block all operations
* that could modify pagetables and free pages without need of
- * altering the vma layout (for example populate_range() with
- * nonlinear vmas). It's also needed in write mode to avoid new
+ * altering the vma layout. It's also needed in write mode to avoid new
* anon_vmas to be associated with existing vmas.
*
* A single task can't take more than one mm_take_all_locks() in a row
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ace9345..3312166 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -105,7 +105,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
if (updated)
pages++;
- } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
+ } else if (IS_ENABLED(CONFIG_MIGRATION)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 17fa018..57dadc0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte)
pte = pte_mksoft_dirty(pte);
else if (is_swap_pte(pte))
pte = pte_swp_mksoft_dirty(pte);
- else if (pte_file(pte))
- pte = pte_file_mksoft_dirty(pte);
#endif
return pte;
}
diff --git a/mm/msync.c b/mm/msync.c
index 992a167..bb04d53 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- if (vma->vm_flags & VM_NONLINEAR)
- error = vfs_fsync(file, 1);
- else
- error = vfs_fsync_range(file, fstart, fend, 1);
+ error = vfs_fsync_range(file, fstart, fend, 1);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index b51eadf..541bed6 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -59,6 +59,7 @@
#endif
void *high_memory;
+EXPORT_SYMBOL(high_memory);
struct page *mem_map;
unsigned long max_mapnr;
unsigned long highest_memmap_pfn;
@@ -1983,14 +1984,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_map_pages);
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
- unsigned long size, pgoff_t pgoff)
-{
- BUG();
- return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index d5d81f5..6f43352 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1541,16 +1541,6 @@ pause:
bdi_start_background_writeback(bdi);
}
-void set_page_dirty_balance(struct page *page)
-{
- if (set_page_dirty(page)) {
- struct address_space *mapping = page_mapping(page);
-
- if (mapping)
- balance_dirty_pages_ratelimited(mapping);
- }
-}
-
static DEFINE_PER_CPU(int, bdp_ratelimits);
/*
@@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
*
- * Most callers have locked the page, which pins the address_space in memory.
- * But zap_pte_range() does not lock the page, however in that case the
- * mapping is pinned by the vma's ->vm_file reference.
- *
- * We take care to handle the case where the page was truncated from the
- * mapping by re-checking page_mapping() inside tree_lock.
+ * The caller must ensure this doesn't race with truncation. Most will simply
+ * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
+ * the pte lock held, which also locks out truncation.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
- struct address_space *mapping2;
unsigned long flags;
if (!mapping)
return 1;
spin_lock_irqsave(&mapping->tree_lock, flags);
- mapping2 = page_mapping(page);
- if (mapping2) { /* Race with truncate? */
- BUG_ON(mapping2 != mapping);
- WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
- account_page_dirtied(page, mapping);
- radix_tree_tag_set(&mapping->page_tree,
- page_index(page), PAGECACHE_TAG_DIRTY);
- }
+ BUG_ON(page_mapping(page) != mapping);
+ WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+ account_page_dirtied(page, mapping);
+ radix_tree_tag_set(&mapping->page_tree, page_index(page),
+ PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
if (mapping->host) {
/* !PageAnon && !swapper_space */
@@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page)
/*
* We carefully synchronise fault handlers against
* installing a dirty pte and marking the page dirty
- * at this point. We do this by having them hold the
- * page lock at some point after installing their
- * pte, but before marking the page dirty.
- * Pages are always locked coming in here, so we get
- * the desired exclusion. See mm/memory.c:do_wp_page()
- * for more comments.
+ * at this point. We do this by having them hold the
+ * page lock while dirtying the page, and pages are
+ * always locked coming in here, so we get the desired
+ * exclusion.
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 7633c50..f121050 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -552,17 +552,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
if (page_is_guard(buddy) && page_order(buddy) == order) {
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
/*
* zone check is done late to avoid uselessly
* calculating zone/node ids for pages that could
@@ -571,6 +569,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
return 1;
}
return 0;
@@ -2332,12 +2332,21 @@ static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, enum zone_type high_zoneidx,
nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ int classzone_idx, int migratetype, unsigned long *did_some_progress)
{
struct page *page;
- /* Acquire the per-zone oom lock for each zone */
+ *did_some_progress = 0;
+
+ if (oom_killer_disabled)
+ return NULL;
+
+ /*
+ * Acquire the per-zone oom lock for each zone. If that
+ * fails, somebody else is making progress for us.
+ */
if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
+ *did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
}
@@ -2363,12 +2372,18 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
if (!(gfp_mask & __GFP_NOFAIL)) {
+ /* Coredumps can quickly deplete all memory reserves */
+ if (current->flags & PF_DUMPCORE)
+ goto out;
/* The OOM killer will not help higher order allocs */
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
if (high_zoneidx < ZONE_NORMAL)
goto out;
+ /* The OOM killer does not compensate for light reclaim */
+ if (!(gfp_mask & __GFP_FS))
+ goto out;
/*
* GFP_THISNODE contains __GFP_NORETRY and we never hit this.
* Sanity check for bare calls of __GFP_THISNODE, not real OOM.
@@ -2381,7 +2396,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
}
/* Exhausted what can be done so it's blamo time */
out_of_memory(zonelist, gfp_mask, order, nodemask, false);
-
+ *did_some_progress = 1;
out:
oom_zonelist_unlock(zonelist, gfp_mask);
return page;
@@ -2658,7 +2673,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
(gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
-restart:
+retry:
if (!(gfp_mask & __GFP_NO_KSWAPD))
wake_all_kswapds(order, zonelist, high_zoneidx,
preferred_zone, nodemask);
@@ -2681,7 +2696,6 @@ restart:
classzone_idx = zonelist_zone_idx(preferred_zoneref);
}
-rebalance:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
@@ -2788,54 +2802,28 @@ rebalance:
if (page)
goto got_pg;
- /*
- * If we failed to make any progress reclaiming, then we are
- * running out of options and have to consider going OOM
- */
- if (!did_some_progress) {
- if (oom_gfp_allowed(gfp_mask)) {
- if (oom_killer_disabled)
- goto nopage;
- /* Coredumps can quickly deplete all memory reserves */
- if ((current->flags & PF_DUMPCORE) &&
- !(gfp_mask & __GFP_NOFAIL))
- goto nopage;
- page = __alloc_pages_may_oom(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask, preferred_zone,
- classzone_idx, migratetype);
- if (page)
- goto got_pg;
-
- if (!(gfp_mask & __GFP_NOFAIL)) {
- /*
- * The oom killer is not called for high-order
- * allocations that may fail, so if no progress
- * is being made, there are no other options and
- * retrying is unlikely to help.
- */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
- goto nopage;
- /*
- * The oom killer is not called for lowmem
- * allocations to prevent needlessly killing
- * innocent tasks.
- */
- if (high_zoneidx < ZONE_NORMAL)
- goto nopage;
- }
-
- goto restart;
- }
- }
-
/* Check if we should retry the allocation */
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, did_some_progress,
pages_reclaimed)) {
+ /*
+ * If we fail to make progress by freeing individual
+ * pages, but the allocation wants us to keep going,
+ * start OOM killing tasks.
+ */
+ if (!did_some_progress) {
+ page = __alloc_pages_may_oom(gfp_mask, order, zonelist,
+ high_zoneidx, nodemask,
+ preferred_zone, classzone_idx,
+ migratetype,&did_some_progress);
+ if (page)
+ goto got_pg;
+ if (!did_some_progress)
+ goto nopage;
+ }
/* Wait for some write requests to complete then retry */
wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
- goto rebalance;
+ goto retry;
} else {
/*
* High-order allocations do not necessarily loop after
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index ad83195..b264bda 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -199,7 +199,10 @@ int walk_page_range(unsigned long addr, unsigned long end,
*/
if ((vma->vm_start <= addr) &&
(vma->vm_flags & VM_PFNMAP)) {
- next = vma->vm_end;
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+ if (err)
+ break;
pgd = pgd_offset(walk->mm, next);
continue;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index c5bc241..70b3249 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
+ anon_vma->degree = 1; /* Reference for first vma */
+ anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
* from fork, the root will be reset to the parents anon_vma.
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
+ /* vma reference or self-parent link for new root */
+ anon_vma->degree++;
allocated = NULL;
avc = NULL;
}
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
/*
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
+ *
+ * If dst->anon_vma is NULL this function tries to find and reuse existing
+ * anon_vma which has no vmas and only one child anon_vma. This prevents
+ * degradation of anon_vma hierarchy to endless linear chain in case of
+ * constantly forking task. On the other hand, an anon_vma with more than one
+ * child isn't reused even if there was no alive vma, thus rmap walker has a
+ * good chance of avoiding scanning the whole hierarchy when it searches where
+ * page is mapped.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
anon_vma = pavc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
anon_vma_chain_link(dst, avc, anon_vma);
+
+ /*
+ * Reuse existing anon_vma if its degree lower than two,
+ * that means it has no vma and only one anon_vma child.
+ *
+ * Do not chose parent anon_vma, otherwise first child
+ * will always reuse it. Root anon_vma is never reused:
+ * it has self-parent reference and at least one child.
+ */
+ if (!dst->anon_vma && anon_vma != src->anon_vma &&
+ anon_vma->degree < 2)
+ dst->anon_vma = anon_vma;
}
+ if (dst->anon_vma)
+ dst->anon_vma->degree++;
unlock_anon_vma_root(root);
return 0;
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
if (!pvma->anon_vma)
return 0;
+ /* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+ vma->anon_vma = NULL;
+
/*
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
if (error)
return error;
+ /* An existing anon_vma has been reused, all done then. */
+ if (vma->anon_vma)
+ return 0;
+
/* Then add our own anon_vma. */
anon_vma = anon_vma_alloc();
if (!anon_vma)
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
* lock any of the anon_vmas in this anon_vma tree.
*/
anon_vma->root = pvma->anon_vma->root;
+ anon_vma->parent = pvma->anon_vma;
/*
* With refcounts, an anon_vma can stay around longer than the
* process it belongs to. The root anon_vma needs to be pinned until
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
+ anon_vma->parent->degree++;
anon_vma_unlock_write(anon_vma);
return 0;
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
- if (RB_EMPTY_ROOT(&anon_vma->rb_root))
+ if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+ anon_vma->parent->degree--;
continue;
+ }
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
+ if (vma->anon_vma)
+ vma->anon_vma->degree--;
unlock_anon_vma_root(root);
/*
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
+ BUG_ON(anon_vma->degree);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
@@ -550,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
- if (!vma->vm_file ||
- vma->vm_file->f_mapping != page->mapping)
+ } else if (page->mapping) {
+ if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
return -EFAULT;
} else
return -EFAULT;
@@ -1234,7 +1273,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);
- BUG_ON(pte_file(*pte));
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION)) {
/* Establish migration entry for a file page */
@@ -1276,211 +1314,6 @@ out_mlock:
return ret;
}
-/*
- * objrmap doesn't work for nonlinear VMAs because the assumption that
- * offset-into-file correlates with offset-into-virtual-addresses does not hold.
- * Consequently, given a particular page and its ->index, we cannot locate the
- * ptes which are mapping that page without an exhaustive linear search.
- *
- * So what this code does is a mini "virtual scan" of each nonlinear VMA which
- * maps the file to which the target page belongs. The ->vm_private_data field
- * holds the current cursor into that scan. Successive searches will circulate
- * around the vma's virtual address space.
- *
- * So as more replacement pressure is applied to the pages in a nonlinear VMA,
- * more scanning pressure is placed against them as well. Eventually pages
- * will become fully unmapped and are eligible for eviction.
- *
- * For very sparsely populated VMAs this is a little inefficient - chances are
- * there there won't be many ptes located within the scan cluster. In this case
- * maybe we could scan further - to the end of the pte page, perhaps.
- *
- * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
- * acquire it without blocking. If vma locked, mlock the pages in the cluster,
- * rather than unmapping them. If we encounter the "check_page" that vmscan is
- * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
- */
-#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
-#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
-
-static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
- struct vm_area_struct *vma, struct page *check_page)
-{
- struct mm_struct *mm = vma->vm_mm;
- pmd_t *pmd;
- pte_t *pte;
- pte_t pteval;
- spinlock_t *ptl;
- struct page *page;
- unsigned long address;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
- unsigned long end;
- int ret = SWAP_AGAIN;
- int locked_vma = 0;
-
- address = (vma->vm_start + cursor) & CLUSTER_MASK;
- end = address + CLUSTER_SIZE;
- if (address < vma->vm_start)
- address = vma->vm_start;
- if (end > vma->vm_end)
- end = vma->vm_end;
-
- pmd = mm_find_pmd(mm, address);
- if (!pmd)
- return ret;
-
- mmun_start = address;
- mmun_end = end;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
- /*
- * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
- * keep the sem while scanning the cluster for mlocking pages.
- */
- if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
- locked_vma = (vma->vm_flags & VM_LOCKED);
- if (!locked_vma)
- up_read(&vma->vm_mm->mmap_sem); /* don't need it */
- }
-
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
-
- for (; address < end; pte++, address += PAGE_SIZE) {
- if (!pte_present(*pte))
- continue;
- page = vm_normal_page(vma, address, *pte);
- BUG_ON(!page || PageAnon(page));
-
- if (locked_vma) {
- if (page == check_page) {
- /* we know we have check_page locked */
- mlock_vma_page(page);
- ret = SWAP_MLOCK;
- } else if (trylock_page(page)) {
- /*
- * If we can lock the page, perform mlock.
- * Otherwise leave the page alone, it will be
- * eventually encountered again later.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- continue; /* don't unmap */
- }
-
- /*
- * No need for _notify because we're within an
- * mmu_notifier_invalidate_range_ {start|end} scope.
- */
- if (ptep_clear_flush_young(vma, address, pte))
- continue;
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
-
- /* If nonlinear, store the file page offset in the pte. */
- if (page->index != linear_page_index(vma, address)) {
- pte_t ptfile = pgoff_to_pte(page->index);
- if (pte_soft_dirty(pteval))
- ptfile = pte_file_mksoft_dirty(ptfile);
- set_pte_at(mm, address, pte, ptfile);
- }
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty(page);
-
- page_remove_rmap(page);
- page_cache_release(page);
- dec_mm_counter(mm, MM_FILEPAGES);
- (*mapcount)--;
- }
- pte_unmap_unlock(pte - 1, ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- if (locked_vma)
- up_read(&vma->vm_mm->mmap_sem);
- return ret;
-}
-
-static int try_to_unmap_nonlinear(struct page *page,
- struct address_space *mapping, void *arg)
-{
- struct vm_area_struct *vma;
- int ret = SWAP_AGAIN;
- unsigned long cursor;
- unsigned long max_nl_cursor = 0;
- unsigned long max_nl_size = 0;
- unsigned int mapcount;
-
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
- cursor = (unsigned long) vma->vm_private_data;
- if (cursor > max_nl_cursor)
- max_nl_cursor = cursor;
- cursor = vma->vm_end - vma->vm_start;
- if (cursor > max_nl_size)
- max_nl_size = cursor;
- }
-
- if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
- return SWAP_FAIL;
- }
-
- /*
- * We don't try to search for this page in the nonlinear vmas,
- * and page_referenced wouldn't have found it anyway. Instead
- * just walk the nonlinear vmas trying to age and unmap some.
- * The mapcount of the page we came in with is irrelevant,
- * but even so use it as a guide to how hard we should try?
- */
- mapcount = page_mapcount(page);
- if (!mapcount)
- return ret;
-
- cond_resched();
-
- max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
- if (max_nl_cursor == 0)
- max_nl_cursor = CLUSTER_SIZE;
-
- do {
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
- cursor = (unsigned long) vma->vm_private_data;
- while (cursor < max_nl_cursor &&
- cursor < vma->vm_end - vma->vm_start) {
- if (try_to_unmap_cluster(cursor, &mapcount,
- vma, page) == SWAP_MLOCK)
- ret = SWAP_MLOCK;
- cursor += CLUSTER_SIZE;
- vma->vm_private_data = (void *) cursor;
- if ((int)mapcount <= 0)
- return ret;
- }
- vma->vm_private_data = (void *) max_nl_cursor;
- }
- cond_resched();
- max_nl_cursor += CLUSTER_SIZE;
- } while (max_nl_cursor <= max_nl_size);
-
- /*
- * Don't loop forever (perhaps all the remaining pages are
- * in locked vmas). Reset cursor on all unreserved nonlinear
- * vmas, now forgetting on which ones it had fallen behind.
- */
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
- vma->vm_private_data = NULL;
-
- return ret;
-}
-
bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1526,7 +1359,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
.done = page_not_mapped,
- .file_nonlinear = try_to_unmap_nonlinear,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1572,12 +1404,6 @@ int try_to_munlock(struct page *page)
.rmap_one = try_to_unmap_one,
.arg = (void *)TTU_MUNLOCK,
.done = page_not_mapped,
- /*
- * We don't bother to try to find the munlocked page in
- * nonlinears. It's costly. Instead, later, page reclaim logic
- * may call try_to_unmap() and recover PG_mlocked lazily.
- */
- .file_nonlinear = NULL,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1708,13 +1534,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
goto done;
}
- if (!rwc->file_nonlinear)
- goto done;
-
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto done;
-
- ret = rwc->file_nonlinear(page, mapping, rwc->arg);
done:
i_mmap_unlock_read(mapping);
return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 73ba1df..b3e4031 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1013,7 +1013,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_migrate(oldpage, newpage, false);
+ mem_cgroup_migrate(oldpage, newpage, true);
lru_cache_add_anon(newpage);
*pagep = newpage;
}
@@ -3201,7 +3201,6 @@ static const struct vm_operations_struct shmem_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
- .remap_pages = generic_file_remap_pages,
};
static struct dentry *shmem_mount(struct file_system_type *fs_type,
diff --git a/mm/slab.h b/mm/slab.h
index 1cf40054..90430d6 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -235,7 +235,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
return 0;
if (is_root_cache(s))
return 0;
- return __memcg_charge_slab(s, gfp, order);
+ return memcg_charge_kmem(s->memcg_params->memcg, gfp, 1 << order);
}
static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,7 +244,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
return;
if (is_root_cache(s))
return;
- __memcg_uncharge_slab(s, order);
+ memcg_uncharge_kmem(s->memcg_params->memcg, 1 << order);
}
#else
static inline bool is_root_cache(struct kmem_cache *s)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e03dd6f..6e1e4cf 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -331,7 +331,7 @@ out:
out_free_cache:
memcg_free_cache_params(s);
- kfree(s);
+ kmem_cache_free(kmem_cache, s);
goto out;
}
@@ -425,21 +425,64 @@ out_unlock:
}
EXPORT_SYMBOL(kmem_cache_create);
+static int do_kmem_cache_shutdown(struct kmem_cache *s,
+ struct list_head *release, bool *need_rcu_barrier)
+{
+ if (__kmem_cache_shutdown(s) != 0) {
+ printk(KERN_ERR "kmem_cache_destroy %s: "
+ "Slab cache still has objects\n", s->name);
+ dump_stack();
+ return -EBUSY;
+ }
+
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ *need_rcu_barrier = true;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s)) {
+ struct kmem_cache *root_cache = s->memcg_params->root_cache;
+ int memcg_id = memcg_cache_id(s->memcg_params->memcg);
+
+ BUG_ON(root_cache->memcg_params->memcg_caches[memcg_id] != s);
+ root_cache->memcg_params->memcg_caches[memcg_id] = NULL;
+ }
+#endif
+ list_move(&s->list, release);
+ return 0;
+}
+
+static void do_kmem_cache_release(struct list_head *release,
+ bool need_rcu_barrier)
+{
+ struct kmem_cache *s, *s2;
+
+ if (need_rcu_barrier)
+ rcu_barrier();
+
+ list_for_each_entry_safe(s, s2, release, list) {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_remove(s);
+#else
+ slab_kmem_cache_release(s);
+#endif
+ }
+}
+
#ifdef CONFIG_MEMCG_KMEM
/*
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
* @root_cache: The parent of the new cache.
- * @memcg_name: The name of the memory cgroup (used for naming the new cache).
*
* This function attempts to create a kmem cache that will serve allocation
* requests going from @memcg to @root_cache. The new cache inherits properties
* from its parent.
*/
-struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache,
- const char *memcg_name)
+void memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
{
+ static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
+ int memcg_id = memcg_cache_id(memcg);
struct kmem_cache *s = NULL;
char *cache_name;
@@ -448,8 +491,18 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
mutex_lock(&slab_mutex);
+ /*
+ * Since per-memcg caches are created asynchronously on first
+ * allocation (see memcg_kmem_get_cache()), several threads can try to
+ * create the same cache, but only one of them may succeed.
+ */
+ if (cache_from_memcg_idx(root_cache, memcg_id))
+ goto out_unlock;
+
+ cgroup_name(mem_cgroup_css(memcg)->cgroup,
+ memcg_name_buf, sizeof(memcg_name_buf));
cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
- memcg_cache_id(memcg), memcg_name);
+ memcg_cache_id(memcg), memcg_name_buf);
if (!cache_name)
goto out_unlock;
@@ -457,49 +510,73 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
root_cache->size, root_cache->align,
root_cache->flags, root_cache->ctor,
memcg, root_cache);
+ /*
+ * If we could not create a memcg cache, do not complain, because
+ * that's not critical at all as we can always proceed with the root
+ * cache.
+ */
if (IS_ERR(s)) {
kfree(cache_name);
- s = NULL;
+ goto out_unlock;
}
+ /*
+ * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * barrier here to ensure nobody will see the kmem_cache partially
+ * initialized.
+ */
+ smp_wmb();
+ root_cache->memcg_params->memcg_caches[memcg_id] = s;
+
out_unlock:
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
-
- return s;
}
-static int memcg_cleanup_cache_params(struct kmem_cache *s)
+void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
- int rc;
+ LIST_HEAD(release);
+ bool need_rcu_barrier = false;
+ struct kmem_cache *s, *s2;
- if (!s->memcg_params ||
- !s->memcg_params->is_root_cache)
- return 0;
+ get_online_cpus();
+ get_online_mems();
- mutex_unlock(&slab_mutex);
- rc = __memcg_cleanup_cache_params(s);
mutex_lock(&slab_mutex);
+ list_for_each_entry_safe(s, s2, &slab_caches, list) {
+ if (is_root_cache(s) || s->memcg_params->memcg != memcg)
+ continue;
+ /*
+ * The cgroup is about to be freed and therefore has no charges
+ * left. Hence, all its caches must be empty by now.
+ */
+ BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
+ }
+ mutex_unlock(&slab_mutex);
- return rc;
-}
-#else
-static int memcg_cleanup_cache_params(struct kmem_cache *s)
-{
- return 0;
+ put_online_mems();
+ put_online_cpus();
+
+ do_kmem_cache_release(&release, need_rcu_barrier);
}
#endif /* CONFIG_MEMCG_KMEM */
void slab_kmem_cache_release(struct kmem_cache *s)
{
+ memcg_free_cache_params(s);
kfree(s->name);
kmem_cache_free(kmem_cache, s);
}
void kmem_cache_destroy(struct kmem_cache *s)
{
+ int i;
+ LIST_HEAD(release);
+ bool need_rcu_barrier = false;
+ bool busy = false;
+
get_online_cpus();
get_online_mems();
@@ -509,35 +586,23 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
- if (memcg_cleanup_cache_params(s) != 0)
- goto out_unlock;
+ for_each_memcg_cache_index(i) {
+ struct kmem_cache *c = cache_from_memcg_idx(s, i);
- if (__kmem_cache_shutdown(s) != 0) {
- printk(KERN_ERR "kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
- dump_stack();
- goto out_unlock;
+ if (c && do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
+ busy = true;
}
- list_del(&s->list);
-
- mutex_unlock(&slab_mutex);
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
-
- memcg_free_cache_params(s);
-#ifdef SLAB_SUPPORTS_SYSFS
- sysfs_slab_remove(s);
-#else
- slab_kmem_cache_release(s);
-#endif
- goto out;
+ if (!busy)
+ do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
out_unlock:
mutex_unlock(&slab_mutex);
-out:
+
put_online_mems();
put_online_cpus();
+
+ do_kmem_cache_release(&release, need_rcu_barrier);
}
EXPORT_SYMBOL(kmem_cache_destroy);
diff --git a/mm/slub.c b/mm/slub.c
index fe376fe..8b8508a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2398,13 +2398,24 @@ redo:
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
- * Preemption is disabled for the retrieval of the tid because that
- * must occur from the current processor. We cannot allow rescheduling
- * on a different processor between the determination of the pointer
- * and the retrieval of the tid.
+ * We should guarantee that tid and kmem_cache are retrieved on
+ * the same cpu. It could be different if CONFIG_PREEMPT so we need
+ * to check if it is matched or not.
*/
- preempt_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ do {
+ tid = this_cpu_read(s->cpu_slab->tid);
+ c = raw_cpu_ptr(s->cpu_slab);
+ } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
+
+ /*
+ * Irqless object alloc/free algorithm used here depends on sequence
+ * of fetching cpu_slab's data. tid should be fetched before anything
+ * on c to guarantee that object and page associated with previous tid
+ * won't be used with current tid. If we fetch tid first, object and
+ * page could be one associated with next tid and our alloc/free
+ * request will be failed. In this case, we will retry. So, no problem.
+ */
+ barrier();
/*
* The transaction ids are globally unique per cpu and per operation on
@@ -2412,8 +2423,6 @@ redo:
* occurs on the right processor and that there was no operation on the
* linked list in between.
*/
- tid = c->tid;
- preempt_enable();
object = c->freelist;
page = c->page;
@@ -2512,7 +2521,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
/*
- * Slow patch handling. This may still be called frequently since objects
+ * Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
*
* So we still attempt to reduce cache line usage. Just take the slab
@@ -2659,11 +2668,13 @@ redo:
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succedd.
*/
- preempt_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ do {
+ tid = this_cpu_read(s->cpu_slab->tid);
+ c = raw_cpu_ptr(s->cpu_slab);
+ } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
- tid = c->tid;
- preempt_enable();
+ /* Same with comment on barrier() in slab_alloc_node() */
+ barrier();
if (likely(page == c->page)) {
set_freepointer(s, object, c->freelist);
diff --git a/mm/swap.c b/mm/swap.c
index 8a12b33..5b30872 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1140,10 +1140,8 @@ void __init swap_setup(void)
if (bdi_init(swapper_spaces[0].backing_dev_info))
panic("Failed to init swap bdi");
- for (i = 0; i < MAX_SWAPFILES; i++) {
+ for (i = 0; i < MAX_SWAPFILES; i++)
spin_lock_init(&swapper_spaces[i].tree_lock);
- INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
- }
#endif
/* Use a smaller cluster for small-memory machines */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index bd9a72b..dcd90c8 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2656,7 +2656,7 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist,
* should make reasonable progress.
*/
for_each_zone_zonelist_nodemask(zone, z, zonelist,
- gfp_mask, nodemask) {
+ gfp_zone(gfp_mask), nodemask) {
if (zone_idx(zone) > ZONE_NORMAL)
continue;
@@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
return false;
/*
- * There is a potential race between when kswapd checks its watermarks
- * and a process gets throttled. There is also a potential race if
- * processes get throttled, kswapd wakes, a large process exits therby
- * balancing the zones that causes kswapd to miss a wakeup. If kswapd
- * is going to sleep, no process should be sleeping on pfmemalloc_wait
- * so wake them now if necessary. If necessary, processes will wake
- * kswapd and get throttled again
+ * The throttled processes are normally woken up in balance_pgdat() as
+ * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+ * race between when kswapd checks the watermarks and a process gets
+ * throttled. There is also a potential race if processes get
+ * throttled, kswapd wakes, a large process exits thereby balancing the
+ * zones, which causes kswapd to exit balance_pgdat() before reaching
+ * the wake up checks. If kswapd is going to sleep, no process should
+ * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+ * the wake up is premature, processes will wake kswapd and get
+ * throttled again. The difference from wake ups in balance_pgdat() is
+ * that here we are under prepare_to_wait().
*/
- if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
- wake_up(&pgdat->pfmemalloc_wait);
- return false;
- }
+ if (waitqueue_active(&pgdat->pfmemalloc_wait))
+ wake_up_all(&pgdat->pfmemalloc_wait);
return pgdat_balanced(pgdat, order, classzone_idx);
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1284f89..9943e5f 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,9 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
@@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order)
}
#endif
-#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Reclaimable",
- "Movable",
- "Reserve",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
-static void *frag_start(struct seq_file *m, loff_t *pos)
-{
- pg_data_t *pgdat;
- loff_t node = *pos;
- for (pgdat = first_online_pgdat();
- pgdat && node;
- pgdat = next_online_pgdat(pgdat))
- --node;
-
- return pgdat;
-}
-
-static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
-{
- pg_data_t *pgdat = (pg_data_t *)arg;
-
- (*pos)++;
- return next_online_pgdat(pgdat);
-}
-
-static void frag_stop(struct seq_file *m, void *arg)
-{
-}
-
-/* Walk all the zones in a node and print using a callback */
-static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
- void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
-{
- struct zone *zone;
- struct zone *node_zones = pgdat->node_zones;
- unsigned long flags;
-
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
- continue;
-
- spin_lock_irqsave(&zone->lock, flags);
- print(m, pgdat, zone);
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-}
-#endif
-
#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
@@ -907,7 +850,66 @@ const char * const vmstat_text[] = {
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
+#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
+ defined(CONFIG_PROC_FS)
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+ pg_data_t *pgdat;
+ loff_t node = *pos;
+
+ for (pgdat = first_online_pgdat();
+ pgdat && node;
+ pgdat = next_online_pgdat(pgdat))
+ --node;
+
+ return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ (*pos)++;
+ return next_online_pgdat(pgdat);
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/* Walk all the zones in a node and print using a callback */
+static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
+{
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!populated_zone(zone))
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ print(m, pgdat, zone);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
+#endif
+
#ifdef CONFIG_PROC_FS
+static char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Reclaimable",
+ "Movable",
+ "Reserve",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void)
module_init(setup_vmstat)
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
-#include <linux/debugfs.h>
-
/*
* Return an index indicating how much of the available free memory is