summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig3
-rw-r--r--mm/Makefile3
-rw-r--r--mm/backing-dev.c5
-rw-r--r--mm/compaction.c26
-rw-r--r--mm/debug-pagealloc.c56
-rw-r--r--mm/filemap.c3
-rw-r--r--mm/highmem.c2
-rw-r--r--mm/huge_memory.c54
-rw-r--r--mm/ksm.c3
-rw-r--r--mm/memblock.c11
-rw-r--r--mm/memcontrol.c3
-rw-r--r--mm/memory-failure.c12
-rw-r--r--mm/mempolicy.c2
-rw-r--r--mm/migrate.c83
-rw-r--r--mm/mlock.c13
-rw-r--r--mm/mmap.c9
-rw-r--r--mm/mremap.c42
-rw-r--r--mm/oom_kill.c53
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c17
-rw-r--r--mm/page_cgroup.c3
-rw-r--r--mm/process_vm_access.c496
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/shmem.c10
-rw-r--r--mm/slab.c19
-rw-r--r--mm/slub.c605
-rw-r--r--mm/swapfile.c2
-rw-r--r--mm/thrash.c2
-rw-r--r--mm/vmalloc.c80
-rw-r--r--mm/vmscan.c327
-rw-r--r--mm/vmstat.c3
31 files changed, 1391 insertions, 562 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f2f1ca1..011b110 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -131,6 +131,9 @@ config SPARSEMEM_VMEMMAP
config HAVE_MEMBLOCK
boolean
+config NO_BOOTMEM
+ boolean
+
# eventually, we can have this option just 'select SPARSEMEM'
config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
diff --git a/mm/Makefile b/mm/Makefile
index 836e416..50ec00e 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -5,7 +5,8 @@
mmu-y := nommu.o
mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
- vmalloc.o pagewalk.o pgtable-generic.o
+ vmalloc.o pagewalk.o pgtable-generic.o \
+ process_vm_access.o
obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index a87da52..7520ef0 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -404,9 +404,8 @@ static int bdi_forker_thread(void *ptr)
/*
* In the following loop we are going to check whether we have
* some work to do without any synchronization with tasks
- * waking us up to do work for them. So we have to set task
- * state already here so that we don't miss wakeups coming
- * after we verify some condition.
+ * waking us up to do work for them. Set the task state here
+ * so that we don't miss wakeups after verifying conditions.
*/
set_current_state(TASK_INTERRUPTIBLE);
diff --git a/mm/compaction.c b/mm/compaction.c
index 6cc604b..899d956 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -35,10 +35,6 @@ struct compact_control {
unsigned long migrate_pfn; /* isolate_migratepages search base */
bool sync; /* Synchronous migration */
- /* Account for isolated anon and file pages */
- unsigned long nr_anon;
- unsigned long nr_file;
-
unsigned int order; /* order a direct compactor needs */
int migratetype; /* MOVABLE, RECLAIMABLE etc */
struct zone *zone;
@@ -223,17 +219,13 @@ static void isolate_freepages(struct zone *zone,
static void acct_isolated(struct zone *zone, struct compact_control *cc)
{
struct page *page;
- unsigned int count[NR_LRU_LISTS] = { 0, };
+ unsigned int count[2] = { 0, };
- list_for_each_entry(page, &cc->migratepages, lru) {
- int lru = page_lru_base_type(page);
- count[lru]++;
- }
+ list_for_each_entry(page, &cc->migratepages, lru)
+ count[!!page_is_file_cache(page)]++;
- cc->nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON];
- cc->nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE];
- __mod_zone_page_state(zone, NR_ISOLATED_ANON, cc->nr_anon);
- __mod_zone_page_state(zone, NR_ISOLATED_FILE, cc->nr_file);
+ __mod_zone_page_state(zone, NR_ISOLATED_ANON, count[0]);
+ __mod_zone_page_state(zone, NR_ISOLATED_FILE, count[1]);
}
/* Similar to reclaim, but different enough that they don't share logic */
@@ -269,6 +261,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
unsigned long last_pageblock_nr = 0, pageblock_nr;
unsigned long nr_scanned = 0, nr_isolated = 0;
struct list_head *migratelist = &cc->migratepages;
+ isolate_mode_t mode = ISOLATE_ACTIVE|ISOLATE_INACTIVE;
/* Do not scan outside zone boundaries */
low_pfn = max(cc->migrate_pfn, zone->zone_start_pfn);
@@ -356,8 +349,11 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
continue;
}
+ if (!cc->sync)
+ mode |= ISOLATE_CLEAN;
+
/* Try isolate the page */
- if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0)
+ if (__isolate_lru_page(page, mode, 0) != 0)
continue;
VM_BUG_ON(PageTransCompound(page));
@@ -586,7 +582,7 @@ out:
return ret;
}
-unsigned long compact_zone_order(struct zone *zone,
+static unsigned long compact_zone_order(struct zone *zone,
int order, gfp_t gfp_mask,
bool sync)
{
diff --git a/mm/debug-pagealloc.c b/mm/debug-pagealloc.c
index a1e3324..7cea557 100644
--- a/mm/debug-pagealloc.c
+++ b/mm/debug-pagealloc.c
@@ -1,7 +1,10 @@
#include <linux/kernel.h>
+#include <linux/string.h>
#include <linux/mm.h>
+#include <linux/highmem.h>
#include <linux/page-debug-flags.h>
#include <linux/poison.h>
+#include <linux/ratelimit.h>
static inline void set_page_poison(struct page *page)
{
@@ -18,28 +21,13 @@ static inline bool page_poison(struct page *page)
return test_bit(PAGE_DEBUG_FLAG_POISON, &page->debug_flags);
}
-static void poison_highpage(struct page *page)
-{
- /*
- * Page poisoning for highmem pages is not implemented.
- *
- * This can be called from interrupt contexts.
- * So we need to create a new kmap_atomic slot for this
- * application and it will need interrupt protection.
- */
-}
-
static void poison_page(struct page *page)
{
- void *addr;
+ void *addr = kmap_atomic(page);
- if (PageHighMem(page)) {
- poison_highpage(page);
- return;
- }
set_page_poison(page);
- addr = page_address(page);
memset(addr, PAGE_POISON, PAGE_SIZE);
+ kunmap_atomic(addr);
}
static void poison_pages(struct page *page, int n)
@@ -59,14 +47,12 @@ static bool single_bit_flip(unsigned char a, unsigned char b)
static void check_poison_mem(unsigned char *mem, size_t bytes)
{
+ static DEFINE_RATELIMIT_STATE(ratelimit, 5 * HZ, 10);
unsigned char *start;
unsigned char *end;
- for (start = mem; start < mem + bytes; start++) {
- if (*start != PAGE_POISON)
- break;
- }
- if (start == mem + bytes)
+ start = memchr_inv(mem, PAGE_POISON, bytes);
+ if (!start)
return;
for (end = mem + bytes - 1; end > start; end--) {
@@ -74,7 +60,7 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
break;
}
- if (!printk_ratelimit())
+ if (!__ratelimit(&ratelimit))
return;
else if (start == end && single_bit_flip(*start, PAGE_POISON))
printk(KERN_ERR "pagealloc: single bit error\n");
@@ -86,27 +72,17 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
dump_stack();
}
-static void unpoison_highpage(struct page *page)
-{
- /*
- * See comment in poison_highpage().
- * Highmem pages should not be poisoned for now
- */
- BUG_ON(page_poison(page));
-}
-
static void unpoison_page(struct page *page)
{
- if (PageHighMem(page)) {
- unpoison_highpage(page);
+ void *addr;
+
+ if (!page_poison(page))
return;
- }
- if (page_poison(page)) {
- void *addr = page_address(page);
- check_poison_mem(addr, PAGE_SIZE);
- clear_page_poison(page);
- }
+ addr = kmap_atomic(page);
+ check_poison_mem(addr, PAGE_SIZE);
+ clear_page_poison(page);
+ kunmap_atomic(addr);
}
static void unpoison_pages(struct page *page, int n)
diff --git a/mm/filemap.c b/mm/filemap.c
index 7771871..5cf820a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2115,6 +2115,7 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
} else {
const struct iovec *iov = i->iov;
size_t base = i->iov_offset;
+ unsigned long nr_segs = i->nr_segs;
/*
* The !iov->iov_len check ensures we skip over unlikely
@@ -2130,11 +2131,13 @@ void iov_iter_advance(struct iov_iter *i, size_t bytes)
base += copy;
if (iov->iov_len == base) {
iov++;
+ nr_segs--;
base = 0;
}
}
i->iov = iov;
i->iov_offset = base;
+ i->nr_segs = nr_segs;
}
}
EXPORT_SYMBOL(iov_iter_advance);
diff --git a/mm/highmem.c b/mm/highmem.c
index 5ef672c..e159a7b 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -250,7 +250,7 @@ void *kmap_high_get(struct page *page)
#endif
/**
- * kunmap_high - map a highmem page into memory
+ * kunmap_high - unmap a highmem page into memory
* @page: &struct page to unmap
*
* If ARCH_NEEDS_KMAP_HIGH_GET is not defined then this may be called
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index e2d1587..860ec21 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -89,7 +89,8 @@ struct khugepaged_scan {
struct list_head mm_head;
struct mm_slot *mm_slot;
unsigned long address;
-} khugepaged_scan = {
+};
+static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
@@ -829,7 +830,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
for (i = 0; i < HPAGE_PMD_NR; i++) {
copy_user_highpage(pages[i], page + i,
- haddr + PAGE_SHIFT*i, vma);
+ haddr + PAGE_SIZE * i, vma);
__SetPageUptodate(pages[i]);
cond_resched();
}
@@ -1052,6 +1053,51 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
return ret;
}
+int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
+ unsigned long old_addr,
+ unsigned long new_addr, unsigned long old_end,
+ pmd_t *old_pmd, pmd_t *new_pmd)
+{
+ int ret = 0;
+ pmd_t pmd;
+
+ struct mm_struct *mm = vma->vm_mm;
+
+ if ((old_addr & ~HPAGE_PMD_MASK) ||
+ (new_addr & ~HPAGE_PMD_MASK) ||
+ old_end - old_addr < HPAGE_PMD_SIZE ||
+ (new_vma->vm_flags & VM_NOHUGEPAGE))
+ goto out;
+
+ /*
+ * The destination pmd shouldn't be established, free_pgtables()
+ * should have release it.
+ */
+ if (WARN_ON(!pmd_none(*new_pmd))) {
+ VM_BUG_ON(pmd_trans_huge(*new_pmd));
+ goto out;
+ }
+
+ spin_lock(&mm->page_table_lock);
+ if (likely(pmd_trans_huge(*old_pmd))) {
+ if (pmd_trans_splitting(*old_pmd)) {
+ spin_unlock(&mm->page_table_lock);
+ wait_split_huge_page(vma->anon_vma, old_pmd);
+ ret = -1;
+ } else {
+ pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
+ VM_BUG_ON(!pmd_none(*new_pmd));
+ set_pmd_at(mm, new_addr, new_pmd, pmd);
+ spin_unlock(&mm->page_table_lock);
+ ret = 1;
+ }
+ } else {
+ spin_unlock(&mm->page_table_lock);
+ }
+out:
+ return ret;
+}
+
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot)
{
@@ -1906,7 +1952,7 @@ static void collapse_huge_page(struct mm_struct *mm,
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
set_pmd_at(mm, address, pmd, _pmd);
- update_mmu_cache(vma, address, entry);
+ update_mmu_cache(vma, address, _pmd);
prepare_pmd_huge_pte(pgtable, mm);
mm->nr_ptes--;
spin_unlock(&mm->page_table_lock);
@@ -2024,6 +2070,8 @@ static void collect_mm_slot(struct mm_slot *mm_slot)
static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
struct page **hpage)
+ __releases(&khugepaged_mm_lock)
+ __acquires(&khugepaged_mm_lock)
{
struct mm_slot *mm_slot;
struct mm_struct *mm;
diff --git a/mm/ksm.c b/mm/ksm.c
index 9a68b0c..310544a 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1905,7 +1905,8 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
err = unmerge_and_remove_all_rmap_items();
- test_set_oom_score_adj(oom_score_adj);
+ compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX,
+ oom_score_adj);
if (err) {
ksm_run = KSM_RUN_STOP;
count = err;
diff --git a/mm/memblock.c b/mm/memblock.c
index ccbf973..84bec49 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -58,7 +58,8 @@ static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, p
return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
}
-long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
+ phys_addr_t base, phys_addr_t size)
{
unsigned long i;
@@ -267,7 +268,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type)
return 0;
}
-extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
+int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
phys_addr_t addr2, phys_addr_t size2)
{
return 1;
@@ -626,6 +627,12 @@ phys_addr_t __init memblock_phys_mem_size(void)
return memblock.memory_size;
}
+/* lowest address */
+phys_addr_t __init_memblock memblock_start_of_DRAM(void)
+{
+ return memblock.memory.regions[0].base;
+}
+
phys_addr_t __init_memblock memblock_end_of_DRAM(void)
{
int idx = memblock.memory.cnt - 1;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3508777..2d57555 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1185,7 +1185,8 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page)
unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
struct list_head *dst,
unsigned long *scanned, int order,
- int mode, struct zone *z,
+ isolate_mode_t mode,
+ struct zone *z,
struct mem_cgroup *mem_cont,
int active, int file)
{
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 2b43ba0..edc388d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1310,7 +1310,7 @@ int unpoison_memory(unsigned long pfn)
* to the end.
*/
if (PageHuge(page)) {
- pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
+ pr_info("MCE: Memory failure is now running on free hugepage %#lx\n", pfn);
return 0;
}
if (TestClearPageHWPoison(p))
@@ -1419,7 +1419,7 @@ static int soft_offline_huge_page(struct page *page, int flags)
if (PageHWPoison(hpage)) {
put_page(hpage);
- pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn);
+ pr_info("soft offline: %#lx hugepage already poisoned\n", pfn);
return -EBUSY;
}
@@ -1433,8 +1433,8 @@ static int soft_offline_huge_page(struct page *page, int flags)
list_for_each_entry_safe(page1, page2, &pagelist, lru)
put_page(page1);
- pr_debug("soft offline: %#lx: migration failed %d, type %lx\n",
- pfn, ret, page->flags);
+ pr_info("soft offline: %#lx: migration failed %d, type %lx\n",
+ pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
return ret;
@@ -1505,7 +1505,7 @@ int soft_offline_page(struct page *page, int flags)
}
if (!PageLRU(page)) {
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
- pfn, page->flags);
+ pfn, page->flags);
return -EIO;
}
@@ -1566,7 +1566,7 @@ int soft_offline_page(struct page *page, int flags)
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
- pfn, ret, page_count(page), page->flags);
+ pfn, ret, page_count(page), page->flags);
}
if (ret)
return ret;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9c51f9f..cd237f4 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -111,7 +111,7 @@ enum zone_type policy_zone = 0;
/*
* run-time system-wide default policy => local allocation
*/
-struct mempolicy default_policy = {
+static struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
.mode = MPOL_PREFERRED,
.flags = MPOL_F_LOCAL,
diff --git a/mm/migrate.c b/mm/migrate.c
index 666e4e6..33358f8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -120,10 +120,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
ptep = pte_offset_map(pmd, addr);
- if (!is_swap_pte(*ptep)) {
- pte_unmap(ptep);
- goto out;
- }
+ /*
+ * Peek to check is_swap_pte() before taking ptlock? No, we
+ * can race mremap's move_ptes(), which skips anon_vma lock.
+ */
ptl = pte_lockptr(mm, pmd);
}
@@ -621,38 +621,18 @@ static int move_to_new_page(struct page *newpage, struct page *page,
return rc;
}
-/*
- * Obtain the lock on page, remove all ptes and migrate the page
- * to the newly allocated page in newpage.
- */
-static int unmap_and_move(new_page_t get_new_page, unsigned long private,
- struct page *page, int force, bool offlining, bool sync)
+static int __unmap_and_move(struct page *page, struct page *newpage,
+ int force, bool offlining, bool sync)
{
- int rc = 0;
- int *result = NULL;
- struct page *newpage = get_new_page(page, private, &result);
+ int rc = -EAGAIN;
int remap_swapcache = 1;
int charge = 0;
struct mem_cgroup *mem;
struct anon_vma *anon_vma = NULL;
- if (!newpage)
- return -ENOMEM;
-
- if (page_count(page) == 1) {
- /* page was freed from under us. So we are done. */
- goto move_newpage;
- }
- if (unlikely(PageTransHuge(page)))
- if (unlikely(split_huge_page(page)))
- goto move_newpage;
-
- /* prepare cgroup just returns 0 or -ENOMEM */
- rc = -EAGAIN;
-
if (!trylock_page(page)) {
if (!force || !sync)
- goto move_newpage;
+ goto out;
/*
* It's not safe for direct compaction to call lock_page.
@@ -668,7 +648,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private,
* altogether.
*/
if (current->flags & PF_MEMALLOC)
- goto move_newpage;
+ goto out;
lock_page(page);
}
@@ -785,27 +765,52 @@ uncharge:
mem_cgroup_end_migration(mem, page, newpage, rc == 0);
unlock:
unlock_page(page);
+out:
+ return rc;
+}
-move_newpage:
+/*
+ * Obtain the lock on page, remove all ptes and migrate the page
+ * to the newly allocated page in newpage.
+ */
+static int unmap_and_move(new_page_t get_new_page, unsigned long private,
+ struct page *page, int force, bool offlining, bool sync)
+{
+ int rc = 0;
+ int *result = NULL;
+ struct page *newpage = get_new_page(page, private, &result);
+
+ if (!newpage)
+ return -ENOMEM;
+
+ if (page_count(page) == 1) {
+ /* page was freed from under us. So we are done. */
+ goto out;
+ }
+
+ if (unlikely(PageTransHuge(page)))
+ if (unlikely(split_huge_page(page)))
+ goto out;
+
+ rc = __unmap_and_move(page, newpage, force, offlining, sync);
+out:
if (rc != -EAGAIN) {
- /*
- * A page that has been migrated has all references
- * removed and will be freed. A page that has not been
- * migrated will have kepts its references and be
- * restored.
- */
- list_del(&page->lru);
+ /*
+ * A page that has been migrated has all references
+ * removed and will be freed. A page that has not been
+ * migrated will have kepts its references and be
+ * restored.
+ */
+ list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
putback_lru_page(page);
}
-
/*
* Move the new page to the LRU. If migration was not successful
* then this will free the page.
*/
putback_lru_page(newpage);
-
if (result) {
if (rc)
*result = rc;
diff --git a/mm/mlock.c b/mm/mlock.c
index 048260c..bd34b3a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -110,7 +110,15 @@ void munlock_vma_page(struct page *page)
if (TestClearPageMlocked(page)) {
dec_zone_page_state(page, NR_MLOCK);
if (!isolate_lru_page(page)) {
- int ret = try_to_munlock(page);
+ int ret = SWAP_AGAIN;
+
+ /*
+ * Optimization: if the page was mapped just once,
+ * that's our mapping and we don't need to check all the
+ * other vmas.
+ */
+ if (page_mapcount(page) > 1)
+ ret = try_to_munlock(page);
/*
* did try_to_unlock() succeed or punt?
*/
@@ -549,7 +557,8 @@ SYSCALL_DEFINE1(mlockall, int, flags)
if (!can_do_mlock())
goto out;
- lru_add_drain_all(); /* flush pagevec */
+ if (flags & MCL_CURRENT)
+ lru_add_drain_all(); /* flush pagevec */
down_write(&current->mm->mmap_sem);
diff --git a/mm/mmap.c b/mm/mmap.c
index a65efd4..3c0061f 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2558,7 +2558,6 @@ int mm_take_all_locks(struct mm_struct *mm)
{
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- int ret = -EINTR;
BUG_ON(down_read_trylock(&mm->mmap_sem));
@@ -2579,13 +2578,11 @@ int mm_take_all_locks(struct mm_struct *mm)
vm_lock_anon_vma(mm, avc->anon_vma);
}
- ret = 0;
+ return 0;
out_unlock:
- if (ret)
- mm_drop_all_locks(mm);
-
- return ret;
+ mm_drop_all_locks(mm);
+ return -EINTR;
}
static void vm_unlock_anon_vma(struct anon_vma *anon_vma)
diff --git a/mm/mremap.c b/mm/mremap.c
index 506fa44..d6959cb 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -41,8 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return NULL;
pmd = pmd_offset(pud, addr);
- split_huge_page_pmd(mm, pmd);
- if (pmd_none_or_clear_bad(pmd))
+ if (pmd_none(*pmd))
return NULL;
return pmd;
@@ -65,8 +64,6 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
return NULL;
VM_BUG_ON(pmd_trans_huge(*pmd));
- if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr))
- return NULL;
return pmd;
}
@@ -80,11 +77,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
struct mm_struct *mm = vma->vm_mm;
pte_t *old_pte, *new_pte, pte;
spinlock_t *old_ptl, *new_ptl;
- unsigned long old_start;
- old_start = old_addr;
- mmu_notifier_invalidate_range_start(vma->vm_mm,
- old_start, old_end);
if (vma->vm_file) {
/*
* Subtle point from Rajesh Venkatasubramanian: before
@@ -111,7 +104,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
new_pte++, new_addr += PAGE_SIZE) {
if (pte_none(*old_pte))
continue;
- pte = ptep_clear_flush(vma, old_addr, old_pte);
+ pte = ptep_get_and_clear(mm, old_addr, old_pte);
pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
set_pte_at(mm, new_addr, new_pte, pte);
}
@@ -123,7 +116,6 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
pte_unmap_unlock(old_pte - 1, old_ptl);
if (mapping)
mutex_unlock(&mapping->i_mmap_mutex);
- mmu_notifier_invalidate_range_end(vma->vm_mm, old_start, old_end);
}
#define LATENCY_LIMIT (64 * PAGE_SIZE)
@@ -134,22 +126,43 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
{
unsigned long extent, next, old_end;
pmd_t *old_pmd, *new_pmd;
+ bool need_flush = false;
old_end = old_addr + len;
flush_cache_range(vma, old_addr, old_end);
+ mmu_notifier_invalidate_range_start(vma->vm_mm, old_addr, old_end);
+
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
next = (old_addr + PMD_SIZE) & PMD_MASK;
- if (next - 1 > old_end)
- next = old_end;
+ /* even if next overflowed, extent below will be ok */
extent = next - old_addr;
+ if (extent > old_end - old_addr)
+ extent = old_end - old_addr;
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
+ if (pmd_trans_huge(*old_pmd)) {
+ int err = 0;
+ if (extent == HPAGE_PMD_SIZE)
+ err = move_huge_pmd(vma, new_vma, old_addr,
+ new_addr, old_end,
+ old_pmd, new_pmd);
+ if (err > 0) {
+ need_flush = true;
+ continue;
+ } else if (!err) {
+ split_huge_page_pmd(vma->vm_mm, old_pmd);
+ }
+ VM_BUG_ON(pmd_trans_huge(*old_pmd));
+ }
+ if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
+ new_pmd, new_addr))
+ break;
next = (new_addr + PMD_SIZE) & PMD_MASK;
if (extent > next - new_addr)
extent = next - new_addr;
@@ -157,7 +170,12 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
extent = LATENCY_LIMIT;
move_ptes(vma, old_pmd, old_addr, old_addr + extent,
new_vma, new_pmd, new_addr);
+ need_flush = true;
}
+ if (likely(need_flush))
+ flush_tlb_range(vma, old_end-len, old_addr);
+
+ mmu_notifier_invalidate_range_end(vma->vm_mm, old_end-len, old_end);
return len + old_addr - old_end; /* how much done */
}
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 626303b..e916168 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -32,12 +32,32 @@
#include <linux/mempolicy.h>
#include <linux/security.h>
#include <linux/ptrace.h>
+#include <linux/freezer.h>
int sysctl_panic_on_oom;
int sysctl_oom_kill_allocating_task;
int sysctl_oom_dump_tasks = 1;
static DEFINE_SPINLOCK(zone_scan_lock);
+/*
+ * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj
+ * @old_val: old oom_score_adj for compare
+ * @new_val: new oom_score_adj for swap
+ *
+ * Sets the oom_score_adj value for current to @new_val iff its present value is
+ * @old_val. Usually used to reinstate a previous value to prevent racing with
+ * userspacing tuning the value in the interim.
+ */
+void compare_swap_oom_score_adj(int old_val, int new_val)
+{
+ struct sighand_struct *sighand = current->sighand;
+
+ spin_lock_irq(&sighand->siglock);
+ if (current->signal->oom_score_adj == old_val)
+ current->signal->oom_score_adj = new_val;
+ spin_unlock_irq(&sighand->siglock);
+}
+
/**
* test_set_oom_score_adj() - set current's oom_score_adj and return old value
* @new_val: new oom_score_adj value
@@ -53,13 +73,7 @@ int test_set_oom_score_adj(int new_val)
spin_lock_irq(&sighand->siglock);
old_val = current->signal->oom_score_adj;
- if (new_val != old_val) {
- if (new_val == OOM_SCORE_ADJ_MIN)
- atomic_inc(&current->mm->oom_disable_count);
- else if (old_val == OOM_SCORE_ADJ_MIN)
- atomic_dec(&current->mm->oom_disable_count);
- current->signal->oom_score_adj = new_val;
- }
+ current->signal->oom_score_adj = new_val;
spin_unlock_irq(&sighand->siglock);
return old_val;
@@ -172,16 +186,6 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
return 0;
/*
- * Shortcut check for a thread sharing p->mm that is OOM_SCORE_ADJ_MIN
- * so the entire heuristic doesn't need to be executed for something
- * that cannot be killed.
- */
- if (atomic_read(&p->mm->oom_disable_count)) {
- task_unlock(p);
- return 0;
- }
-
- /*
* The memory controller may have a limit of 0 bytes, so avoid a divide
* by zero, if necessary.
*/
@@ -317,8 +321,11 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
* blocked waiting for another task which itself is waiting
* for memory. Is there a better alternative?
*/
- if (test_tsk_thread_flag(p, TIF_MEMDIE))
+ if (test_tsk_thread_flag(p, TIF_MEMDIE)) {
+ if (unlikely(frozen(p)))
+ thaw_process(p);
return ERR_PTR(-1UL);
+ }
if (!p->mm)
continue;
@@ -435,7 +442,7 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
task_unlock(p);
/*
- * Kill all processes sharing p->mm in other thread groups, if any.
+ * Kill all user processes sharing p->mm in other thread groups, if any.
* They don't get access to memory reserves or a higher scheduler
* priority, though, to avoid depletion of all memory or task
* starvation. This prevents mm->mmap_sem livelock when an oom killed
@@ -445,7 +452,11 @@ static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
* signal.
*/
for_each_process(q)
- if (q->mm == mm && !same_thread_group(q, p)) {
+ if (q->mm == mm && !same_thread_group(q, p) &&
+ !(q->flags & PF_KTHREAD)) {
+ if (q->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
+ continue;
+
task_lock(q); /* Protect ->comm from prctl() */
pr_err("Kill process %d (%s) sharing same memory\n",
task_pid_nr(q), q->comm);
@@ -722,7 +733,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
!oom_unkillable_task(current, NULL, nodemask) &&
- current->mm && !atomic_read(&current->mm->oom_disable_count)) {
+ current->mm) {
/*
* oom_kill_process() needs tasklist_lock held. If it returns
* non-zero, current could not be killed so we must fallback to
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0e309cd..793e987 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -305,7 +305,9 @@ static unsigned long task_min_dirty_limit(unsigned long bdi_dirty)
}
/*
- *
+ * bdi_min_ratio keeps the sum of the minimum dirty shares of all
+ * registered backing devices, which, for obvious reasons, can not
+ * exceed 100%.
*/
static unsigned int bdi_min_ratio;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 6e8ecb6..9dd443d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -318,6 +318,7 @@ static void bad_page(struct page *page)
current->comm, page_to_pfn(page));
dump_page(page);
+ print_modules();
dump_stack();
out:
/* Leave bad fields for debug, except PageBuddy could make trouble */
@@ -1753,7 +1754,6 @@ static DEFINE_RATELIMIT_STATE(nopage_rs,
void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
{
- va_list args;
unsigned int filter = SHOW_MEM_FILTER_NODES;
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
@@ -1772,14 +1772,21 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...)
filter &= ~SHOW_MEM_FILTER_NODES;
if (fmt) {
- printk(KERN_WARNING);
+ struct va_format vaf;
+ va_list args;
+
va_start(args, fmt);
- vprintk(fmt, args);
+
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ pr_warn("%pV", &vaf);
+
va_end(args);
}
- pr_warning("%s: page allocation failure: order:%d, mode:0x%x\n",
- current->comm, order, gfp_mask);
+ pr_warn("%s: page allocation failure: order:%d, mode:0x%x\n",
+ current->comm, order, gfp_mask);
dump_stack();
if (!should_suppress_show_mem())
diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c
index 39d216d..6bdc67d 100644
--- a/mm/page_cgroup.c
+++ b/mm/page_cgroup.c
@@ -513,11 +513,10 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array_size = length * sizeof(void *);
- array = vmalloc(array_size);
+ array = vzalloc(array_size);
if (!array)
goto nomem;
- memset(array, 0, array_size);
ctrl = &swap_cgroup_ctrl[type];
mutex_lock(&swap_cgroup_mutex);
ctrl->length = length;
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
new file mode 100644
index 0000000..e920aa3
--- /dev/null
+++ b/mm/process_vm_access.c
@@ -0,0 +1,496 @@
+/*
+ * linux/mm/process_vm_access.c
+ *
+ * Copyright (C) 2010-2011 Christopher Yeoh <cyeoh@au1.ibm.com>, IBM Corp.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/mm.h>
+#include <linux/uio.h>
+#include <linux/sched.h>
+#include <linux/highmem.h>
+#include <linux/ptrace.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+#endif
+
+/**
+ * process_vm_rw_pages - read/write pages from task specified
+ * @task: task to read/write from
+ * @mm: mm for task
+ * @process_pages: struct pages area that can store at least
+ * nr_pages_to_copy struct page pointers
+ * @pa: address of page in task to start copying from/to
+ * @start_offset: offset in page to start copying from/to
+ * @len: number of bytes to copy
+ * @lvec: iovec array specifying where to copy to/from
+ * @lvec_cnt: number of elements in iovec array
+ * @lvec_current: index in iovec array we are up to
+ * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @vm_write: 0 means copy from, 1 means copy to
+ * @nr_pages_to_copy: number of pages to copy
+ * @bytes_copied: returns number of bytes successfully copied
+ * Returns 0 on success, error code otherwise
+ */
+static int process_vm_rw_pages(struct task_struct *task,
+ struct mm_struct *mm,
+ struct page **process_pages,
+ unsigned long pa,
+ unsigned long start_offset,
+ unsigned long len,
+ const struct iovec *lvec,
+ unsigned long lvec_cnt,
+ unsigned long *lvec_current,
+ size_t *lvec_offset,
+ int vm_write,
+ unsigned int nr_pages_to_copy,
+ ssize_t *bytes_copied)
+{
+ int pages_pinned;
+ void *target_kaddr;
+ int pgs_copied = 0;
+ int j;
+ int ret;
+ ssize_t bytes_to_copy;
+ ssize_t rc = 0;
+
+ *bytes_copied = 0;
+
+ /* Get the pages we're interested in */
+ down_read(&mm->mmap_sem);
+ pages_pinned = get_user_pages(task, mm, pa,
+ nr_pages_to_copy,
+ vm_write, 0, process_pages, NULL);
+ up_read(&mm->mmap_sem);
+
+ if (pages_pinned != nr_pages_to_copy) {
+ rc = -EFAULT;
+ goto end;
+ }
+
+ /* Do the copy for each page */
+ for (pgs_copied = 0;
+ (pgs_copied < nr_pages_to_copy) && (*lvec_current < lvec_cnt);
+ pgs_copied++) {
+ /* Make sure we have a non zero length iovec */
+ while (*lvec_current < lvec_cnt
+ && lvec[*lvec_current].iov_len == 0)
+ (*lvec_current)++;
+ if (*lvec_current == lvec_cnt)
+ break;
+
+ /*
+ * Will copy smallest of:
+ * - bytes remaining in page
+ * - bytes remaining in destination iovec
+ */
+ bytes_to_copy = min_t(ssize_t, PAGE_SIZE - start_offset,
+ len - *bytes_copied);
+ bytes_to_copy = min_t(ssize_t, bytes_to_copy,
+ lvec[*lvec_current].iov_len
+ - *lvec_offset);
+
+ target_kaddr = kmap(process_pages[pgs_copied]) + start_offset;
+
+ if (vm_write)
+ ret = copy_from_user(target_kaddr,
+ lvec[*lvec_current].iov_base
+ + *lvec_offset,
+ bytes_to_copy);
+ else
+ ret = copy_to_user(lvec[*lvec_current].iov_base
+ + *lvec_offset,
+ target_kaddr, bytes_to_copy);
+ kunmap(process_pages[pgs_copied]);
+ if (ret) {
+ *bytes_copied += bytes_to_copy - ret;
+ pgs_copied++;
+ rc = -EFAULT;
+ goto end;
+ }
+ *bytes_copied += bytes_to_copy;
+ *lvec_offset += bytes_to_copy;
+ if (*lvec_offset == lvec[*lvec_current].iov_len) {
+ /*
+ * Need to copy remaining part of page into the
+ * next iovec if there are any bytes left in page
+ */
+ (*lvec_current)++;
+ *lvec_offset = 0;
+ start_offset = (start_offset + bytes_to_copy)
+ % PAGE_SIZE;
+ if (start_offset)
+ pgs_copied--;
+ } else {
+ start_offset = 0;
+ }
+ }
+
+end:
+ if (vm_write) {
+ for (j = 0; j < pages_pinned; j++) {
+ if (j < pgs_copied)
+ set_page_dirty_lock(process_pages[j]);
+ put_page(process_pages[j]);
+ }
+ } else {
+ for (j = 0; j < pages_pinned; j++)
+ put_page(process_pages[j]);
+ }
+
+ return rc;
+}
+
+/* Maximum number of pages kmalloc'd to hold struct page's during copy */
+#define PVM_MAX_KMALLOC_PAGES (PAGE_SIZE * 2)
+
+/**
+ * process_vm_rw_single_vec - read/write pages from task specified
+ * @addr: start memory address of target process
+ * @len: size of area to copy to/from
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @lvec_cnt: number of elements in iovec array
+ * @lvec_current: index in iovec array we are up to
+ * @lvec_offset: offset in bytes from current iovec iov_base we are up to
+ * @process_pages: struct pages area that can store at least
+ * nr_pages_to_copy struct page pointers
+ * @mm: mm for task
+ * @task: task to read/write from
+ * @vm_write: 0 means copy from, 1 means copy to
+ * @bytes_copied: returns number of bytes successfully copied
+ * Returns 0 on success or on failure error code
+ */
+static int process_vm_rw_single_vec(unsigned long addr,
+ unsigned long len,
+ const struct iovec *lvec,
+ unsigned long lvec_cnt,
+ unsigned long *lvec_current,
+ size_t *lvec_offset,
+ struct page **process_pages,
+ struct mm_struct *mm,
+ struct task_struct *task,
+ int vm_write,
+ ssize_t *bytes_copied)
+{
+ unsigned long pa = addr & PAGE_MASK;
+ unsigned long start_offset = addr - pa;
+ unsigned long nr_pages;
+ ssize_t bytes_copied_loop;
+ ssize_t rc = 0;
+ unsigned long nr_pages_copied = 0;
+ unsigned long nr_pages_to_copy;
+ unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES
+ / sizeof(struct pages *);
+
+ *bytes_copied = 0;
+
+ /* Work out address and page range required */
+ if (len == 0)
+ return 0;
+ nr_pages = (addr + len - 1) / PAGE_SIZE - addr / PAGE_SIZE + 1;
+
+ while ((nr_pages_copied < nr_pages) && (*lvec_current < lvec_cnt)) {
+ nr_pages_to_copy = min(nr_pages - nr_pages_copied,
+ max_pages_per_loop);
+
+ rc = process_vm_rw_pages(task, mm, process_pages, pa,
+ start_offset, len,
+ lvec, lvec_cnt,
+ lvec_current, lvec_offset,
+ vm_write, nr_pages_to_copy,
+ &bytes_copied_loop);
+ start_offset = 0;
+ *bytes_copied += bytes_copied_loop;
+
+ if (rc < 0) {
+ return rc;
+ } else {
+ len -= bytes_copied_loop;
+ nr_pages_copied += nr_pages_to_copy;
+ pa += nr_pages_to_copy * PAGE_SIZE;
+ }
+ }
+
+ return rc;
+}
+
+/* Maximum number of entries for process pages array
+ which lives on stack */
+#define PVM_MAX_PP_ARRAY_COUNT 16
+
+/**
+ * process_vm_rw_core - core of reading/writing pages from task specified
+ * @pid: PID of process to read/write from/to
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @liovcnt: size of lvec array
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ * Returns the number of bytes read/written or error code. May
+ * return less bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw_core(pid_t pid, const struct iovec *lvec,
+ unsigned long liovcnt,
+ const struct iovec *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct task_struct *task;
+ struct page *pp_stack[PVM_MAX_PP_ARRAY_COUNT];
+ struct page **process_pages = pp_stack;
+ struct mm_struct *mm;
+ unsigned long i;
+ ssize_t rc = 0;
+ ssize_t bytes_copied_loop;
+ ssize_t bytes_copied = 0;
+ unsigned long nr_pages = 0;
+ unsigned long nr_pages_iov;
+ unsigned long iov_l_curr_idx = 0;
+ size_t iov_l_curr_offset = 0;
+ ssize_t iov_len;
+
+ /*
+ * Work out how many pages of struct pages we're going to need
+ * when eventually calling get_user_pages
+ */
+ for (i = 0; i < riovcnt; i++) {
+ iov_len = rvec[i].iov_len;
+ if (iov_len > 0) {
+ nr_pages_iov = ((unsigned long)rvec[i].iov_base
+ + iov_len)
+ / PAGE_SIZE - (unsigned long)rvec[i].iov_base
+ / PAGE_SIZE + 1;
+ nr_pages = max(nr_pages, nr_pages_iov);
+ }
+ }
+
+ if (nr_pages == 0)
+ return 0;
+
+ if (nr_pages > PVM_MAX_PP_ARRAY_COUNT) {
+ /* For reliability don't try to kmalloc more than
+ 2 pages worth */
+ process_pages = kmalloc(min_t(size_t, PVM_MAX_KMALLOC_PAGES,
+ sizeof(struct pages *)*nr_pages),
+ GFP_KERNEL);
+
+ if (!process_pages)
+ return -ENOMEM;
+ }
+
+ /* Get process information */
+ rcu_read_lock();
+ task = find_task_by_vpid(pid);
+ if (task)
+ get_task_struct(task);
+ rcu_read_unlock();
+ if (!task) {
+ rc = -ESRCH;
+ goto free_proc_pages;
+ }
+
+ task_lock(task);
+ if (__ptrace_may_access(task, PTRACE_MODE_ATTACH)) {
+ task_unlock(task);
+ rc = -EPERM;
+ goto put_task_struct;
+ }
+ mm = task->mm;
+
+ if (!mm || (task->flags & PF_KTHREAD)) {
+ task_unlock(task);
+ rc = -EINVAL;
+ goto put_task_struct;
+ }
+
+ atomic_inc(&mm->mm_users);
+ task_unlock(task);
+
+ for (i = 0; i < riovcnt && iov_l_curr_idx < liovcnt; i++) {
+ rc = process_vm_rw_single_vec(
+ (unsigned long)rvec[i].iov_base, rvec[i].iov_len,
+ lvec, liovcnt, &iov_l_curr_idx, &iov_l_curr_offset,
+ process_pages, mm, task, vm_write, &bytes_copied_loop);
+ bytes_copied += bytes_copied_loop;
+ if (rc != 0) {
+ /* If we have managed to copy any data at all then
+ we return the number of bytes copied. Otherwise
+ we return the error code */
+ if (bytes_copied)
+ rc = bytes_copied;
+ goto put_mm;
+ }
+ }
+
+ rc = bytes_copied;
+put_mm:
+ mmput(mm);
+
+put_task_struct:
+ put_task_struct(task);
+
+free_proc_pages:
+ if (process_pages != pp_stack)
+ kfree(process_pages);
+ return rc;
+}
+
+/**
+ * process_vm_rw - check iovecs before calling core routine
+ * @pid: PID of process to read/write from/to
+ * @lvec: iovec array specifying where to copy to/from locally
+ * @liovcnt: size of lvec array
+ * @rvec: iovec array specifying where to copy to/from in the other process
+ * @riovcnt: size of rvec array
+ * @flags: currently unused
+ * @vm_write: 0 if reading from other process, 1 if writing to other process
+ * Returns the number of bytes read/written or error code. May
+ * return less bytes than expected if an error occurs during the copying
+ * process.
+ */
+static ssize_t process_vm_rw(pid_t pid,
+ const struct iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct iovec iovstack_l[UIO_FASTIOV];
+ struct iovec iovstack_r[UIO_FASTIOV];
+ struct iovec *iov_l = iovstack_l;
+ struct iovec *iov_r = iovstack_r;
+ ssize_t rc;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ /* Check iovecs */
+ if (vm_write)
+ rc = rw_copy_check_uvector(WRITE, lvec, liovcnt, UIO_FASTIOV,
+ iovstack_l, &iov_l, 1);
+ else
+ rc = rw_copy_check_uvector(READ, lvec, liovcnt, UIO_FASTIOV,
+ iovstack_l, &iov_l, 1);
+ if (rc <= 0)
+ goto free_iovecs;
+
+ rc = rw_copy_check_uvector(READ, rvec, riovcnt, UIO_FASTIOV,
+ iovstack_r, &iov_r, 0);
+ if (rc <= 0)
+ goto free_iovecs;
+
+ rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
+ vm_write);
+
+free_iovecs:
+ if (iov_r != iovstack_r)
+ kfree(iov_r);
+ if (iov_l != iovstack_l)
+ kfree(iov_l);
+
+ return rc;
+}
+
+SYSCALL_DEFINE6(process_vm_readv, pid_t, pid, const struct iovec __user *, lvec,
+ unsigned long, liovcnt, const struct iovec __user *, rvec,
+ unsigned long, riovcnt, unsigned long, flags)
+{
+ return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 0);
+}
+
+SYSCALL_DEFINE6(process_vm_writev, pid_t, pid,
+ const struct iovec __user *, lvec,
+ unsigned long, liovcnt, const struct iovec __user *, rvec,
+ unsigned long, riovcnt, unsigned long, flags)
+{
+ return process_vm_rw(pid, lvec, liovcnt, rvec, riovcnt, flags, 1);
+}
+
+#ifdef CONFIG_COMPAT
+
+asmlinkage ssize_t
+compat_process_vm_rw(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags, int vm_write)
+{
+ struct iovec iovstack_l[UIO_FASTIOV];
+ struct iovec iovstack_r[UIO_FASTIOV];
+ struct iovec *iov_l = iovstack_l;
+ struct iovec *iov_r = iovstack_r;
+ ssize_t rc = -EFAULT;
+
+ if (flags != 0)
+ return -EINVAL;
+
+ if (!access_ok(VERIFY_READ, lvec, liovcnt * sizeof(*lvec)))
+ goto out;
+
+ if (!access_ok(VERIFY_READ, rvec, riovcnt * sizeof(*rvec)))
+ goto out;
+
+ if (vm_write)
+ rc = compat_rw_copy_check_uvector(WRITE, lvec, liovcnt,
+ UIO_FASTIOV, iovstack_l,
+ &iov_l, 1);
+ else
+ rc = compat_rw_copy_check_uvector(READ, lvec, liovcnt,
+ UIO_FASTIOV, iovstack_l,
+ &iov_l, 1);
+ if (rc <= 0)
+ goto free_iovecs;
+ rc = compat_rw_copy_check_uvector(READ, rvec, riovcnt,
+ UIO_FASTIOV, iovstack_r,
+ &iov_r, 0);
+ if (rc <= 0)
+ goto free_iovecs;
+
+ rc = process_vm_rw_core(pid, iov_l, liovcnt, iov_r, riovcnt, flags,
+ vm_write);
+
+free_iovecs:
+ if (iov_r != iovstack_r)
+ kfree(iov_r);
+ if (iov_l != iovstack_l)
+ kfree(iov_l);
+
+out:
+ return rc;
+}
+
+asmlinkage ssize_t
+compat_sys_process_vm_readv(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags)
+{
+ return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
+ riovcnt, flags, 0);
+}
+
+asmlinkage ssize_t
+compat_sys_process_vm_writev(compat_pid_t pid,
+ const struct compat_iovec __user *lvec,
+ unsigned long liovcnt,
+ const struct compat_iovec __user *rvec,
+ unsigned long riovcnt,
+ unsigned long flags)
+{
+ return compat_process_vm_rw(pid, lvec, liovcnt, rvec,
+ riovcnt, flags, 1);
+}
+
+#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 8005080..6541cf7 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1164,7 +1164,7 @@ void page_remove_rmap(struct page *page)
/*
* Subfunctions of try_to_unmap: try_to_unmap_one called
- * repeatedly from either try_to_unmap_anon or try_to_unmap_file.
+ * repeatedly from try_to_unmap_ksm, try_to_unmap_anon or try_to_unmap_file.
*/
int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
unsigned long address, enum ttu_flags flags)
diff --git a/mm/shmem.c b/mm/shmem.c
index 32f6763..fa4fa6c 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1068,6 +1068,12 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
user_shm_unlock(inode->i_size, user);
info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping);
+ /*
+ * Ensure that a racing putback_lru_page() can see
+ * the pages of this mapping are evictable when we
+ * skip them due to !PageLRU during the scan.
+ */
+ smp_mb__after_clear_bit();
scan_mapping_unevictable_pages(file->f_mapping);
}
retval = 0;
@@ -1458,7 +1464,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev)
inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE);
if (inode) {
error = security_inode_init_security(inode, dir,
- &dentry->d_name, NULL,
+ &dentry->d_name,
NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
@@ -1598,7 +1604,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s
if (!inode)
return -ENOSPC;
- error = security_inode_init_security(inode, dir, &dentry->d_name, NULL,
+ error = security_inode_init_security(inode, dir, &dentry->d_name,
NULL, NULL);
if (error) {
if (error != -EOPNOTSUPP) {
diff --git a/mm/slab.c b/mm/slab.c
index 6d90a09..708efe8 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1851,15 +1851,15 @@ static void dump_line(char *data, int offset, int limit)
unsigned char error = 0;
int bad_count = 0;
- printk(KERN_ERR "%03x:", offset);
+ printk(KERN_ERR "%03x: ", offset);
for (i = 0; i < limit; i++) {
if (data[offset + i] != POISON_FREE) {
error = data[offset + i];
bad_count++;
}
- printk(" %02x", (unsigned char)data[offset + i]);
}
- printk("\n");
+ print_hex_dump(KERN_CONT, "", 0, 16, 1,
+ &data[offset], limit, 1);
if (bad_count == 1) {
error ^= POISON_FREE;
@@ -3039,14 +3039,9 @@ bad:
printk(KERN_ERR "slab: Internal list corruption detected in "
"cache '%s'(%d), slabp %p(%d). Hexdump:\n",
cachep->name, cachep->num, slabp, slabp->inuse);
- for (i = 0;
- i < sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t);
- i++) {
- if (i % 16 == 0)
- printk("\n%03x:", i);
- printk(" %02x", ((unsigned char *)slabp)[i]);
- }
- printk("\n");
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
+ sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
+ 1);
BUG();
}
}
@@ -4584,7 +4579,7 @@ static const struct file_operations proc_slabstats_operations = {
static int __init slab_proc_init(void)
{
- proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations);
+ proc_create("slabinfo",S_IWUSR|S_IRUSR,NULL,&proc_slabinfo_operations);
#ifdef CONFIG_DEBUG_SLAB_LEAK
proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations);
#endif
diff --git a/mm/slub.c b/mm/slub.c
index 7c54fe8..7d2a996 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -467,34 +467,8 @@ static int disable_higher_order_debug;
*/
static void print_section(char *text, u8 *addr, unsigned int length)
{
- int i, offset;
- int newline = 1;
- char ascii[17];
-
- ascii[16] = 0;
-
- for (i = 0; i < length; i++) {
- if (newline) {
- printk(KERN_ERR "%8s 0x%p: ", text, addr + i);
- newline = 0;
- }
- printk(KERN_CONT " %02x", addr[i]);
- offset = i % 16;
- ascii[offset] = isgraph(addr[i]) ? addr[i] : '.';
- if (offset == 15) {
- printk(KERN_CONT " %s\n", ascii);
- newline = 1;
- }
- }
- if (!newline) {
- i %= 16;
- while (i < 16) {
- printk(KERN_CONT " ");
- ascii[i] = ' ';
- i++;
- }
- printk(KERN_CONT " %s\n", ascii);
- }
+ print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
+ length, 1);
}
static struct track *get_track(struct kmem_cache *s, void *object,
@@ -625,12 +599,12 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
p, p - addr, get_freepointer(s, p));
if (p > addr + 16)
- print_section("Bytes b4", p - 16, 16);
-
- print_section("Object", p, min_t(unsigned long, s->objsize, PAGE_SIZE));
+ print_section("Bytes b4 ", p - 16, 16);
+ print_section("Object ", p, min_t(unsigned long, s->objsize,
+ PAGE_SIZE));
if (s->flags & SLAB_RED_ZONE)
- print_section("Redzone", p + s->objsize,
+ print_section("Redzone ", p + s->objsize,
s->inuse - s->objsize);
if (s->offset)
@@ -643,7 +617,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
if (off != s->size)
/* Beginning of the filler is the free pointer */
- print_section("Padding", p + off, s->size - off);
+ print_section("Padding ", p + off, s->size - off);
dump_stack();
}
@@ -681,49 +655,6 @@ static void init_object(struct kmem_cache *s, void *object, u8 val)
memset(p + s->objsize, val, s->inuse - s->objsize);
}
-static u8 *check_bytes8(u8 *start, u8 value, unsigned int bytes)
-{
- while (bytes) {
- if (*start != value)
- return start;
- start++;
- bytes--;
- }
- return NULL;
-}
-
-static u8 *check_bytes(u8 *start, u8 value, unsigned int bytes)
-{
- u64 value64;
- unsigned int words, prefix;
-
- if (bytes <= 16)
- return check_bytes8(start, value, bytes);
-
- value64 = value | value << 8 | value << 16 | value << 24;
- value64 = (value64 & 0xffffffff) | value64 << 32;
- prefix = 8 - ((unsigned long)start) % 8;
-
- if (prefix) {
- u8 *r = check_bytes8(start, value, prefix);
- if (r)
- return r;
- start += prefix;
- bytes -= prefix;
- }
-
- words = bytes / 8;
-
- while (words) {
- if (*(u64 *)start != value64)
- return check_bytes8(start, value, 8);
- start += 8;
- words--;
- }
-
- return check_bytes8(start, value, bytes % 8);
-}
-
static void restore_bytes(struct kmem_cache *s, char *message, u8 data,
void *from, void *to)
{
@@ -738,7 +669,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
u8 *fault;
u8 *end;
- fault = check_bytes(start, value, bytes);
+ fault = memchr_inv(start, value, bytes);
if (!fault)
return 1;
@@ -831,14 +762,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!remainder)
return 1;
- fault = check_bytes(end - remainder, POISON_INUSE, remainder);
+ fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
if (!fault)
return 1;
while (end > fault && end[-1] == POISON_INUSE)
end--;
slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1);
- print_section("Padding", end - remainder, remainder);
+ print_section("Padding ", end - remainder, remainder);
restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end);
return 0;
@@ -987,7 +918,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
page->freelist);
if (!alloc)
- print_section("Object", (void *)object, s->objsize);
+ print_section("Object ", (void *)object, s->objsize);
dump_stack();
}
@@ -1447,7 +1378,7 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
set_freepointer(s, last, NULL);
page->freelist = start;
- page->inuse = 0;
+ page->inuse = page->objects;
page->frozen = 1;
out:
return page;
@@ -1534,7 +1465,7 @@ static inline void add_partial(struct kmem_cache_node *n,
struct page *page, int tail)
{
n->nr_partial++;
- if (tail)
+ if (tail == DEACTIVATE_TO_TAIL)
list_add_tail(&page->lru, &n->partial);
else
list_add(&page->lru, &n->partial);
@@ -1554,10 +1485,13 @@ static inline void remove_partial(struct kmem_cache_node *n,
* Lock slab, remove from the partial list and put the object into the
* per cpu freelist.
*
+ * Returns a list of objects or NULL if it fails.
+ *
* Must hold list_lock.
*/
-static inline int acquire_slab(struct kmem_cache *s,
- struct kmem_cache_node *n, struct page *page)
+static inline void *acquire_slab(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct page *page,
+ int mode)
{
void *freelist;
unsigned long counters;
@@ -1572,7 +1506,8 @@ static inline int acquire_slab(struct kmem_cache *s,
freelist = page->freelist;
counters = page->counters;
new.counters = counters;
- new.inuse = page->objects;
+ if (mode)
+ new.inuse = page->objects;
VM_BUG_ON(new.frozen);
new.frozen = 1;
@@ -1583,32 +1518,19 @@ static inline int acquire_slab(struct kmem_cache *s,
"lock and freeze"));
remove_partial(n, page);
-
- if (freelist) {
- /* Populate the per cpu freelist */
- this_cpu_write(s->cpu_slab->freelist, freelist);
- this_cpu_write(s->cpu_slab->page, page);
- this_cpu_write(s->cpu_slab->node, page_to_nid(page));
- return 1;
- } else {
- /*
- * Slab page came from the wrong list. No object to allocate
- * from. Put it onto the correct list and continue partial
- * scan.
- */
- printk(KERN_ERR "SLUB: %s : Page without available objects on"
- " partial list\n", s->name);
- return 0;
- }
+ return freelist;
}
+static int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain);
+
/*
* Try to allocate a partial slab from a specific node.
*/
-static struct page *get_partial_node(struct kmem_cache *s,
- struct kmem_cache_node *n)
+static void *get_partial_node(struct kmem_cache *s,
+ struct kmem_cache_node *n, struct kmem_cache_cpu *c)
{
- struct page *page;
+ struct page *page, *page2;
+ void *object = NULL;
/*
* Racy check. If we mistakenly see no partial slabs then we
@@ -1620,26 +1542,43 @@ static struct page *get_partial_node(struct kmem_cache *s,
return NULL;
spin_lock(&n->list_lock);
- list_for_each_entry(page, &n->partial, lru)
- if (acquire_slab(s, n, page))
- goto out;
- page = NULL;
-out:
+ list_for_each_entry_safe(page, page2, &n->partial, lru) {
+ void *t = acquire_slab(s, n, page, object == NULL);
+ int available;
+
+ if (!t)
+ break;
+
+ if (!object) {
+ c->page = page;
+ c->node = page_to_nid(page);
+ stat(s, ALLOC_FROM_PARTIAL);
+ object = t;
+ available = page->objects - page->inuse;
+ } else {
+ page->freelist = t;
+ available = put_cpu_partial(s, page, 0);
+ }
+ if (kmem_cache_debug(s) || available > s->cpu_partial / 2)
+ break;
+
+ }
spin_unlock(&n->list_lock);
- return page;
+ return object;
}
/*
* Get a page from somewhere. Search in increasing NUMA distances.
*/
-static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
+static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags,
+ struct kmem_cache_cpu *c)
{
#ifdef CONFIG_NUMA
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
enum zone_type high_zoneidx = gfp_zone(flags);
- struct page *page;
+ void *object;
/*
* The defrag ratio allows a configuration of the tradeoffs between
@@ -1672,10 +1611,10 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
if (n && cpuset_zone_allowed_hardwall(zone, flags) &&
n->nr_partial > s->min_partial) {
- page = get_partial_node(s, n);
- if (page) {
+ object = get_partial_node(s, n, c);
+ if (object) {
put_mems_allowed();
- return page;
+ return object;
}
}
}
@@ -1687,16 +1626,17 @@ static struct page *get_any_partial(struct kmem_cache *s, gfp_t flags)
/*
* Get a partial page, lock it and return it.
*/
-static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
+static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
+ struct kmem_cache_cpu *c)
{
- struct page *page;
+ void *object;
int searchnode = (node == NUMA_NO_NODE) ? numa_node_id() : node;
- page = get_partial_node(s, get_node(s, searchnode));
- if (page || node != NUMA_NO_NODE)
- return page;
+ object = get_partial_node(s, get_node(s, searchnode), c);
+ if (object || node != NUMA_NO_NODE)
+ return object;
- return get_any_partial(s, flags);
+ return get_any_partial(s, flags, c);
}
#ifdef CONFIG_PREEMPT
@@ -1765,9 +1705,6 @@ void init_kmem_cache_cpus(struct kmem_cache *s)
for_each_possible_cpu(cpu)
per_cpu_ptr(s->cpu_slab, cpu)->tid = init_tid(cpu);
}
-/*
- * Remove the cpu slab
- */
/*
* Remove the cpu slab
@@ -1781,13 +1718,13 @@ static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
enum slab_modes l = M_NONE, m = M_NONE;
void *freelist;
void *nextfree;
- int tail = 0;
+ int tail = DEACTIVATE_TO_HEAD;
struct page new;
struct page old;
if (page->freelist) {
stat(s, DEACTIVATE_REMOTE_FREES);
- tail = 1;
+ tail = DEACTIVATE_TO_TAIL;
}
c->tid = next_tid(c->tid);
@@ -1893,7 +1830,7 @@ redo:
if (m == M_PARTIAL) {
add_partial(n, page, tail);
- stat(s, tail ? DEACTIVATE_TO_TAIL : DEACTIVATE_TO_HEAD);
+ stat(s, tail);
} else if (m == M_FULL) {
@@ -1920,6 +1857,123 @@ redo:
}
}
+/* Unfreeze all the cpu partial slabs */
+static void unfreeze_partials(struct kmem_cache *s)
+{
+ struct kmem_cache_node *n = NULL;
+ struct kmem_cache_cpu *c = this_cpu_ptr(s->cpu_slab);
+ struct page *page;
+
+ while ((page = c->partial)) {
+ enum slab_modes { M_PARTIAL, M_FREE };
+ enum slab_modes l, m;
+ struct page new;
+ struct page old;
+
+ c->partial = page->next;
+ l = M_FREE;
+
+ do {
+
+ old.freelist = page->freelist;
+ old.counters = page->counters;
+ VM_BUG_ON(!old.frozen);
+
+ new.counters = old.counters;
+ new.freelist = old.freelist;
+
+ new.frozen = 0;
+
+ if (!new.inuse && (!n || n->nr_partial > s->min_partial))
+ m = M_FREE;
+ else {
+ struct kmem_cache_node *n2 = get_node(s,
+ page_to_nid(page));
+
+ m = M_PARTIAL;
+ if (n != n2) {
+ if (n)
+ spin_unlock(&n->list_lock);
+
+ n = n2;
+ spin_lock(&n->list_lock);
+ }
+ }
+
+ if (l != m) {
+ if (l == M_PARTIAL)
+ remove_partial(n, page);
+ else
+ add_partial(n, page, 1);
+
+ l = m;
+ }
+
+ } while (!cmpxchg_double_slab(s, page,
+ old.freelist, old.counters,
+ new.freelist, new.counters,
+ "unfreezing slab"));
+
+ if (m == M_FREE) {
+ stat(s, DEACTIVATE_EMPTY);
+ discard_slab(s, page);
+ stat(s, FREE_SLAB);
+ }
+ }
+
+ if (n)
+ spin_unlock(&n->list_lock);
+}
+
+/*
+ * Put a page that was just frozen (in __slab_free) into a partial page
+ * slot if available. This is done without interrupts disabled and without
+ * preemption disabled. The cmpxchg is racy and may put the partial page
+ * onto a random cpus partial slot.
+ *
+ * If we did not find a slot then simply move all the partials to the
+ * per node partial list.
+ */
+int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
+{
+ struct page *oldpage;
+ int pages;
+ int pobjects;
+
+ do {
+ pages = 0;
+ pobjects = 0;
+ oldpage = this_cpu_read(s->cpu_slab->partial);
+
+ if (oldpage) {
+ pobjects = oldpage->pobjects;
+ pages = oldpage->pages;
+ if (drain && pobjects > s->cpu_partial) {
+ unsigned long flags;
+ /*
+ * partial array is full. Move the existing
+ * set to the per node partial list.
+ */
+ local_irq_save(flags);
+ unfreeze_partials(s);
+ local_irq_restore(flags);
+ pobjects = 0;
+ pages = 0;
+ }
+ }
+
+ pages++;
+ pobjects += page->objects - page->inuse;
+
+ page->pages = pages;
+ page->pobjects = pobjects;
+ page->next = oldpage;
+
+ } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+ stat(s, CPU_PARTIAL_FREE);
+ return pobjects;
+}
+
static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
{
stat(s, CPUSLAB_FLUSH);
@@ -1935,8 +1989,12 @@ static inline void __flush_cpu_slab(struct kmem_cache *s, int cpu)
{
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
- if (likely(c && c->page))
- flush_slab(s, c);
+ if (likely(c)) {
+ if (c->page)
+ flush_slab(s, c);
+
+ unfreeze_partials(s);
+ }
}
static void flush_cpu_slab(void *d)
@@ -2027,12 +2085,39 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid)
}
}
+static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags,
+ int node, struct kmem_cache_cpu **pc)
+{
+ void *object;
+ struct kmem_cache_cpu *c;
+ struct page *page = new_slab(s, flags, node);
+
+ if (page) {
+ c = __this_cpu_ptr(s->cpu_slab);
+ if (c->page)
+ flush_slab(s, c);
+
+ /*
+ * No other reference to the page yet so we can
+ * muck around with it freely without cmpxchg
+ */
+ object = page->freelist;
+ page->freelist = NULL;
+
+ stat(s, ALLOC_SLAB);
+ c->node = page_to_nid(page);
+ c->page = page;
+ *pc = c;
+ } else
+ object = NULL;
+
+ return object;
+}
+
/*
* Slow path. The lockless freelist is empty or we need to perform
* debugging duties.
*
- * Interrupts are disabled.
- *
* Processing is still very fast if new objects have been freed to the
* regular freelist. In that case we simply take over the regular freelist
* as the lockless freelist and zap the regular freelist.
@@ -2049,7 +2134,6 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
unsigned long addr, struct kmem_cache_cpu *c)
{
void **object;
- struct page *page;
unsigned long flags;
struct page new;
unsigned long counters;
@@ -2064,13 +2148,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
c = this_cpu_ptr(s->cpu_slab);
#endif
- /* We handle __GFP_ZERO in the caller */
- gfpflags &= ~__GFP_ZERO;
-
- page = c->page;
- if (!page)
+ if (!c->page)
goto new_slab;
-
+redo:
if (unlikely(!node_match(c, node))) {
stat(s, ALLOC_NODE_MISMATCH);
deactivate_slab(s, c);
@@ -2080,8 +2160,8 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
stat(s, ALLOC_SLOWPATH);
do {
- object = page->freelist;
- counters = page->counters;
+ object = c->page->freelist;
+ counters = c->page->counters;
new.counters = counters;
VM_BUG_ON(!new.frozen);
@@ -2093,17 +2173,17 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
*
* If there are objects left then we retrieve them
* and use them to refill the per cpu queue.
- */
+ */
- new.inuse = page->objects;
+ new.inuse = c->page->objects;
new.frozen = object != NULL;
- } while (!__cmpxchg_double_slab(s, page,
+ } while (!__cmpxchg_double_slab(s, c->page,
object, counters,
NULL, new.counters,
"__slab_alloc"));
- if (unlikely(!object)) {
+ if (!object) {
c->page = NULL;
stat(s, DEACTIVATE_BYPASS);
goto new_slab;
@@ -2112,58 +2192,47 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node,
stat(s, ALLOC_REFILL);
load_freelist:
- VM_BUG_ON(!page->frozen);
c->freelist = get_freepointer(s, object);
c->tid = next_tid(c->tid);
local_irq_restore(flags);
return object;
new_slab:
- page = get_partial(s, gfpflags, node);
- if (page) {
- stat(s, ALLOC_FROM_PARTIAL);
- object = c->freelist;
- if (kmem_cache_debug(s))
- goto debug;
- goto load_freelist;
+ if (c->partial) {
+ c->page = c->partial;
+ c->partial = c->page->next;
+ c->node = page_to_nid(c->page);
+ stat(s, CPU_PARTIAL_ALLOC);
+ c->freelist = NULL;
+ goto redo;
}
- page = new_slab(s, gfpflags, node);
+ /* Then do expensive stuff like retrieving pages from the partial lists */
+ object = get_partial(s, gfpflags, node, c);
- if (page) {
- c = __this_cpu_ptr(s->cpu_slab);
- if (c->page)
- flush_slab(s, c);
+ if (unlikely(!object)) {
- /*
- * No other reference to the page yet so we can
- * muck around with it freely without cmpxchg
- */
- object = page->freelist;
- page->freelist = NULL;
- page->inuse = page->objects;
+ object = new_slab_objects(s, gfpflags, node, &c);
- stat(s, ALLOC_SLAB);
- c->node = page_to_nid(page);
- c->page = page;
+ if (unlikely(!object)) {
+ if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
+ slab_out_of_memory(s, gfpflags, node);
- if (kmem_cache_debug(s))
- goto debug;
- goto load_freelist;
+ local_irq_restore(flags);
+ return NULL;
+ }
}
- if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit())
- slab_out_of_memory(s, gfpflags, node);
- local_irq_restore(flags);
- return NULL;
-debug:
- if (!object || !alloc_debug_processing(s, page, object, addr))
- goto new_slab;
+ if (likely(!kmem_cache_debug(s)))
+ goto load_freelist;
+
+ /* Only entered in the debug case */
+ if (!alloc_debug_processing(s, c->page, object, addr))
+ goto new_slab; /* Slab failed checks. Next slab needed */
c->freelist = get_freepointer(s, object);
deactivate_slab(s, c);
- c->page = NULL;
c->node = NUMA_NO_NODE;
local_irq_restore(flags);
return object;
@@ -2333,16 +2402,29 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
was_frozen = new.frozen;
new.inuse--;
if ((!new.inuse || !prior) && !was_frozen && !n) {
- n = get_node(s, page_to_nid(page));
- /*
- * Speculatively acquire the list_lock.
- * If the cmpxchg does not succeed then we may
- * drop the list_lock without any processing.
- *
- * Otherwise the list_lock will synchronize with
- * other processors updating the list of slabs.
- */
- spin_lock_irqsave(&n->list_lock, flags);
+
+ if (!kmem_cache_debug(s) && !prior)
+
+ /*
+ * Slab was on no list before and will be partially empty
+ * We can defer the list move and instead freeze it.
+ */
+ new.frozen = 1;
+
+ else { /* Needs to be taken off a list */
+
+ n = get_node(s, page_to_nid(page));
+ /*
+ * Speculatively acquire the list_lock.
+ * If the cmpxchg does not succeed then we may
+ * drop the list_lock without any processing.
+ *
+ * Otherwise the list_lock will synchronize with
+ * other processors updating the list of slabs.
+ */
+ spin_lock_irqsave(&n->list_lock, flags);
+
+ }
}
inuse = new.inuse;
@@ -2352,7 +2434,15 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
"__slab_free"));
if (likely(!n)) {
- /*
+
+ /*
+ * If we just froze the page then put it onto the
+ * per cpu partial list.
+ */
+ if (new.frozen && !was_frozen)
+ put_cpu_partial(s, page, 1);
+
+ /*
* The list lock was not taken therefore no list
* activity can be necessary.
*/
@@ -2377,7 +2467,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
*/
if (unlikely(!prior)) {
remove_full(s, page);
- add_partial(n, page, 1);
+ add_partial(n, page, DEACTIVATE_TO_TAIL);
stat(s, FREE_ADD_PARTIAL);
}
}
@@ -2421,7 +2511,6 @@ static __always_inline void slab_free(struct kmem_cache *s,
slab_free_hook(s, x);
redo:
-
/*
* Determine the currently cpus per cpu slab.
* The cpu may change afterward. However that does not matter since
@@ -2685,7 +2774,7 @@ static void early_kmem_cache_node_alloc(int node)
n = page->freelist;
BUG_ON(!n);
page->freelist = get_freepointer(kmem_cache_node, n);
- page->inuse++;
+ page->inuse = 1;
page->frozen = 0;
kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
@@ -2695,7 +2784,7 @@ static void early_kmem_cache_node_alloc(int node)
init_kmem_cache_node(n, kmem_cache_node);
inc_slabs_node(kmem_cache_node, node, page->objects);
- add_partial(n, page, 0);
+ add_partial(n, page, DEACTIVATE_TO_HEAD);
}
static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -2911,7 +3000,34 @@ static int kmem_cache_open(struct kmem_cache *s,
* The larger the object size is, the more pages we want on the partial
* list to avoid pounding the page allocator excessively.
*/
- set_min_partial(s, ilog2(s->size));
+ set_min_partial(s, ilog2(s->size) / 2);
+
+ /*
+ * cpu_partial determined the maximum number of objects kept in the
+ * per cpu partial lists of a processor.
+ *
+ * Per cpu partial lists mainly contain slabs that just have one
+ * object freed. If they are used for allocation then they can be
+ * filled up again with minimal effort. The slab will never hit the
+ * per node partial lists and therefore no locking will be required.
+ *
+ * This setting also determines
+ *
+ * A) The number of objects from per cpu partial slabs dumped to the
+ * per node list when we reach the limit.
+ * B) The number of objects in cpu partial slabs to extract from the
+ * per node list when we run out of per cpu objects. We only fetch 50%
+ * to keep some capacity around for frees.
+ */
+ if (s->size >= PAGE_SIZE)
+ s->cpu_partial = 2;
+ else if (s->size >= 1024)
+ s->cpu_partial = 6;
+ else if (s->size >= 256)
+ s->cpu_partial = 13;
+ else
+ s->cpu_partial = 30;
+
s->refcount = 1;
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
@@ -2970,13 +3086,13 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
/*
* Attempt to free all partial slabs on a node.
+ * This is called from kmem_cache_close(). We must be the last thread
+ * using the cache and therefore we do not need to lock anymore.
*/
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
- unsigned long flags;
struct page *page, *h;
- spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
remove_partial(n, page);
@@ -2986,7 +3102,6 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
"Objects remaining on kmem_cache_close()");
}
}
- spin_unlock_irqrestore(&n->list_lock, flags);
}
/*
@@ -3020,6 +3135,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
s->refcount--;
if (!s->refcount) {
list_del(&s->list);
+ up_write(&slub_lock);
if (kmem_cache_close(s)) {
printk(KERN_ERR "SLUB %s: %s called for cache that "
"still has objects.\n", s->name, __func__);
@@ -3028,8 +3144,8 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->flags & SLAB_DESTROY_BY_RCU)
rcu_barrier();
sysfs_slab_remove(s);
- }
- up_write(&slub_lock);
+ } else
+ up_write(&slub_lock);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -3347,23 +3463,23 @@ int kmem_cache_shrink(struct kmem_cache *s)
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- if (!page->inuse) {
- remove_partial(n, page);
- discard_slab(s, page);
- } else {
- list_move(&page->lru,
- slabs_by_inuse + page->inuse);
- }
+ list_move(&page->lru, slabs_by_inuse + page->inuse);
+ if (!page->inuse)
+ n->nr_partial--;
}
/*
* Rebuild the partial list with the slabs filled up most
* first and the least used slabs at the end.
*/
- for (i = objects - 1; i >= 0; i--)
+ for (i = objects - 1; i > 0; i--)
list_splice(slabs_by_inuse + i, n->partial.prev);
spin_unlock_irqrestore(&n->list_lock, flags);
+
+ /* Release empty slabs */
+ list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ discard_slab(s, page);
}
kfree(slabs_by_inuse);
@@ -4319,6 +4435,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
for_each_possible_cpu(cpu) {
struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu);
+ struct page *page;
if (!c || c->node < 0)
continue;
@@ -4334,6 +4451,13 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
total += x;
nodes[c->node] += x;
}
+ page = c->partial;
+
+ if (page) {
+ x = page->pobjects;
+ total += x;
+ nodes[c->node] += x;
+ }
per_cpu[c->node]++;
}
}
@@ -4412,11 +4536,12 @@ struct slab_attribute {
};
#define SLAB_ATTR_RO(_name) \
- static struct slab_attribute _name##_attr = __ATTR_RO(_name)
+ static struct slab_attribute _name##_attr = \
+ __ATTR(_name, 0400, _name##_show, NULL)
#define SLAB_ATTR(_name) \
static struct slab_attribute _name##_attr = \
- __ATTR(_name, 0644, _name##_show, _name##_store)
+ __ATTR(_name, 0600, _name##_show, _name##_store)
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
@@ -4485,6 +4610,27 @@ static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
}
SLAB_ATTR(min_partial);
+static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%u\n", s->cpu_partial);
+}
+
+static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ unsigned long objects;
+ int err;
+
+ err = strict_strtoul(buf, 10, &objects);
+ if (err)
+ return err;
+
+ s->cpu_partial = objects;
+ flush_all(s);
+ return length;
+}
+SLAB_ATTR(cpu_partial);
+
static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
if (!s->ctor)
@@ -4523,6 +4669,37 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objects_partial);
+static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
+{
+ int objects = 0;
+ int pages = 0;
+ int cpu;
+ int len;
+
+ for_each_online_cpu(cpu) {
+ struct page *page = per_cpu_ptr(s->cpu_slab, cpu)->partial;
+
+ if (page) {
+ pages += page->pages;
+ objects += page->pobjects;
+ }
+ }
+
+ len = sprintf(buf, "%d(%d)", objects, pages);
+
+#ifdef CONFIG_SMP
+ for_each_online_cpu(cpu) {
+ struct page *page = per_cpu_ptr(s->cpu_slab, cpu) ->partial;
+
+ if (page && len < PAGE_SIZE - 20)
+ len += sprintf(buf + len, " C%d=%d(%d)", cpu,
+ page->pobjects, page->pages);
+ }
+#endif
+ return len + sprintf(buf + len, "\n");
+}
+SLAB_ATTR_RO(slabs_cpu_partial);
+
static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
@@ -4845,6 +5022,8 @@ STAT_ATTR(DEACTIVATE_BYPASS, deactivate_bypass);
STAT_ATTR(ORDER_FALLBACK, order_fallback);
STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail);
STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail);
+STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc);
+STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free);
#endif
static struct attribute *slab_attrs[] = {
@@ -4853,6 +5032,7 @@ static struct attribute *slab_attrs[] = {
&objs_per_slab_attr.attr,
&order_attr.attr,
&min_partial_attr.attr,
+ &cpu_partial_attr.attr,
&objects_attr.attr,
&objects_partial_attr.attr,
&partial_attr.attr,
@@ -4865,6 +5045,7 @@ static struct attribute *slab_attrs[] = {
&destroy_by_rcu_attr.attr,
&shrink_attr.attr,
&reserved_attr.attr,
+ &slabs_cpu_partial_attr.attr,
#ifdef CONFIG_SLUB_DEBUG
&total_objects_attr.attr,
&slabs_attr.attr,
@@ -4906,6 +5087,8 @@ static struct attribute *slab_attrs[] = {
&order_fallback_attr.attr,
&cmpxchg_double_fail_attr.attr,
&cmpxchg_double_cpu_fail_attr.attr,
+ &cpu_partial_alloc_attr.attr,
+ &cpu_partial_free_attr.attr,
#endif
#ifdef CONFIG_FAILSLAB
&failslab_attr.attr,
@@ -5257,7 +5440,7 @@ static const struct file_operations proc_slabinfo_operations = {
static int __init slab_proc_init(void)
{
- proc_create("slabinfo", S_IRUGO, NULL, &proc_slabinfo_operations);
+ proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations);
return 0;
}
module_init(slab_proc_init);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 17bc224..c9d6540 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1617,7 +1617,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX);
err = try_to_unuse(type);
- test_set_oom_score_adj(oom_score_adj);
+ compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj);
if (err) {
/*
diff --git a/mm/thrash.c b/mm/thrash.c
index e53f7d0..57ad495 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -29,7 +29,7 @@
static DEFINE_SPINLOCK(swap_token_lock);
struct mm_struct *swap_token_mm;
-struct mem_cgroup *swap_token_memcg;
+static struct mem_cgroup *swap_token_memcg;
#ifdef CONFIG_CGROUP_MEM_RES_CTLR
static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm)
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5016f19..b669aa6 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1253,18 +1253,22 @@ EXPORT_SYMBOL_GPL(map_vm_area);
DEFINE_RWLOCK(vmlist_lock);
struct vm_struct *vmlist;
-static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
unsigned long flags, void *caller)
{
- struct vm_struct *tmp, **p;
-
vm->flags = flags;
vm->addr = (void *)va->va_start;
vm->size = va->va_end - va->va_start;
vm->caller = caller;
va->private = vm;
va->flags |= VM_VM_AREA;
+}
+static void insert_vmalloc_vmlist(struct vm_struct *vm)
+{
+ struct vm_struct *tmp, **p;
+
+ vm->flags &= ~VM_UNLIST;
write_lock(&vmlist_lock);
for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
if (tmp->addr >= vm->addr)
@@ -1275,6 +1279,13 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
write_unlock(&vmlist_lock);
}
+static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
+ unsigned long flags, void *caller)
+{
+ setup_vmalloc_vm(vm, va, flags, caller);
+ insert_vmalloc_vmlist(vm);
+}
+
static struct vm_struct *__get_vm_area_node(unsigned long size,
unsigned long align, unsigned long flags, unsigned long start,
unsigned long end, int node, gfp_t gfp_mask, void *caller)
@@ -1313,7 +1324,18 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return NULL;
}
- insert_vmalloc_vm(area, va, flags, caller);
+ /*
+ * When this function is called from __vmalloc_node_range,
+ * we do not add vm_struct to vmlist here to avoid
+ * accessing uninitialized members of vm_struct such as
+ * pages and nr_pages fields. They will be set later.
+ * To distinguish it from others, we use a VM_UNLIST flag.
+ */
+ if (flags & VM_UNLIST)
+ setup_vmalloc_vm(area, va, flags, caller);
+ else
+ insert_vmalloc_vm(area, va, flags, caller);
+
return area;
}
@@ -1381,17 +1403,20 @@ struct vm_struct *remove_vm_area(const void *addr)
va = find_vmap_area((unsigned long)addr);
if (va && va->flags & VM_VM_AREA) {
struct vm_struct *vm = va->private;
- struct vm_struct *tmp, **p;
- /*
- * remove from list and disallow access to this vm_struct
- * before unmap. (address range confliction is maintained by
- * vmap.)
- */
- write_lock(&vmlist_lock);
- for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
- ;
- *p = tmp->next;
- write_unlock(&vmlist_lock);
+
+ if (!(vm->flags & VM_UNLIST)) {
+ struct vm_struct *tmp, **p;
+ /*
+ * remove from list and disallow access to
+ * this vm_struct before unmap. (address range
+ * confliction is maintained by vmap.)
+ */
+ write_lock(&vmlist_lock);
+ for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next)
+ ;
+ *p = tmp->next;
+ write_unlock(&vmlist_lock);
+ }
vmap_debug_free_range(va->va_start, va->va_end);
free_unmap_vmap_area(va);
@@ -1568,8 +1593,8 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
return area->addr;
fail:
- warn_alloc_failed(gfp_mask, order, "vmalloc: allocation failure, "
- "allocated %ld of %ld bytes\n",
+ warn_alloc_failed(gfp_mask, order,
+ "vmalloc: allocation failure, allocated %ld of %ld bytes\n",
(area->nr_pages*PAGE_SIZE), area->size);
vfree(area->addr);
return NULL;
@@ -1600,17 +1625,22 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
size = PAGE_ALIGN(size);
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
- return NULL;
-
- area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node,
- gfp_mask, caller);
+ goto fail;
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
+ start, end, node, gfp_mask, caller);
if (!area)
- return NULL;
+ goto fail;
addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
/*
+ * In this function, newly allocated vm_struct is not added
+ * to vmlist at __get_vm_area_node(). so, it is added here.
+ */
+ insert_vmalloc_vmlist(area);
+
+ /*
* A ref_count = 3 is needed because the vm_struct and vmap_area
* structures allocated in the __get_vm_area_node() function contain
* references to the virtual address of the vmalloc'ed block.
@@ -1618,6 +1648,12 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
kmemleak_alloc(addr, real_size, 3, gfp_mask);
return addr;
+
+fail:
+ warn_alloc_failed(gfp_mask, 0,
+ "vmalloc: allocation failure: %lu bytes\n",
+ real_size);
+ return NULL;
}
/**
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b55699c..a90c603 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -495,15 +495,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
return PAGE_ACTIVATE;
}
- /*
- * Wait on writeback if requested to. This happens when
- * direct reclaiming a large contiguous area and the
- * first attempt to free a range of pages fails.
- */
- if (PageWriteback(page) &&
- (sc->reclaim_mode & RECLAIM_MODE_SYNC))
- wait_on_page_writeback(page);
-
if (!PageWriteback(page)) {
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
@@ -642,13 +633,14 @@ redo:
lru = LRU_UNEVICTABLE;
add_page_to_unevictable_list(page);
/*
- * When racing with an mlock clearing (page is
- * unlocked), make sure that if the other thread does
- * not observe our setting of PG_lru and fails
- * isolation, we see PG_mlocked cleared below and move
+ * When racing with an mlock or AS_UNEVICTABLE clearing
+ * (page is unlocked) make sure that if the other thread
+ * does not observe our setting of PG_lru and fails
+ * isolation/check_move_unevictable_page,
+ * we see PG_mlocked/AS_UNEVICTABLE cleared below and move
* the page back to the evictable list.
*
- * The other side is TestClearPageMlocked().
+ * The other side is TestClearPageMlocked() or shmem_lock().
*/
smp_mb();
}
@@ -759,7 +751,10 @@ static noinline_for_stack void free_page_list(struct list_head *free_pages)
*/
static unsigned long shrink_page_list(struct list_head *page_list,
struct zone *zone,
- struct scan_control *sc)
+ struct scan_control *sc,
+ int priority,
+ unsigned long *ret_nr_dirty,
+ unsigned long *ret_nr_writeback)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
@@ -767,6 +762,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
unsigned long nr_dirty = 0;
unsigned long nr_congested = 0;
unsigned long nr_reclaimed = 0;
+ unsigned long nr_writeback = 0;
cond_resched();
@@ -803,13 +799,12 @@ static unsigned long shrink_page_list(struct list_head *page_list,
(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
if (PageWriteback(page)) {
+ nr_writeback++;
/*
- * Synchronous reclaim is performed in two passes,
- * first an asynchronous pass over the list to
- * start parallel writeback, and a second synchronous
- * pass to wait for the IO to complete. Wait here
- * for any page for which writeback has already
- * started.
+ * Synchronous reclaim cannot queue pages for
+ * writeback due to the possibility of stack overflow
+ * but if it encounters a page under writeback, wait
+ * for the IO to complete.
*/
if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) &&
may_enter_fs)
@@ -865,6 +860,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
if (PageDirty(page)) {
nr_dirty++;
+ /*
+ * Only kswapd can writeback filesystem pages to
+ * avoid risk of stack overflow but do not writeback
+ * unless under significant pressure.
+ */
+ if (page_is_file_cache(page) &&
+ (!current_is_kswapd() || priority >= DEF_PRIORITY - 2)) {
+ /*
+ * Immediately reclaim when written back.
+ * Similar in principal to deactivate_page()
+ * except we already have the page isolated
+ * and know it's dirty
+ */
+ inc_zone_page_state(page, NR_VMSCAN_IMMEDIATE);
+ SetPageReclaim(page);
+
+ goto keep_locked;
+ }
+
if (references == PAGEREF_RECLAIM_CLEAN)
goto keep_locked;
if (!may_enter_fs)
@@ -999,6 +1013,8 @@ keep_lumpy:
list_splice(&ret_pages, page_list);
count_vm_events(PGACTIVATE, pgactivate);
+ *ret_nr_dirty += nr_dirty;
+ *ret_nr_writeback += nr_writeback;
return nr_reclaimed;
}
@@ -1012,23 +1028,27 @@ keep_lumpy:
*
* returns 0 on success, -ve errno on failure.
*/
-int __isolate_lru_page(struct page *page, int mode, int file)
+int __isolate_lru_page(struct page *page, isolate_mode_t mode, int file)
{
+ bool all_lru_mode;
int ret = -EINVAL;
/* Only take pages on the LRU. */
if (!PageLRU(page))
return ret;
+ all_lru_mode = (mode & (ISOLATE_ACTIVE|ISOLATE_INACTIVE)) ==
+ (ISOLATE_ACTIVE|ISOLATE_INACTIVE);
+
/*
* When checking the active state, we need to be sure we are
* dealing with comparible boolean values. Take the logical not
* of each.
*/
- if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+ if (!all_lru_mode && !PageActive(page) != !(mode & ISOLATE_ACTIVE))
return ret;
- if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file)
+ if (!all_lru_mode && !!page_is_file_cache(page) != file)
return ret;
/*
@@ -1041,6 +1061,12 @@ int __isolate_lru_page(struct page *page, int mode, int file)
ret = -EBUSY;
+ if ((mode & ISOLATE_CLEAN) && (PageDirty(page) || PageWriteback(page)))
+ return ret;
+
+ if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
+ return ret;
+
if (likely(get_page_unless_zero(page))) {
/*
* Be careful not to clear PageLRU until after we're
@@ -1076,7 +1102,8 @@ int __isolate_lru_page(struct page *page, int mode, int file)
*/
static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
struct list_head *src, struct list_head *dst,
- unsigned long *scanned, int order, int mode, int file)
+ unsigned long *scanned, int order, isolate_mode_t mode,
+ int file)
{
unsigned long nr_taken = 0;
unsigned long nr_lumpy_taken = 0;
@@ -1201,8 +1228,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
static unsigned long isolate_pages_global(unsigned long nr,
struct list_head *dst,
unsigned long *scanned, int order,
- int mode, struct zone *z,
- int active, int file)
+ isolate_mode_t mode,
+ struct zone *z, int active, int file)
{
int lru = LRU_BASE;
if (active)
@@ -1394,7 +1421,7 @@ static noinline_for_stack void update_isolated_counts(struct zone *zone,
}
/*
- * Returns true if the caller should wait to clean dirty/writeback pages.
+ * Returns true if a direct reclaim should wait on pages under writeback.
*
* If we are direct reclaiming for contiguous pages and we do not reclaim
* everything in the list, try again and wait for writeback IO to complete.
@@ -1416,7 +1443,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken,
if (sc->reclaim_mode & RECLAIM_MODE_SINGLE)
return false;
- /* If we have relaimed everything on the isolated list, no stall */
+ /* If we have reclaimed everything on the isolated list, no stall */
if (nr_freed == nr_taken)
return false;
@@ -1448,6 +1475,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
unsigned long nr_taken;
unsigned long nr_anon;
unsigned long nr_file;
+ unsigned long nr_dirty = 0;
+ unsigned long nr_writeback = 0;
+ isolate_mode_t reclaim_mode = ISOLATE_INACTIVE;
while (unlikely(too_many_isolated(zone, file, sc))) {
congestion_wait(BLK_RW_ASYNC, HZ/10);
@@ -1458,15 +1488,21 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
}
set_reclaim_mode(priority, sc, false);
+ if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM)
+ reclaim_mode |= ISOLATE_ACTIVE;
+
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
- nr_taken = isolate_pages_global(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, 0, file);
+ nr_taken = isolate_pages_global(nr_to_scan, &page_list,
+ &nr_scanned, sc->order, reclaim_mode, zone, 0, file);
zone->pages_scanned += nr_scanned;
if (current_is_kswapd())
__count_zone_vm_events(PGSCAN_KSWAPD, zone,
@@ -1475,12 +1511,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
__count_zone_vm_events(PGSCAN_DIRECT, zone,
nr_scanned);
} else {
- nr_taken = mem_cgroup_isolate_pages(nr_to_scan,
- &page_list, &nr_scanned, sc->order,
- sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ?
- ISOLATE_BOTH : ISOLATE_INACTIVE,
- zone, sc->mem_cgroup,
- 0, file);
+ nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list,
+ &nr_scanned, sc->order, reclaim_mode, zone,
+ sc->mem_cgroup, 0, file);
/*
* mem_cgroup_isolate_pages() keeps track of
* scanned pages on its own.
@@ -1496,12 +1529,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
spin_unlock_irq(&zone->lru_lock);
- nr_reclaimed = shrink_page_list(&page_list, zone, sc);
+ nr_reclaimed = shrink_page_list(&page_list, zone, sc, priority,
+ &nr_dirty, &nr_writeback);
/* Check if we should syncronously wait for writeback */
if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) {
set_reclaim_mode(priority, sc, true);
- nr_reclaimed += shrink_page_list(&page_list, zone, sc);
+ nr_reclaimed += shrink_page_list(&page_list, zone, sc,
+ priority, &nr_dirty, &nr_writeback);
}
local_irq_disable();
@@ -1511,6 +1546,32 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone,
putback_lru_pages(zone, sc, nr_anon, nr_file, &page_list);
+ /*
+ * If reclaim is isolating dirty pages under writeback, it implies
+ * that the long-lived page allocation rate is exceeding the page
+ * laundering rate. Either the global limits are not being effective
+ * at throttling processes due to the page distribution throughout
+ * zones or there is heavy usage of a slow backing device. The
+ * only option is to throttle from reclaim context which is not ideal
+ * as there is no guarantee the dirtying process is throttled in the
+ * same way balance_dirty_pages() manages.
+ *
+ * This scales the number of dirty pages that must be under writeback
+ * before throttling depending on priority. It is a simple backoff
+ * function that has the most effect in the range DEF_PRIORITY to
+ * DEF_PRIORITY-2 which is the priority reclaim is considered to be
+ * in trouble and reclaim is considered to be in trouble.
+ *
+ * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
+ * DEF_PRIORITY-1 50% must be PageWriteback
+ * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
+ * ...
+ * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
+ * isolated page is PageWriteback
+ */
+ if (nr_writeback && nr_writeback >= (nr_taken >> (DEF_PRIORITY-priority)))
+ wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10);
+
trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id,
zone_idx(zone),
nr_scanned, nr_reclaimed,
@@ -1582,19 +1643,26 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
struct page *page;
struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
unsigned long nr_rotated = 0;
+ isolate_mode_t reclaim_mode = ISOLATE_ACTIVE;
lru_add_drain();
+
+ if (!sc->may_unmap)
+ reclaim_mode |= ISOLATE_UNMAPPED;
+ if (!sc->may_writepage)
+ reclaim_mode |= ISOLATE_CLEAN;
+
spin_lock_irq(&zone->lru_lock);
if (scanning_global_lru(sc)) {
nr_taken = isolate_pages_global(nr_pages, &l_hold,
&pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
+ reclaim_mode, zone,
1, file);
zone->pages_scanned += pgscanned;
} else {
nr_taken = mem_cgroup_isolate_pages(nr_pages, &l_hold,
&pgscanned, sc->order,
- ISOLATE_ACTIVE, zone,
+ reclaim_mode, zone,
sc->mem_cgroup, 1, file);
/*
* mem_cgroup_isolate_pages() keeps track of
@@ -1795,12 +1863,19 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
enum lru_list l;
int noswap = 0;
bool force_scan = false;
- unsigned long nr_force_scan[2];
- /* kswapd does zone balancing and needs to scan this zone */
+ /*
+ * If the zone or memcg is small, nr[l] can be 0. This
+ * results in no scanning on this priority and a potential
+ * priority drop. Global direct reclaim can go to the next
+ * zone and tends to have no problems. Global kswapd is for
+ * zone balancing and it needs to scan a minimum amount. When
+ * reclaiming for a memcg, a priority drop can cause high
+ * latencies, so it's better to scan a minimum amount there as
+ * well.
+ */
if (scanning_global_lru(sc) && current_is_kswapd())
force_scan = true;
- /* memcg may have small limit and need to avoid priority drop */
if (!scanning_global_lru(sc))
force_scan = true;
@@ -1810,8 +1885,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 0;
fraction[1] = 1;
denominator = 1;
- nr_force_scan[0] = 0;
- nr_force_scan[1] = SWAP_CLUSTER_MAX;
goto out;
}
@@ -1828,8 +1901,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = 1;
fraction[1] = 0;
denominator = 1;
- nr_force_scan[0] = SWAP_CLUSTER_MAX;
- nr_force_scan[1] = 0;
goto out;
}
}
@@ -1878,11 +1949,6 @@ static void get_scan_count(struct zone *zone, struct scan_control *sc,
fraction[0] = ap;
fraction[1] = fp;
denominator = ap + fp + 1;
- if (force_scan) {
- unsigned long scan = SWAP_CLUSTER_MAX;
- nr_force_scan[0] = div64_u64(scan * ap, denominator);
- nr_force_scan[1] = div64_u64(scan * fp, denominator);
- }
out:
for_each_evictable_lru(l) {
int file = is_file_lru(l);
@@ -1891,20 +1957,10 @@ out:
scan = zone_nr_lru_pages(zone, sc, l);
if (priority || noswap) {
scan >>= priority;
+ if (!scan && force_scan)
+ scan = SWAP_CLUSTER_MAX;
scan = div64_u64(scan * fraction[file], denominator);
}
-
- /*
- * If zone is small or memcg is small, nr[l] can be 0.
- * This results no-scan on this priority and priority drop down.
- * For global direct reclaim, it can visit next zone and tend
- * not to have problems. For global kswapd, it's for zone
- * balancing and it need to scan a small amounts. When using
- * memcg, priority drop can cause big latency. So, it's better
- * to scan small amount. See may_noscan above.
- */
- if (!scan && force_scan)
- scan = nr_force_scan[file];
nr[l] = scan;
}
}
@@ -1983,12 +2039,14 @@ static void shrink_zone(int priority, struct zone *zone,
enum lru_list l;
unsigned long nr_reclaimed, nr_scanned;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ struct blk_plug plug;
restart:
nr_reclaimed = 0;
nr_scanned = sc->nr_scanned;
get_scan_count(zone, sc, nr, priority);
+ blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
nr[LRU_INACTIVE_FILE]) {
for_each_evictable_lru(l) {
@@ -2012,6 +2070,7 @@ restart:
if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
break;
}
+ blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
/*
@@ -2044,14 +2103,19 @@ restart:
*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
+ *
+ * This function returns true if a zone is being reclaimed for a costly
+ * high-order allocation and compaction is either ready to begin or deferred.
+ * This indicates to the caller that it should retry the allocation or fail.
*/
-static void shrink_zones(int priority, struct zonelist *zonelist,
+static bool shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
unsigned long nr_soft_reclaimed;
unsigned long nr_soft_scanned;
+ bool should_abort_reclaim = false;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -2066,6 +2130,23 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
+ if (COMPACTION_BUILD) {
+ /*
+ * If we already have plenty of memory free for
+ * compaction in this zone, don't free any more.
+ * Even though compaction is invoked for any
+ * non-zero order, only frequent costly order
+ * reclamation is disruptive enough to become a
+ * noticable problem, like transparent huge page
+ * allocations.
+ */
+ if (sc->order > PAGE_ALLOC_COSTLY_ORDER &&
+ (compaction_suitable(zone, sc->order) ||
+ compaction_deferred(zone))) {
+ should_abort_reclaim = true;
+ continue;
+ }
+ }
/*
* This steals pages from memory cgroups over softlimit
* and returns the number of reclaimed pages and
@@ -2083,6 +2164,8 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
shrink_zone(priority, zone, sc);
}
+
+ return should_abort_reclaim;
}
static bool zone_reclaimable(struct zone *zone)
@@ -2147,7 +2230,9 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token(sc->mem_cgroup);
- shrink_zones(priority, zonelist, sc);
+ if (shrink_zones(priority, zonelist, sc))
+ break;
+
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -2690,6 +2775,8 @@ out:
/* If balanced, clear the congested flag */
zone_clear_flag(zone, ZONE_CONGESTED);
+ if (i <= *classzone_idx)
+ balanced += zone->present_pages;
}
}
@@ -2763,7 +2850,9 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx)
static int kswapd(void *p)
{
unsigned long order, new_order;
+ unsigned balanced_order;
int classzone_idx, new_classzone_idx;
+ int balanced_classzone_idx;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
@@ -2794,7 +2883,9 @@ static int kswapd(void *p)
set_freezable();
order = new_order = 0;
+ balanced_order = 0;
classzone_idx = new_classzone_idx = pgdat->nr_zones - 1;
+ balanced_classzone_idx = classzone_idx;
for ( ; ; ) {
int ret;
@@ -2803,7 +2894,8 @@ static int kswapd(void *p)
* new request of a similar or harder type will succeed soon
* so consider going to sleep on the basis we reclaimed at
*/
- if (classzone_idx >= new_classzone_idx && order == new_order) {
+ if (balanced_classzone_idx >= new_classzone_idx &&
+ balanced_order == new_order) {
new_order = pgdat->kswapd_max_order;
new_classzone_idx = pgdat->classzone_idx;
pgdat->kswapd_max_order = 0;
@@ -2818,9 +2910,12 @@ static int kswapd(void *p)
order = new_order;
classzone_idx = new_classzone_idx;
} else {
- kswapd_try_to_sleep(pgdat, order, classzone_idx);
+ kswapd_try_to_sleep(pgdat, balanced_order,
+ balanced_classzone_idx);
order = pgdat->kswapd_max_order;
classzone_idx = pgdat->classzone_idx;
+ new_order = order;
+ new_classzone_idx = classzone_idx;
pgdat->kswapd_max_order = 0;
pgdat->classzone_idx = pgdat->nr_zones - 1;
}
@@ -2835,7 +2930,9 @@ static int kswapd(void *p)
*/
if (!ret) {
trace_mm_vmscan_kswapd_wake(pgdat->node_id, order);
- order = balance_pgdat(pgdat, order, &classzone_idx);
+ balanced_classzone_idx = classzone_idx;
+ balanced_order = balance_pgdat(pgdat, order,
+ &balanced_classzone_idx);
}
}
return 0;
@@ -3347,66 +3444,12 @@ void scan_mapping_unevictable_pages(struct address_space *mapping)
}
-/**
- * scan_zone_unevictable_pages - check unevictable list for evictable pages
- * @zone - zone of which to scan the unevictable list
- *
- * Scan @zone's unevictable LRU lists to check for pages that have become
- * evictable. Move those that have to @zone's inactive list where they
- * become candidates for reclaim, unless shrink_inactive_zone() decides
- * to reactivate them. Pages that are still unevictable are rotated
- * back onto @zone's unevictable list.
- */
-#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
-static void scan_zone_unevictable_pages(struct zone *zone)
+static void warn_scan_unevictable_pages(void)
{
- struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
- unsigned long scan;
- unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
-
- while (nr_to_scan > 0) {
- unsigned long batch_size = min(nr_to_scan,
- SCAN_UNEVICTABLE_BATCH_SIZE);
-
- spin_lock_irq(&zone->lru_lock);
- for (scan = 0; scan < batch_size; scan++) {
- struct page *page = lru_to_page(l_unevictable);
-
- if (!trylock_page(page))
- continue;
-
- prefetchw_prev_lru_page(page, l_unevictable, flags);
-
- if (likely(PageLRU(page) && PageUnevictable(page)))
- check_move_unevictable_page(page, zone);
-
- unlock_page(page);
- }
- spin_unlock_irq(&zone->lru_lock);
-
- nr_to_scan -= batch_size;
- }
-}
-
-
-/**
- * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
- *
- * A really big hammer: scan all zones' unevictable LRU lists to check for
- * pages that have become evictable. Move those back to the zones'
- * inactive list where they become candidates for reclaim.
- * This occurs when, e.g., we have unswappable pages on the unevictable lists,
- * and we add swap to the system. As such, it runs in the context of a task
- * that has possibly/probably made some previously unevictable pages
- * evictable.
- */
-static void scan_all_zones_unevictable_pages(void)
-{
- struct zone *zone;
-
- for_each_zone(zone) {
- scan_zone_unevictable_pages(zone);
- }
+ printk_once(KERN_WARNING
+ "The scan_unevictable_pages sysctl/node-interface has been "
+ "disabled for lack of a legitimate use case. If you have "
+ "one, please send an email to linux-mm@kvack.org.\n");
}
/*
@@ -3419,11 +3462,8 @@ int scan_unevictable_handler(struct ctl_table *table, int write,
void __user *buffer,
size_t *length, loff_t *ppos)
{
+ warn_scan_unevictable_pages();
proc_doulongvec_minmax(table, write, buffer, length, ppos);
-
- if (write && *(unsigned long *)table->data)
- scan_all_zones_unevictable_pages();
-
scan_unevictable_pages = 0;
return 0;
}
@@ -3438,6 +3478,7 @@ static ssize_t read_scan_unevictable_node(struct sys_device *dev,
struct sysdev_attribute *attr,
char *buf)
{
+ warn_scan_unevictable_pages();
return sprintf(buf, "0\n"); /* always zero; should fit... */
}
@@ -3445,19 +3486,7 @@ static ssize_t write_scan_unevictable_node(struct sys_device *dev,
struct sysdev_attribute *attr,
const char *buf, size_t count)
{
- struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
- struct zone *zone;
- unsigned long res;
- unsigned long req = strict_strtoul(buf, 10, &res);
-
- if (!req)
- return 1; /* zero is no-op */
-
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
- continue;
- scan_zone_unevictable_pages(zone);
- }
+ warn_scan_unevictable_pages();
return 1;
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index d52b13d..8fd603b 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -78,7 +78,7 @@ void vm_events_fold_cpu(int cpu)
*
* vm_stat contains the global counters
*/
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
+atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
EXPORT_SYMBOL(vm_stat);
#ifdef CONFIG_SMP
@@ -702,6 +702,7 @@ const char * const vmstat_text[] = {
"nr_unstable",
"nr_bounce",
"nr_vmscan_write",
+ "nr_vmscan_immediate_reclaim",
"nr_writeback_temp",
"nr_isolated_anon",
"nr_isolated_file",