author     Scott Wood <scottwood@freescale.com>	2014-04-07 23:49:35 (GMT)
committer  Scott Wood <scottwood@freescale.com>	2014-04-07 23:49:35 (GMT)
commit     62b8c978ee6b8d135d9e7953221de58000dba986 (patch)
tree       683b04b2e627f6710c22c151b23c8cc9a165315e /mm
parent     78fd82238d0e5716578c326404184a27ba67fd6e (diff)
download   linux-fsl-qoriq-62b8c978ee6b8d135d9e7953221de58000dba986.tar.xz
Rewind v3.13-rc3+ (78fd82238d0e5716) to v3.12
Diffstat (limited to 'mm')
-rw-r--r--  mm/Kconfig             |  23
-rw-r--r--  mm/bootmem.c           |   8
-rw-r--r--  mm/compaction.c        |   7
-rw-r--r--  mm/filemap.c           |  10
-rw-r--r--  mm/filemap_xip.c       |   2
-rw-r--r--  mm/huge_memory.c       | 334
-rw-r--r--  mm/hugetlb.c           | 161
-rw-r--r--  mm/kmemleak.c          |   4
-rw-r--r--  mm/ksm.c               |   4
-rw-r--r--  mm/memblock.c          | 124
-rw-r--r--  mm/memcontrol.c        | 179
-rw-r--r--  mm/memory-failure.c    |  38
-rw-r--r--  mm/memory.c            | 178
-rw-r--r--  mm/memory_hotplug.c    |  65
-rw-r--r--  mm/mempolicy.c         | 149
-rw-r--r--  mm/migrate.c           |  92
-rw-r--r--  mm/mm_init.c           |  18
-rw-r--r--  mm/mmap.c              |  23
-rw-r--r--  mm/mmzone.c            |  14
-rw-r--r--  mm/mprotect.c          |  69
-rw-r--r--  mm/nobootmem.c         |  25
-rw-r--r--  mm/nommu.c             |   5
-rw-r--r--  mm/oom_kill.c          |   6
-rw-r--r--  mm/page_alloc.c        |  38
-rw-r--r--  mm/percpu.c            |   5
-rw-r--r--  mm/pgtable-generic.c   |  16
-rw-r--r--  mm/readahead.c         |   8
-rw-r--r--  mm/rmap.c              |  15
-rw-r--r--  mm/slab.c              | 573
-rw-r--r--  mm/slab.h              |   6
-rw-r--r--  mm/slab_common.c       |   2
-rw-r--r--  mm/slub.c              |  49
-rw-r--r--  mm/sparse.c            |  53
-rw-r--r--  mm/swap.c              | 146
-rw-r--r--  mm/swapfile.c          |  16
-rw-r--r--  mm/util.c              |  13
-rw-r--r--  mm/vmalloc.c           |  48
-rw-r--r--  mm/vmstat.c            |  22
-rw-r--r--  mm/zswap.c             | 195
39 files changed, 1247 insertions, 1496 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index eb69f35..394838f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -20,7 +20,7 @@ config FLATMEM_MANUAL
Some users of more advanced features like NUMA and
memory hotplug may have different options here.
- DISCONTIGMEM is a more mature, better tested system,
+ DISCONTIGMEM is an more mature, better tested system,
but is incompatible with memory hotplug and may suffer
decreased performance over SPARSEMEM. If unsure between
"Sparse Memory" and "Discontiguous Memory", choose
@@ -153,18 +153,11 @@ config MOVABLE_NODE
help
Allow a node to have only movable memory. Pages used by the kernel,
such as direct mapping pages cannot be migrated. So the corresponding
- memory device cannot be hotplugged. This option allows the following
- two things:
- - When the system is booting, node full of hotpluggable memory can
- be arranged to have only movable memory so that the whole node can
- be hot-removed. (need movable_node boot option specified).
- - After the system is up, the option allows users to online all the
- memory of a node as movable memory so that the whole node can be
- hot-removed.
-
- Users who don't use the memory hotplug feature are fine with this
- option on since they don't specify movable_node boot option or they
- don't online memory as movable.
+ memory device cannot be hotplugged. This option allows users to
+ online all the memory of a node as movable memory so that the whole
+ node can be hotplugged. Users who don't use the memory hotplug
+ feature are fine with this option on since they don't online memory
+ as movable.
Say Y here if you want to hotplug a whole node.
Say N here if you want kernel to use memory on all nodes evenly.
@@ -218,11 +211,9 @@ config SPLIT_PTLOCK_CPUS
int
default "999999" if ARM && !CPU_CACHE_VIPT
default "999999" if PARISC && !PA20
+ default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC
default "4"
-config ARCH_ENABLE_SPLIT_PMD_PTLOCK
- boolean
-
#
# support for memory balloon compaction
config BALLOON_COMPACTION
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 90bd350..6ab7744 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -172,12 +172,11 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size)
static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
{
struct page *page;
- unsigned long *map, start, end, pages, count = 0;
+ unsigned long start, end, pages, count = 0;
if (!bdata->node_bootmem_map)
return 0;
- map = bdata->node_bootmem_map;
start = bdata->node_min_pfn;
end = bdata->node_low_pfn;
@@ -185,9 +184,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata)
bdata - bootmem_node_data, start, end);
while (start < end) {
- unsigned long idx, vec;
+ unsigned long *map, idx, vec;
unsigned shift;
+ map = bdata->node_bootmem_map;
idx = start - bdata->node_min_pfn;
shift = idx & (BITS_PER_LONG - 1);
/*
@@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size,
return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id);
/* update goal according ...MAX_DMA32_PFN */
- end_pfn = pgdat_end_pfn(pgdat);
+ end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages;
if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) &&
(goal >> PAGE_SHIFT) < MAX_DMA32_PFN) {
diff --git a/mm/compaction.c b/mm/compaction.c
index 805165b..b5326b1 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -235,9 +235,10 @@ static bool suitable_migration_target(struct page *page)
}
/*
- * Isolate free pages onto a private freelist. If @strict is true, will abort
- * returning 0 on any invalid PFNs or non-free pages inside of the pageblock
- * (even though it may still end up isolating some pages).
+ * Isolate free pages onto a private freelist. Caller must hold zone->lock.
+ * If @strict is true, will abort returning 0 on any invalid PFNs or non-free
+ * pages inside of the pageblock (even though it may still end up isolating
+ * some pages).
*/
static unsigned long isolate_freepages_block(struct compact_control *cc,
unsigned long blockpfn,
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a9..ae4846f 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1090,6 +1090,7 @@ static void shrink_readahead_size_eio(struct file *filp,
* @filp: the file to read
* @ppos: current file position
* @desc: read_descriptor
+ * @actor: read method
*
* This is a generic file read routine, and uses the
* mapping->a_ops->readpage() function for the actual low-level stuff.
@@ -1098,7 +1099,7 @@ static void shrink_readahead_size_eio(struct file *filp,
* of the logic when it comes to error handling etc.
*/
static void do_generic_file_read(struct file *filp, loff_t *ppos,
- read_descriptor_t *desc)
+ read_descriptor_t *desc, read_actor_t actor)
{
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
@@ -1199,14 +1200,13 @@ page_ok:
* Ok, we have the page, and it's up-to-date, so
* now we can copy it to user space...
*
- * The file_read_actor routine returns how many bytes were
- * actually used..
+ * The actor routine returns how many bytes were actually used..
* NOTE! This may not be the same as how much of a user buffer
* we filled up (we may be padding etc), so we can only update
* "pos" here (the actor routine has to update the user buffer
* pointers and the remaining count).
*/
- ret = file_read_actor(desc, page, offset, nr);
+ ret = actor(desc, page, offset, nr);
offset += ret;
index += offset >> PAGE_CACHE_SHIFT;
offset &= ~PAGE_CACHE_MASK;
@@ -1479,7 +1479,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
if (desc.count == 0)
continue;
desc.error = 0;
- do_generic_file_read(filp, ppos, &desc);
+ do_generic_file_read(filp, ppos, &desc, file_read_actor);
retval += desc.written;
if (desc.error) {
retval = retval ?: desc.error;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
index d8d9fe3..28fe26b 100644
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -26,7 +26,7 @@
* of ZERO_PAGE(), such as /dev/zero
*/
static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
static struct page *__xip_sparse_page;
/* called under xip_sparse_mutex */
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index bccd5a6..cca80d9 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -27,12 +27,11 @@
#include "internal.h"
/*
- * By default transparent hugepage support is disabled in order that avoid
- * to risk increase the memory footprint of applications without a guaranteed
- * benefit. When transparent hugepage support is enabled, is for all mappings,
- * and khugepaged scans all mappings.
- * Defrag is invoked by khugepaged hugepage allocations and by page faults
- * for all hugepage allocations.
+ * By default transparent hugepage support is enabled for all mappings
+ * and khugepaged scans all mappings. Defrag is only invoked by
+ * khugepaged hugepage allocations and by page faults inside
+ * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived
+ * allocations.
*/
unsigned long transparent_hugepage_flags __read_mostly =
#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS
@@ -710,7 +709,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
struct page *page)
{
pgtable_t pgtable;
- spinlock_t *ptl;
VM_BUG_ON(!PageCompound(page));
pgtable = pte_alloc_one(mm, haddr);
@@ -725,9 +723,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
*/
__SetPageUptodate(page);
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_none(*pmd))) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(page);
put_page(page);
pte_free(mm, pgtable);
@@ -739,8 +737,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
- atomic_long_inc(&mm->nr_ptes);
- spin_unlock(ptl);
+ mm->nr_ptes++;
+ spin_unlock(&mm->page_table_lock);
}
return 0;
@@ -760,7 +758,14 @@ static inline struct page *alloc_hugepage_vma(int defrag,
HPAGE_PMD_ORDER, vma, haddr, nd);
}
-/* Caller must hold page table lock. */
+#ifndef CONFIG_NUMA
+static inline struct page *alloc_hugepage(int defrag)
+{
+ return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
+ HPAGE_PMD_ORDER);
+}
+#endif
+
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
struct page *zero_page)
@@ -773,7 +778,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
entry = pmd_mkhuge(entry);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
- atomic_long_inc(&mm->nr_ptes);
+ mm->nr_ptes++;
return true;
}
@@ -792,7 +797,6 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
if (!(flags & FAULT_FLAG_WRITE) &&
transparent_hugepage_use_zero_page()) {
- spinlock_t *ptl;
pgtable_t pgtable;
struct page *zero_page;
bool set;
@@ -805,10 +809,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
}
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd,
zero_page);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
if (!set) {
pte_free(mm, pgtable);
put_huge_zero_page();
@@ -841,7 +845,6 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr,
struct vm_area_struct *vma)
{
- spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
pgtable_t pgtable;
@@ -852,9 +855,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (unlikely(!pgtable))
goto out;
- dst_ptl = pmd_lock(dst_mm, dst_pmd);
- src_ptl = pmd_lockptr(src_mm, src_pmd);
- spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ spin_lock(&dst_mm->page_table_lock);
+ spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING);
ret = -EAGAIN;
pmd = *src_pmd;
@@ -863,7 +865,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
/*
- * When page table lock is held, the huge zero pmd should not be
+ * mm->page_table_lock is enough to be sure that huge zero pmd is not
* under splitting since we don't split the page itself, only pmd to
* a page table.
*/
@@ -884,8 +886,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
}
if (unlikely(pmd_trans_splitting(pmd))) {
/* split huge page running from under us */
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
pte_free(dst_mm, pgtable);
wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */
@@ -901,12 +903,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd = pmd_mkold(pmd_wrprotect(pmd));
pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable);
set_pmd_at(dst_mm, addr, dst_pmd, pmd);
- atomic_long_inc(&dst_mm->nr_ptes);
+ dst_mm->nr_ptes++;
ret = 0;
out_unlock:
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
+ spin_unlock(&src_mm->page_table_lock);
+ spin_unlock(&dst_mm->page_table_lock);
out:
return ret;
}
@@ -917,11 +919,10 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
pmd_t *pmd, pmd_t orig_pmd,
int dirty)
{
- spinlock_t *ptl;
pmd_t entry;
unsigned long haddr;
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto unlock;
@@ -931,14 +932,13 @@ void huge_pmd_set_accessed(struct mm_struct *mm,
update_mmu_cache_pmd(vma, address, pmd);
unlock:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
}
static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
{
- spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
struct page *page;
@@ -965,7 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_page;
@@ -992,7 +992,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
}
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
put_huge_zero_page();
inc_mm_counter(mm, MM_ANONPAGES);
@@ -1002,7 +1002,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
out:
return ret;
out_free_page:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_page(page);
put_page(page);
@@ -1016,7 +1016,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
struct page *page,
unsigned long haddr)
{
- spinlock_t *ptl;
pgtable_t pgtable;
pmd_t _pmd;
int ret = 0, i;
@@ -1063,7 +1062,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_free_pages;
VM_BUG_ON(!PageHead(page));
@@ -1089,7 +1088,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
smp_wmb(); /* make pte visible before pmd */
pmd_populate(mm, pmd, pgtable);
page_remove_rmap(page);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
@@ -1100,7 +1099,7 @@ out:
return ret;
out_free_pages:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
mem_cgroup_uncharge_start();
for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1115,19 +1114,17 @@ out_free_pages:
int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd, pmd_t orig_pmd)
{
- spinlock_t *ptl;
int ret = 0;
struct page *page = NULL, *new_page;
unsigned long haddr;
unsigned long mmun_start; /* For mmu_notifiers */
unsigned long mmun_end; /* For mmu_notifiers */
- ptl = pmd_lockptr(mm, pmd);
VM_BUG_ON(!vma->anon_vma);
haddr = address & HPAGE_PMD_MASK;
if (is_huge_zero_pmd(orig_pmd))
goto alloc;
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, orig_pmd)))
goto out_unlock;
@@ -1143,7 +1140,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out_unlock;
}
get_page(page);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
alloc:
if (transparent_hugepage_enabled(vma) &&
!transparent_hugepage_debug_cow())
@@ -1190,11 +1187,11 @@ alloc:
mmun_end = haddr + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
if (page)
put_page(page);
if (unlikely(!pmd_same(*pmd, orig_pmd))) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mem_cgroup_uncharge_page(new_page);
put_page(new_page);
goto out_mn;
@@ -1216,13 +1213,13 @@ alloc:
}
ret |= VM_FAULT_WRITE;
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
out_mn:
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
out:
return ret;
out_unlock:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
return ret;
}
@@ -1234,7 +1231,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
struct page *page = NULL;
- assert_spin_locked(pmd_lockptr(mm, pmd));
+ assert_spin_locked(&mm->page_table_lock);
if (flags & FOLL_WRITE && !pmd_write(*pmd))
goto out;
@@ -1281,37 +1278,23 @@ out:
int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr, pmd_t pmd, pmd_t *pmdp)
{
- spinlock_t *ptl;
struct anon_vma *anon_vma = NULL;
struct page *page;
unsigned long haddr = addr & HPAGE_PMD_MASK;
int page_nid = -1, this_nid = numa_node_id();
- int target_nid, last_cpupid = -1;
+ int target_nid;
bool page_locked;
bool migrated = false;
- int flags = 0;
- ptl = pmd_lock(mm, pmdp);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
page = pmd_page(pmd);
- BUG_ON(is_huge_zero_page(page));
page_nid = page_to_nid(page);
- last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == this_nid) {
+ if (page_nid == this_nid)
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
- flags |= TNF_FAULT_LOCAL;
- }
-
- /*
- * Avoid grouping on DSO/COW pages in specific and RO pages
- * in general, RO pages shouldn't hurt as much anyway since
- * they can be in shared cache state.
- */
- if (!pmd_write(pmd))
- flags |= TNF_NO_GROUP;
/*
* Acquire the page lock to serialise THP migrations but avoid dropping
@@ -1329,7 +1312,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* relock and check_same as the page may no longer be mapped.
* As the fault is being retried, do not account for it.
*/
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
wait_on_page_locked(page);
page_nid = -1;
goto out;
@@ -1337,13 +1320,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/* Page is misplaced, serialise migrations and parallel THP splits */
get_page(page);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
if (!page_locked)
lock_page(page);
anon_vma = page_lock_anon_vma_read(page);
- /* Confirm the PMD did not change while page_table_lock was released */
- spin_lock(ptl);
+ /* Confirm the PTE did not while locked */
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(pmd, *pmdp))) {
unlock_page(page);
put_page(page);
@@ -1355,13 +1338,11 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Migrate the THP to the requested node, returns with page unlocked
* and pmd_numa cleared.
*/
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
migrated = migrate_misplaced_transhuge_page(mm, vma,
pmdp, pmd, addr, page, target_nid);
- if (migrated) {
- flags |= TNF_MIGRATED;
+ if (migrated)
page_nid = target_nid;
- }
goto out;
clear_pmdnuma:
@@ -1372,14 +1353,14 @@ clear_pmdnuma:
update_mmu_cache_pmd(vma, addr, pmdp);
unlock_page(page);
out_unlock:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
out:
if (anon_vma)
page_unlock_anon_vma_read(anon_vma);
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags);
+ task_numa_fault(page_nid, HPAGE_PMD_NR, migrated);
return 0;
}
@@ -1387,10 +1368,9 @@ out:
int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long addr)
{
- spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
struct page *page;
pgtable_t pgtable;
pmd_t orig_pmd;
@@ -1404,8 +1384,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd);
if (is_huge_zero_pmd(orig_pmd)) {
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
put_huge_zero_page();
} else {
page = pmd_page(orig_pmd);
@@ -1413,8 +1393,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
VM_BUG_ON(page_mapcount(page) < 0);
add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR);
VM_BUG_ON(!PageHead(page));
- atomic_long_dec(&tlb->mm->nr_ptes);
- spin_unlock(ptl);
+ tlb->mm->nr_ptes--;
+ spin_unlock(&tlb->mm->page_table_lock);
tlb_remove_page(tlb, page);
}
pte_free(tlb->mm, pgtable);
@@ -1427,15 +1407,14 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end,
unsigned char *vec)
{
- spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
/*
* All logical pages in the range are present
* if backed by a huge page.
*/
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
memset(vec, 1, (end - addr) >> PAGE_SHIFT);
ret = 1;
}
@@ -1448,7 +1427,6 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long old_end,
pmd_t *old_pmd, pmd_t *new_pmd)
{
- spinlock_t *old_ptl, *new_ptl;
int ret = 0;
pmd_t pmd;
@@ -1469,69 +1447,41 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
goto out;
}
- /*
- * We don't have to worry about the ordering of src and dst
- * ptlocks because exclusive mmap_sem prevents deadlock.
- */
- ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl);
+ ret = __pmd_trans_huge_lock(old_pmd, vma);
if (ret == 1) {
- new_ptl = pmd_lockptr(mm, new_pmd);
- if (new_ptl != old_ptl)
- spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
pmd = pmdp_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd));
- if (new_ptl != old_ptl)
- spin_unlock(new_ptl);
- spin_unlock(old_ptl);
+ spin_unlock(&mm->page_table_lock);
}
out:
return ret;
}
-/*
- * Returns
- * - 0 if PMD could not be locked
- * - 1 if PMD was locked but protections unchange and TLB flush unnecessary
- * - HPAGE_PMD_NR is protections changed and TLB flush necessary
- */
int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, pgprot_t newprot, int prot_numa)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
int ret = 0;
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (__pmd_trans_huge_lock(pmd, vma) == 1) {
pmd_t entry;
- ret = 1;
+ entry = pmdp_get_and_clear(mm, addr, pmd);
if (!prot_numa) {
- entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_modify(entry, newprot);
- ret = HPAGE_PMD_NR;
BUG_ON(pmd_write(entry));
} else {
struct page *page = pmd_page(*pmd);
- /*
- * Do not trap faults against the zero page. The
- * read-only data is likely to be read-cached on the
- * local CPU cache and it is less useful to know about
- * local vs remote hits on the zero page.
- */
- if (!is_huge_zero_page(page) &&
+ /* only check non-shared pages */
+ if (page_mapcount(page) == 1 &&
!pmd_numa(*pmd)) {
- entry = pmdp_get_and_clear(mm, addr, pmd);
entry = pmd_mknuma(entry);
- ret = HPAGE_PMD_NR;
}
}
-
- /* Set PMD if cleared earlier */
- if (ret == HPAGE_PMD_NR)
- set_pmd_at(mm, addr, pmd, entry);
-
- spin_unlock(ptl);
+ set_pmd_at(mm, addr, pmd, entry);
+ spin_unlock(&vma->vm_mm->page_table_lock);
+ ret = 1;
}
return ret;
@@ -1544,13 +1494,12 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* Note that if it returns 1, this routine returns without unlocking page
* table locks. So callers must unlock them.
*/
-int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
- spinlock_t **ptl)
+int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma)
{
- *ptl = pmd_lock(vma->vm_mm, pmd);
+ spin_lock(&vma->vm_mm->page_table_lock);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(*ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, pmd);
return -1;
} else {
@@ -1559,37 +1508,27 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma,
return 1;
}
}
- spin_unlock(*ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
-/*
- * This function returns whether a given @page is mapped onto the @address
- * in the virtual space of @mm.
- *
- * When it's true, this function returns *pmd with holding the page table lock
- * and passing it back to the caller via @ptl.
- * If it's false, returns NULL without holding the page table lock.
- */
pmd_t *page_check_address_pmd(struct page *page,
struct mm_struct *mm,
unsigned long address,
- enum page_check_address_pmd_flag flag,
- spinlock_t **ptl)
+ enum page_check_address_pmd_flag flag)
{
- pmd_t *pmd;
+ pmd_t *pmd, *ret = NULL;
if (address & ~HPAGE_PMD_MASK)
- return NULL;
+ goto out;
pmd = mm_find_pmd(mm, address);
if (!pmd)
- return NULL;
- *ptl = pmd_lock(mm, pmd);
+ goto out;
if (pmd_none(*pmd))
- goto unlock;
+ goto out;
if (pmd_page(*pmd) != page)
- goto unlock;
+ goto out;
/*
* split_vma() may create temporary aliased mappings. There is
* no risk as long as all huge pmd are found and have their
@@ -1599,15 +1538,14 @@ pmd_t *page_check_address_pmd(struct page *page,
*/
if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG &&
pmd_trans_splitting(*pmd))
- goto unlock;
+ goto out;
if (pmd_trans_huge(*pmd)) {
VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG &&
!pmd_trans_splitting(*pmd));
- return pmd;
+ ret = pmd;
}
-unlock:
- spin_unlock(*ptl);
- return NULL;
+out:
+ return ret;
}
static int __split_huge_page_splitting(struct page *page,
@@ -1615,7 +1553,6 @@ static int __split_huge_page_splitting(struct page *page,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
pmd_t *pmd;
int ret = 0;
/* For mmu_notifiers */
@@ -1623,8 +1560,9 @@ static int __split_huge_page_splitting(struct page *page,
const unsigned long mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
+ spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl);
+ PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG);
if (pmd) {
/*
* We can't temporarily set the pmd to null in order
@@ -1635,8 +1573,8 @@ static int __split_huge_page_splitting(struct page *page,
*/
pmdp_splitting_flush(vma, address, pmd);
ret = 1;
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return ret;
@@ -1724,7 +1662,7 @@ static void __split_huge_page_refcount(struct page *page,
page_tail->mapping = page->mapping;
page_tail->index = page->index + i;
- page_cpupid_xchg_last(page_tail, page_cpupid_last(page));
+ page_nid_xchg_last(page_tail, page_nid_last(page));
BUG_ON(!PageAnon(page_tail));
BUG_ON(!PageUptodate(page_tail));
@@ -1767,14 +1705,14 @@ static int __split_huge_page_map(struct page *page,
unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
pmd_t *pmd, _pmd;
int ret = 0, i;
pgtable_t pgtable;
unsigned long haddr;
+ spin_lock(&mm->page_table_lock);
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl);
+ PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG);
if (pmd) {
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
@@ -1829,8 +1767,8 @@ static int __split_huge_page_map(struct page *page,
pmdp_invalidate(vma, address, pmd);
pmd_populate(mm, pmd, pgtable);
ret = 1;
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
return ret;
}
@@ -2227,34 +2165,7 @@ static void khugepaged_alloc_sleep(void)
msecs_to_jiffies(khugepaged_alloc_sleep_millisecs));
}
-static int khugepaged_node_load[MAX_NUMNODES];
-
#ifdef CONFIG_NUMA
-static int khugepaged_find_target_node(void)
-{
- static int last_khugepaged_target_node = NUMA_NO_NODE;
- int nid, target_node = 0, max_value = 0;
-
- /* find first node with max normal pages hit */
- for (nid = 0; nid < MAX_NUMNODES; nid++)
- if (khugepaged_node_load[nid] > max_value) {
- max_value = khugepaged_node_load[nid];
- target_node = nid;
- }
-
- /* do some balance if several nodes have the same hit record */
- if (target_node <= last_khugepaged_target_node)
- for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES;
- nid++)
- if (max_value == khugepaged_node_load[nid]) {
- target_node = nid;
- break;
- }
-
- last_khugepaged_target_node = target_node;
- return target_node;
-}
-
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
if (IS_ERR(*hpage)) {
@@ -2288,8 +2199,9 @@ static struct page
* mmap_sem in read mode is good idea also to allow greater
* scalability.
*/
- *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask(
- khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER);
+ *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address,
+ node, __GFP_OTHER_NODE);
+
/*
* After allocating the hugepage, release the mmap_sem read lock in
* preparation for taking it in write mode.
@@ -2305,17 +2217,6 @@ static struct page
return *hpage;
}
#else
-static int khugepaged_find_target_node(void)
-{
- return 0;
-}
-
-static inline struct page *alloc_hugepage(int defrag)
-{
- return alloc_pages(alloc_hugepage_gfpmask(defrag, 0),
- HPAGE_PMD_ORDER);
-}
-
static struct page *khugepaged_alloc_hugepage(bool *wait)
{
struct page *hpage;
@@ -2382,7 +2283,7 @@ static void collapse_huge_page(struct mm_struct *mm,
pte_t *pte;
pgtable_t pgtable;
struct page *new_page;
- spinlock_t *pmd_ptl, *pte_ptl;
+ spinlock_t *ptl;
int isolated;
unsigned long hstart, hend;
unsigned long mmun_start; /* For mmu_notifiers */
@@ -2425,12 +2326,12 @@ static void collapse_huge_page(struct mm_struct *mm,
anon_vma_lock_write(vma->anon_vma);
pte = pte_offset_map(pmd, address);
- pte_ptl = pte_lockptr(mm, pmd);
+ ptl = pte_lockptr(mm, pmd);
mmun_start = address;
mmun_end = address + HPAGE_PMD_SIZE;
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */
+ spin_lock(&mm->page_table_lock); /* probably unnecessary */
/*
* After this gup_fast can't run anymore. This also removes
* any huge TLB entry from the CPU so we won't allow
@@ -2438,16 +2339,16 @@ static void collapse_huge_page(struct mm_struct *mm,
* to avoid the risk of CPU bugs in that area.
*/
_pmd = pmdp_clear_flush(vma, address, pmd);
- spin_unlock(pmd_ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- spin_lock(pte_ptl);
+ spin_lock(ptl);
isolated = __collapse_huge_page_isolate(vma, address, pte);
- spin_unlock(pte_ptl);
+ spin_unlock(ptl);
if (unlikely(!isolated)) {
pte_unmap(pte);
- spin_lock(pmd_ptl);
+ spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
/*
* We can only use set_pmd_at when establishing
@@ -2455,7 +2356,7 @@ static void collapse_huge_page(struct mm_struct *mm,
* points to regular pagetables. Use pmd_populate for that
*/
pmd_populate(mm, pmd, pmd_pgtable(_pmd));
- spin_unlock(pmd_ptl);
+ spin_unlock(&mm->page_table_lock);
anon_vma_unlock_write(vma->anon_vma);
goto out;
}
@@ -2466,7 +2367,7 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
+ __collapse_huge_page_copy(pte, new_page, vma, address, ptl);
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
@@ -2481,13 +2382,13 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
smp_wmb();
- spin_lock(pmd_ptl);
+ spin_lock(&mm->page_table_lock);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
- spin_unlock(pmd_ptl);
+ spin_unlock(&mm->page_table_lock);
*hpage = NULL;
@@ -2522,7 +2423,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (pmd_trans_huge(*pmd))
goto out;
- memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load));
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, _address += PAGE_SIZE) {
@@ -2539,13 +2439,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
if (unlikely(!page))
goto out_unmap;
/*
- * Record which node the original page is from and save this
- * information to khugepaged_node_load[].
- * Khupaged will allocate hugepage from the node has the max
- * hit record.
+ * Chose the node of the first page. This could
+ * be more sophisticated and look at more pages,
+ * but isn't for now.
*/
- node = page_to_nid(page);
- khugepaged_node_load[node]++;
+ if (node == NUMA_NO_NODE)
+ node = page_to_nid(page);
VM_BUG_ON(PageCompound(page));
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
@@ -2560,11 +2459,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
- if (ret) {
- node = khugepaged_find_target_node();
+ if (ret)
/* collapse_huge_page will return with the mmap_sem released */
collapse_huge_page(mm, address, hpage, vma, node);
- }
out:
return ret;
}
@@ -2816,7 +2713,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma,
void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmd)
{
- spinlock_t *ptl;
struct page *page;
struct mm_struct *mm = vma->vm_mm;
unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -2829,22 +2725,22 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address,
mmun_end = haddr + HPAGE_PMD_SIZE;
again:
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_trans_huge(*pmd))) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
if (is_huge_zero_pmd(*pmd)) {
__split_huge_zero_page_pmd(vma, haddr, pmd);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
return;
}
page = pmd_page(*pmd);
VM_BUG_ON(!page_count(page));
get_page(page);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
split_huge_page(page);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dee6cf4..0b7656e 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -476,6 +476,40 @@ static int vma_has_reserves(struct vm_area_struct *vma, long chg)
return 0;
}
+static void copy_gigantic_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+ struct page *dst_base = dst;
+ struct page *src_base = src;
+
+ for (i = 0; i < pages_per_huge_page(h); ) {
+ cond_resched();
+ copy_highpage(dst, src);
+
+ i++;
+ dst = mem_map_next(dst, dst_base, i);
+ src = mem_map_next(src, src_base, i);
+ }
+}
+
+void copy_huge_page(struct page *dst, struct page *src)
+{
+ int i;
+ struct hstate *h = page_hstate(src);
+
+ if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) {
+ copy_gigantic_page(dst, src);
+ return;
+ }
+
+ might_sleep();
+ for (i = 0; i < pages_per_huge_page(h); i++) {
+ cond_resched();
+ copy_highpage(dst + i, src + i);
+ }
+}
+
static void enqueue_huge_page(struct hstate *h, struct page *page)
{
int nid = page_to_nid(page);
@@ -702,23 +736,6 @@ int PageHuge(struct page *page)
}
EXPORT_SYMBOL_GPL(PageHuge);
-/*
- * PageHeadHuge() only returns true for hugetlbfs head page, but not for
- * normal or transparent huge pages.
- */
-int PageHeadHuge(struct page *page_head)
-{
- compound_page_dtor *dtor;
-
- if (!PageHead(page_head))
- return 0;
-
- dtor = get_compound_page_dtor(page_head);
-
- return dtor == free_huge_page;
-}
-EXPORT_SYMBOL_GPL(PageHeadHuge);
-
pgoff_t __basepage_index(struct page *page)
{
struct page *page_head = compound_head(page);
@@ -2359,7 +2376,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) {
- spinlock_t *src_ptl, *dst_ptl;
src_pte = huge_pte_offset(src, addr);
if (!src_pte)
continue;
@@ -2371,9 +2387,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
if (dst_pte == src_pte)
continue;
- dst_ptl = huge_pte_lock(h, dst, dst_pte);
- src_ptl = huge_pte_lockptr(h, src, src_pte);
- spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+ spin_lock(&dst->page_table_lock);
+ spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING);
if (!huge_pte_none(huge_ptep_get(src_pte))) {
if (cow)
huge_ptep_set_wrprotect(src, addr, src_pte);
@@ -2383,8 +2398,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
- spin_unlock(src_ptl);
- spin_unlock(dst_ptl);
+ spin_unlock(&src->page_table_lock);
+ spin_unlock(&dst->page_table_lock);
}
return 0;
@@ -2427,7 +2442,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
unsigned long address;
pte_t *ptep;
pte_t pte;
- spinlock_t *ptl;
struct page *page;
struct hstate *h = hstate_vma(vma);
unsigned long sz = huge_page_size(h);
@@ -2441,25 +2455,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
tlb_start_vma(tlb, vma);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
again:
+ spin_lock(&mm->page_table_lock);
for (address = start; address < end; address += sz) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, &address, ptep))
- goto unlock;
+ continue;
pte = huge_ptep_get(ptep);
if (huge_pte_none(pte))
- goto unlock;
+ continue;
/*
* HWPoisoned hugepage is already unmapped and dropped reference
*/
if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
huge_pte_clear(mm, address, ptep);
- goto unlock;
+ continue;
}
page = pte_page(pte);
@@ -2470,7 +2484,7 @@ again:
*/
if (ref_page) {
if (page != ref_page)
- goto unlock;
+ continue;
/*
* Mark the VMA as having unmapped its page so that
@@ -2487,18 +2501,13 @@ again:
page_remove_rmap(page);
force_flush = !__tlb_remove_page(tlb, page);
- if (force_flush) {
- spin_unlock(ptl);
+ if (force_flush)
break;
- }
/* Bail out after unmapping reference page if supplied */
- if (ref_page) {
- spin_unlock(ptl);
+ if (ref_page)
break;
- }
-unlock:
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
/*
* mmu_gather ran out of room to batch pages, we break out of
* the PTE lock to avoid doing the potential expensive TLB invalidate
@@ -2604,7 +2613,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
*/
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
- struct page *pagecache_page, spinlock_t *ptl)
+ struct page *pagecache_page)
{
struct hstate *h = hstate_vma(vma);
struct page *old_page, *new_page;
@@ -2638,8 +2647,8 @@ retry_avoidcopy:
page_cache_get(old_page);
- /* Drop page table lock as buddy allocator may be called */
- spin_unlock(ptl);
+ /* Drop page_table_lock as buddy allocator may be called */
+ spin_unlock(&mm->page_table_lock);
new_page = alloc_huge_page(vma, address, outside_reserve);
if (IS_ERR(new_page)) {
@@ -2657,13 +2666,13 @@ retry_avoidcopy:
BUG_ON(huge_pte_none(pte));
if (unmap_ref_private(mm, vma, old_page, address)) {
BUG_ON(huge_pte_none(pte));
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte)))
goto retry_avoidcopy;
/*
- * race occurs while re-acquiring page table
- * lock, and our job is done.
+ * race occurs while re-acquiring page_table_lock, and
+ * our job is done.
*/
return 0;
}
@@ -2671,7 +2680,7 @@ retry_avoidcopy:
}
/* Caller expects lock to be held */
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
if (err == -ENOMEM)
return VM_FAULT_OOM;
else
@@ -2686,7 +2695,7 @@ retry_avoidcopy:
page_cache_release(new_page);
page_cache_release(old_page);
/* Caller expects lock to be held */
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
return VM_FAULT_OOM;
}
@@ -2698,10 +2707,10 @@ retry_avoidcopy:
mmun_end = mmun_start + huge_page_size(h);
mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
/*
- * Retake the page table lock to check for racing updates
+ * Retake the page_table_lock to check for racing updates
* before the page tables are altered
*/
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & huge_page_mask(h));
if (likely(pte_same(huge_ptep_get(ptep), pte))) {
ClearPagePrivate(new_page);
@@ -2715,13 +2724,13 @@ retry_avoidcopy:
/* Make the old page be freed below */
new_page = old_page;
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
page_cache_release(new_page);
page_cache_release(old_page);
/* Caller expects lock to be held */
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
return 0;
}
@@ -2769,7 +2778,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page;
struct address_space *mapping;
pte_t new_pte;
- spinlock_t *ptl;
/*
* Currently, we are forced to kill the process in the event the
@@ -2856,8 +2864,7 @@ retry:
goto backout_unlocked;
}
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
size = i_size_read(mapping->host) >> huge_page_shift(h);
if (idx >= size)
goto backout;
@@ -2878,16 +2885,16 @@ retry:
if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) {
/* Optimization, do the COW without a second fault */
- ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl);
+ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page);
}
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
unlock_page(page);
out:
return ret;
backout:
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
backout_unlocked:
unlock_page(page);
put_page(page);
@@ -2899,7 +2906,6 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
{
pte_t *ptep;
pte_t entry;
- spinlock_t *ptl;
int ret;
struct page *page = NULL;
struct page *pagecache_page = NULL;
@@ -2912,7 +2918,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (ptep) {
entry = huge_ptep_get(ptep);
if (unlikely(is_hugetlb_entry_migration(entry))) {
- migration_entry_wait_huge(vma, mm, ptep);
+ migration_entry_wait_huge(mm, ptep);
return 0;
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
@@ -2968,18 +2974,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (page != pagecache_page)
lock_page(page);
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
- goto out_ptl;
+ goto out_page_table_lock;
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep, entry,
- pagecache_page, ptl);
- goto out_ptl;
+ pagecache_page);
+ goto out_page_table_lock;
}
entry = huge_pte_mkdirty(entry);
}
@@ -2988,8 +2993,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, address, ptep);
-out_ptl:
- spin_unlock(ptl);
+out_page_table_lock:
+ spin_unlock(&mm->page_table_lock);
if (pagecache_page) {
unlock_page(pagecache_page);
@@ -3015,9 +3020,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long remainder = *nr_pages;
struct hstate *h = hstate_vma(vma);
+ spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
- spinlock_t *ptl = NULL;
int absent;
struct page *page;
@@ -3025,12 +3030,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* Some archs (sparc64, sh*) have multiple pte_ts to
* each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
- *
- * Note that page table lock is not held when pte is null.
*/
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
- if (pte)
- ptl = huge_pte_lock(h, mm, pte);
absent = !pte || huge_pte_none(huge_ptep_get(pte));
/*
@@ -3042,8 +3043,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (absent && (flags & FOLL_DUMP) &&
!hugetlbfs_pagecache_present(h, vma, vaddr)) {
- if (pte)
- spin_unlock(ptl);
remainder = 0;
break;
}
@@ -3063,10 +3062,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
!huge_pte_write(huge_ptep_get(pte)))) {
int ret;
- if (pte)
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
ret = hugetlb_fault(mm, vma, vaddr,
(flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
+ spin_lock(&mm->page_table_lock);
if (!(ret & VM_FAULT_ERROR))
continue;
@@ -3097,8 +3096,8 @@ same_page:
*/
goto same_page;
}
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
*nr_pages = remainder;
*position = vaddr;
@@ -3119,15 +3118,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
flush_cache_range(vma, address, end);
mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex);
+ spin_lock(&mm->page_table_lock);
for (; address < end; address += huge_page_size(h)) {
- spinlock_t *ptl;
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
- ptl = huge_pte_lock(h, mm, ptep);
if (huge_pmd_unshare(mm, &address, ptep)) {
pages++;
- spin_unlock(ptl);
continue;
}
if (!huge_pte_none(huge_ptep_get(ptep))) {
@@ -3137,8 +3134,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
set_huge_pte_at(mm, address, ptep, pte);
pages++;
}
- spin_unlock(ptl);
}
+ spin_unlock(&mm->page_table_lock);
/*
* Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare
* may have cleared our pud entry and done put_page on the page table:
@@ -3301,7 +3298,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
unsigned long saddr;
pte_t *spte = NULL;
pte_t *pte;
- spinlock_t *ptl;
if (!vma_shareable(vma, addr))
return (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3324,14 +3320,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (!spte)
goto out;
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
- spin_lock(ptl);
+ spin_lock(&mm->page_table_lock);
if (pud_none(*pud))
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
else
put_page(virt_to_page(spte));
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
mutex_unlock(&mapping->i_mmap_mutex);
@@ -3345,7 +3340,7 @@ out:
* indicated by page_count > 1, unmap is achieved by clearing pud and
* decrementing the ref count. If count == 1, the pte page is not shared.
*
- * called with page table lock held.
+ * called with vma->vm_mm->page_table_lock held.
*
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 31f01c5..e126b0e 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -753,9 +753,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp)
}
spin_lock_irqsave(&object->lock, flags);
- if (size == SIZE_MAX) {
- size = object->pointer + object->size - ptr;
- } else if (ptr + size > object->pointer + object->size) {
+ if (ptr + size > object->pointer + object->size) {
kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr);
dump_object_info(object);
kmem_cache_free(scan_area_cache, area);
diff --git a/mm/ksm.c b/mm/ksm.c
index 175fff7..0bea2b2 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
* Allocate stable and unstable together:
* MAXSMP NODES_SHIFT 10 will use 16kB.
*/
- buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf),
- GFP_KERNEL);
+ buf = kcalloc(nr_node_ids + nr_node_ids,
+ sizeof(*buf), GFP_KERNEL | __GFP_ZERO);
/* Let us assume that RB_ROOT is NULL is zero */
if (!buf)
err = -ENOMEM;
diff --git a/mm/memblock.c b/mm/memblock.c
index 53e477b..0ac412a 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -20,8 +20,6 @@
#include <linux/seq_file.h>
#include <linux/memblock.h>
-#include <asm-generic/sections.h>
-
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
@@ -34,7 +32,6 @@ struct memblock memblock __initdata_memblock = {
.reserved.cnt = 1, /* empty dummy entry */
.reserved.max = INIT_MEMBLOCK_REGIONS,
- .bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
@@ -85,73 +82,6 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type,
return (i < type->cnt) ? i : -1;
}
-/*
- * __memblock_find_range_bottom_up - find free area utility in bottom-up
- * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
- * @size: size of free area to find
- * @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
- *
- * Utility called from memblock_find_in_range_node(), find free area bottom-up.
- *
- * RETURNS:
- * Found address on success, 0 on failure.
- */
-static phys_addr_t __init_memblock
-__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
-{
- phys_addr_t this_start, this_end, cand;
- u64 i;
-
- for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) {
- this_start = clamp(this_start, start, end);
- this_end = clamp(this_end, start, end);
-
- cand = round_up(this_start, align);
- if (cand < this_end && this_end - cand >= size)
- return cand;
- }
-
- return 0;
-}
-
-/**
- * __memblock_find_range_top_down - find free area utility, in top-down
- * @start: start of candidate range
- * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE}
- * @size: size of free area to find
- * @align: alignment of free area to find
- * @nid: nid of the free area to find, %MAX_NUMNODES for any node
- *
- * Utility called from memblock_find_in_range_node(), find free area top-down.
- *
- * RETURNS:
- * Found address on success, 0 on failure.
- */
-static phys_addr_t __init_memblock
-__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
- phys_addr_t size, phys_addr_t align, int nid)
-{
- phys_addr_t this_start, this_end, cand;
- u64 i;
-
- for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
- this_start = clamp(this_start, start, end);
- this_end = clamp(this_end, start, end);
-
- if (this_end < size)
- continue;
-
- cand = round_down(this_end - size, align);
- if (cand >= this_start)
- return cand;
- }
-
- return 0;
-}
-
/**
* memblock_find_in_range_node - find free area in given range and node
* @start: start of candidate range
@@ -162,23 +92,15 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end,
*
* Find @size free area aligned to @align in the specified range and node.
*
- * When allocation direction is bottom-up, the @start should be greater
- * than the end of the kernel image. Otherwise, it will be trimmed. The
- * reason is that we want the bottom-up allocation just near the kernel
- * image so it is highly likely that the allocated memory and the kernel
- * will reside in the same node.
- *
- * If bottom-up allocation failed, will try to allocate memory top-down.
- *
* RETURNS:
- * Found address on success, 0 on failure.
+ * Found address on success, %0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
phys_addr_t align, int nid)
{
- int ret;
- phys_addr_t kernel_end;
+ phys_addr_t this_start, this_end, cand;
+ u64 i;
/* pump up @end */
if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
@@ -187,39 +109,19 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
/* avoid allocating the first page */
start = max_t(phys_addr_t, start, PAGE_SIZE);
end = max(start, end);
- kernel_end = __pa_symbol(_end);
- /*
- * try bottom-up allocation only when bottom-up mode
- * is set and @end is above the kernel image.
- */
- if (memblock_bottom_up() && end > kernel_end) {
- phys_addr_t bottom_up_start;
-
- /* make sure we will allocate above the kernel */
- bottom_up_start = max(start, kernel_end);
+ for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) {
+ this_start = clamp(this_start, start, end);
+ this_end = clamp(this_end, start, end);
- /* ok, try bottom-up allocation first */
- ret = __memblock_find_range_bottom_up(bottom_up_start, end,
- size, align, nid);
- if (ret)
- return ret;
+ if (this_end < size)
+ continue;
- /*
- * we always limit bottom-up allocation above the kernel,
- * but top-down allocation doesn't have the limit, so
- * retrying top-down allocation may succeed when bottom-up
- * allocation failed.
- *
- * bottom-up allocation is expected to be fail very rarely,
- * so we use WARN_ONCE() here to see the stack trace if
- * fail happens.
- */
- WARN_ONCE(1, "memblock: bottom-up allocation failed, "
- "memory hotunplug may be affected\n");
+ cand = round_down(this_end - size, align);
+ if (cand >= this_start)
+ return cand;
}
-
- return __memblock_find_range_top_down(start, end, size, align, nid);
+ return 0;
}
/**
@@ -232,7 +134,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start,
* Find @size free area aligned to @align in the specified range.
*
* RETURNS:
- * Found address on success, 0 on failure.
+ * Found address on success, %0 on failure.
*/
phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start,
phys_addr_t end, phys_addr_t size,
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f1a0ae6..13b9d0f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -59,7 +59,6 @@
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
-#include "slab.h"
#include <asm/uaccess.h>
@@ -313,7 +312,7 @@ struct mem_cgroup {
atomic_t dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
- struct cg_proto tcp_mem;
+ struct tcp_memcontrol tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
/* analogous to slab_common's slab_caches list. per-memcg */
@@ -500,29 +499,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
return (memcg == root_mem_cgroup);
}
-/*
- * We restrict the id in the range of [1, 65535], so it can fit into
- * an unsigned short.
- */
-#define MEM_CGROUP_ID_MAX USHRT_MAX
-
-static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg)
-{
- /*
- * The ID of the root cgroup is 0, but memcg treat 0 as an
- * invalid ID, so we return (cgroup_id + 1).
- */
- return memcg->css.cgroup->id + 1;
-}
-
-static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
-{
- struct cgroup_subsys_state *css;
-
- css = css_from_id(id - 1, &mem_cgroup_subsys);
- return mem_cgroup_from_css(css);
-}
-
/* Writing them here to avoid exposing memcg's inner layout */
#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM)
@@ -575,13 +551,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
if (!memcg || mem_cgroup_is_root(memcg))
return NULL;
- return &memcg->tcp_mem;
+ return &memcg->tcp_mem.cg_proto;
}
EXPORT_SYMBOL(tcp_proto_cgroup);
static void disarm_sock_keys(struct mem_cgroup *memcg)
{
- if (!memcg_proto_activated(&memcg->tcp_mem))
+ if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
return;
static_key_slow_dec(&memcg_socket_limit_enabled);
}
@@ -594,11 +570,16 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
/*
* This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
- * The main reason for not using cgroup id for this:
- * this works better in sparse environments, where we have a lot of memcgs,
- * but only a few kmem-limited. Or also, if we have, for instance, 200
- * memcgs, and none but the 200th is kmem-limited, we'd have to have a
- * 200 entry array for that.
+ * There are two main reasons for not using the css_id for this:
+ * 1) this works better in sparse environments, where we have a lot of memcgs,
+ * but only a few kmem-limited. Or also, if we have, for instance, 200
+ * memcgs, and none but the 200th is kmem-limited, we'd have to have a
+ * 200 entry array for that.
+ *
+ * 2) In order not to violate the cgroup API, we would like to do all memory
+ * allocation in ->create(). At that point, we haven't yet allocated the
+ * css_id. Having a separate index prevents us from messing with the cgroup
+ * core for this.
*
* The current size of the caches array is stored in
* memcg_limited_groups_array_size. It will double each time we have to
@@ -613,14 +594,14 @@ int memcg_limited_groups_array_size;
* cgroups is a reasonable guess. In the future, it could be a parameter or
* tunable, but that is strictly not necessary.
*
- * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
+ * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get
* this constant directly from cgroup, but it is understandable that this is
* better kept as an internal representation in cgroup.c. In any case, the
- * cgrp_id space is not getting any smaller, and we don't have to necessarily
+ * css_id space is not getting any smaller, and we don't have to necessarily
* increase ours as well if it increases.
*/
#define MEMCG_CACHES_MIN_SIZE 4
-#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
+#define MEMCG_CACHES_MAX_SIZE 65535
/*
* A lot of the calls to the cache allocation functions are expected to be
@@ -1427,7 +1408,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
return true;
if (!root_memcg->use_hierarchy || !memcg)
return false;
- return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup);
+ return css_is_ancestor(&memcg->css, &root_memcg->css);
}
static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
@@ -2845,10 +2826,15 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
*/
static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
{
+ struct cgroup_subsys_state *css;
+
/* ID 0 is unused ID */
if (!id)
return NULL;
- return mem_cgroup_from_id(id);
+ css = css_lookup(&mem_cgroup_subsys, id);
+ if (!css)
+ return NULL;
+ return mem_cgroup_from_css(css);
}
struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
@@ -2969,7 +2955,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
VM_BUG_ON(p->is_root_cache);
cachep = p->root_cache;
- return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
+ return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
}
#ifdef CONFIG_SLABINFO
@@ -2998,14 +2984,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size)
struct res_counter *fail_res;
struct mem_cgroup *_memcg;
int ret = 0;
+ bool may_oom;
ret = res_counter_charge(&memcg->kmem, size, &fail_res);
if (ret)
return ret;
+ /*
+ * Conditions under which we can wait for the oom_killer. Those are
+ * the same conditions tested by the core page allocator
+ */
+ may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
+
_memcg = memcg;
ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT,
- &_memcg, oom_gfp_allowed(gfp));
+ &_memcg, may_oom);
if (ret == -EINTR) {
/*
@@ -3145,7 +3138,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
{
struct memcg_cache_params *cur_params = s->memcg_params;
- VM_BUG_ON(!is_root_cache(s));
+ VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
if (num_groups > memcg_limited_groups_array_size) {
int i;
@@ -3406,7 +3399,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
idx = memcg_cache_id(memcg);
mutex_lock(&memcg_cache_mutex);
- new_cachep = cache_from_memcg_idx(cachep, idx);
+ new_cachep = cachep->memcg_params->memcg_caches[idx];
if (new_cachep) {
css_put(&memcg->css);
goto out;
@@ -3452,8 +3445,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
* we'll take the set_limit_mutex to protect ourselves against this.
*/
mutex_lock(&set_limit_mutex);
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
+ for (i = 0; i < memcg_limited_groups_array_size; i++) {
+ c = s->memcg_params->memcg_caches[i];
if (!c)
continue;
@@ -3586,8 +3579,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
* code updating memcg_caches will issue a write barrier to match this.
*/
read_barrier_depends();
- if (likely(cache_from_memcg_idx(cachep, idx))) {
- cachep = cache_from_memcg_idx(cachep, idx);
+ if (likely(cachep->memcg_params->memcg_caches[idx])) {
+ cachep = cachep->memcg_params->memcg_caches[idx];
goto out;
}
@@ -4357,7 +4350,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
* css_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
- swap_cgroup_record(ent, mem_cgroup_id(memcg));
+ swap_cgroup_record(ent, css_id(&memcg->css));
}
#endif
@@ -4409,8 +4402,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
{
unsigned short old_id, new_id;
- old_id = mem_cgroup_id(from);
- new_id = mem_cgroup_id(to);
+ old_id = css_id(&from->css);
+ new_id = css_id(&to->css);
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
@@ -5383,50 +5376,45 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
static int memcg_numa_stat_show(struct cgroup_subsys_state *css,
struct cftype *cft, struct seq_file *m)
{
- struct numa_stat {
- const char *name;
- unsigned int lru_mask;
- };
-
- static const struct numa_stat stats[] = {
- { "total", LRU_ALL },
- { "file", LRU_ALL_FILE },
- { "anon", LRU_ALL_ANON },
- { "unevictable", BIT(LRU_UNEVICTABLE) },
- };
- const struct numa_stat *stat;
int nid;
- unsigned long nr;
+ unsigned long total_nr, file_nr, anon_nr, unevictable_nr;
+ unsigned long node_nr;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
- seq_printf(m, "%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
- seq_putc(m, '\n');
- }
-
- for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- struct mem_cgroup *iter;
-
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_node_nr_lru_pages(
- iter, nid, stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
- seq_putc(m, '\n');
+ total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL);
+ seq_printf(m, "total=%lu", total_nr);
+ for_each_node_state(nid, N_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
}
+ seq_putc(m, '\n');
+ file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE);
+ seq_printf(m, "file=%lu", file_nr);
+ for_each_node_state(nid, N_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ LRU_ALL_FILE);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON);
+ seq_printf(m, "anon=%lu", anon_nr);
+ for_each_node_state(nid, N_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ LRU_ALL_ANON);
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
+
+ unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE));
+ seq_printf(m, "unevictable=%lu", unevictable_nr);
+ for_each_node_state(nid, N_MEMORY) {
+ node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
+ BIT(LRU_UNEVICTABLE));
+ seq_printf(m, " N%d=%lu", nid, node_nr);
+ }
+ seq_putc(m, '\n');
return 0;
}
#endif /* CONFIG_NUMA */
@@ -6178,6 +6166,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
size_t size = memcg_size();
mem_cgroup_remove_from_trees(memcg);
+ free_css_id(&mem_cgroup_subsys, &memcg->css);
for_each_node(node)
free_mem_cgroup_per_zone_info(memcg, node);
@@ -6280,9 +6269,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
int error = 0;
- if (css->cgroup->id > MEM_CGROUP_ID_MAX)
- return -ENOSPC;
-
if (!parent)
return 0;
@@ -6554,7 +6540,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
}
/* There is a swap entry and a page doesn't exist or isn't charged */
if (ent.val && !ret &&
- mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
+ css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) {
ret = MC_TARGET_SWAP;
if (target)
target->ent = ent;
@@ -6605,10 +6591,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
pte_t *pte;
spinlock_t *ptl;
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
mc.precharge += HPAGE_PMD_NR;
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -6797,9 +6783,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
* to be unlocked in __split_huge_page_splitting(), where the main
* part of thp split is not executed yet.
*/
- if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ if (pmd_trans_huge_lock(pmd, vma) == 1) {
if (mc.precharge < HPAGE_PMD_NR) {
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
@@ -6816,7 +6802,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
}
put_page(page);
}
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
return 0;
}
@@ -6974,6 +6960,7 @@ struct cgroup_subsys mem_cgroup_subsys = {
.bind = mem_cgroup_bind,
.base_cftypes = mem_cgroup_files,
.early_init = 0,
+ .use_id = 1,
};
#ifdef CONFIG_MEMCG_SWAP
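
The numa_stat hunk above replaces a table-driven loop with four hand-unrolled blocks. As a rough sketch of the table-driven pattern being dropped, the userspace program below iterates a { name, lru_mask } array; the mask values and fake_count() are placeholders, not the kernel's LRU masks.

    #include <stdio.h>

    enum { MAX_NODES = 4 };

    /* Placeholder for mem_cgroup_node_nr_lru_pages(): invent some counts. */
    static unsigned long fake_count(unsigned int mask, int nid)
    { return (mask + 1UL) * (nid + 1); }

    int main(void)
    {
            static const struct { const char *name; unsigned int lru_mask; } stats[] = {
                    { "total",       0xf  },
                    { "file",        0x3  },
                    { "anon",        0xc  },
                    { "unevictable", 0x10 },
            };

            for (size_t i = 0; i < sizeof(stats) / sizeof(stats[0]); i++) {
                    unsigned long nr = 0;

                    for (int nid = 0; nid < MAX_NODES; nid++)
                            nr += fake_count(stats[i].lru_mask, nid);
                    printf("%s=%lu", stats[i].name, nr);
                    for (int nid = 0; nid < MAX_NODES; nid++)
                            printf(" N%d=%lu", nid, fake_count(stats[i].lru_mask, nid));
                    putchar('\n');
            }
            return 0;
    }
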
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index b7c1716..bf3351b 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -1269,7 +1269,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags)
mf_cpu = &get_cpu_var(memory_failure_cpu);
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
- if (kfifo_put(&mf_cpu->fifo, entry))
+ if (kfifo_put(&mf_cpu->fifo, &entry))
schedule_work_on(smp_processor_id(), &mf_cpu->work);
else
pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n",
@@ -1423,6 +1423,19 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
return 1;
/*
+ * The lock_memory_hotplug prevents a race with memory hotplug.
+	 * This is a big hammer; a better approach would be nicer.
+ */
+ lock_memory_hotplug();
+
+ /*
+ * Isolate the page, so that it doesn't get reallocated if it
+ * was free. This flag should be kept set until the source page
+ * is freed and PG_hwpoison on it is set.
+ */
+ if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE)
+ set_migratetype_isolate(p, true);
+ /*
* When the target page is a free hugepage, just remove it
* from free hugepage list.
*/
@@ -1442,6 +1455,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags)
/* Not a free page */
ret = 1;
}
+ unlock_memory_hotplug();
return ret;
}
@@ -1640,28 +1654,15 @@ int soft_offline_page(struct page *page, int flags)
}
}
- /*
- * The lock_memory_hotplug prevents a race with memory hotplug.
- * This is a big hammer, a better would be nicer.
- */
- lock_memory_hotplug();
-
- /*
- * Isolate the page, so that it doesn't get reallocated if it
- * was free. This flag should be kept set until the source page
- * is freed and PG_hwpoison on it is set.
- */
- if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE)
- set_migratetype_isolate(page, true);
-
ret = get_any_page(page, pfn, flags);
- unlock_memory_hotplug();
- if (ret > 0) { /* for in-use pages */
+ if (ret < 0)
+ goto unset;
+ if (ret) { /* for in-use pages */
if (PageHuge(page))
ret = soft_offline_huge_page(page, flags);
else
ret = __soft_offline_page(page, flags);
- } else if (ret == 0) { /* for free pages */
+ } else { /* for free pages */
if (PageHuge(page)) {
set_page_hwpoison_huge_page(hpage);
dequeue_hwpoisoned_huge_page(hpage);
@@ -1672,6 +1673,7 @@ int soft_offline_page(struct page *page, int flags)
atomic_long_inc(&num_poisoned_pages);
}
}
+unset:
unset_migratetype_isolate(page, MIGRATE_MOVABLE);
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index 5d9025f..d176154 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -69,8 +69,8 @@
#include "internal.h"
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
-#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid.
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
+#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid.
#endif
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
pgtable_t token = pmd_pgtable(*pmd);
pmd_clear(pmd);
pte_free_tlb(tlb, token, addr);
- atomic_long_dec(&tlb->mm->nr_ptes);
+ tlb->mm->nr_ptes--;
}
static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
@@ -453,6 +453,8 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
/*
* This function frees user-level page tables of a process.
+ *
+ * Must be called with pagetable lock held.
*/
void free_pgd_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
@@ -550,7 +552,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma,
int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
pmd_t *pmd, unsigned long address)
{
- spinlock_t *ptl;
pgtable_t new = pte_alloc_one(mm, address);
int wait_split_huge_page;
if (!new)
@@ -571,15 +572,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
wait_split_huge_page = 0;
if (likely(pmd_none(*pmd))) { /* Has another populated it ? */
- atomic_long_inc(&mm->nr_ptes);
+ mm->nr_ptes++;
pmd_populate(mm, pmd, new);
new = NULL;
} else if (unlikely(pmd_trans_splitting(*pmd)))
wait_split_huge_page = 1;
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
if (new)
pte_free(mm, new);
if (wait_split_huge_page)
@@ -680,7 +681,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
if (vma->vm_ops)
printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n",
vma->vm_ops->fault);
- if (vma->vm_file)
+ if (vma->vm_file && vma->vm_file->f_op)
printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n",
vma->vm_file->f_op->mmap);
dump_stack();
@@ -1517,20 +1518,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
split_huge_page_pmd(vma, address, pmd);
goto split_fallthrough;
}
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (likely(pmd_trans_huge(*pmd))) {
if (unlikely(pmd_trans_splitting(*pmd))) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
wait_split_huge_page(vma->anon_vma, pmd);
} else {
page = follow_trans_huge_pmd(vma, address,
pmd, flags);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
*page_mask = HPAGE_PMD_NR - 1;
goto out;
}
} else
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
/* fall through */
}
split_fallthrough:
@@ -2720,14 +2721,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
get_page(dirty_page);
reuse:
- /*
- * Clear the pages cpupid information as the existing
- * information potentially belongs to a now completely
- * unrelated process.
- */
- if (old_page)
- page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1);
-
flush_cache_page(vma, address, pte_pfn(orig_pte));
entry = pte_mkyoung(orig_pte);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -3528,16 +3521,13 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int page_nid,
- int *flags)
+ unsigned long addr, int page_nid)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == numa_node_id()) {
+ if (page_nid == numa_node_id())
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
- *flags |= TNF_FAULT_LOCAL;
- }
return mpol_misplaced(page, vma, addr);
}
@@ -3548,10 +3538,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *page = NULL;
spinlock_t *ptl;
int page_nid = -1;
- int last_cpupid;
int target_nid;
bool migrated = false;
- int flags = 0;
/*
* The "pte" at this point cannot be used safely without
@@ -3578,26 +3566,9 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(ptep, ptl);
return 0;
}
- BUG_ON(is_zero_pfn(page_to_pfn(page)));
-
- /*
- * Avoid grouping on DSO/COW pages in specific and RO pages
- * in general, RO pages shouldn't hurt as much anyway since
- * they can be in shared cache state.
- */
- if (!pte_write(pte))
- flags |= TNF_NO_GROUP;
-
- /*
- * Flag if the page is shared between multiple address spaces. This
- * is later used when determining whether to group tasks together
- */
- if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED))
- flags |= TNF_SHARED;
- last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
put_page(page);
@@ -3605,18 +3576,103 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
/* Migrate to the requested node */
- migrated = migrate_misplaced_page(page, vma, target_nid);
- if (migrated) {
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
page_nid = target_nid;
- flags |= TNF_MIGRATED;
- }
out:
if (page_nid != -1)
- task_numa_fault(last_cpupid, page_nid, 1, flags);
+ task_numa_fault(page_nid, 1, migrated);
return 0;
}
+/* NUMA hinting page fault entry point for regular pmds */
+#ifdef CONFIG_NUMA_BALANCING
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ pmd_t pmd;
+ pte_t *pte, *orig_pte;
+ unsigned long _addr = addr & PMD_MASK;
+ unsigned long offset;
+ spinlock_t *ptl;
+ bool numa = false;
+
+ spin_lock(&mm->page_table_lock);
+ pmd = *pmdp;
+ if (pmd_numa(pmd)) {
+ set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd));
+ numa = true;
+ }
+ spin_unlock(&mm->page_table_lock);
+
+ if (!numa)
+ return 0;
+
+ /* we're in a page fault so some vma must be in the range */
+ BUG_ON(!vma);
+ BUG_ON(vma->vm_start >= _addr + PMD_SIZE);
+ offset = max(_addr, vma->vm_start) & ~PMD_MASK;
+ VM_BUG_ON(offset >= PMD_SIZE);
+ orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl);
+ pte += offset >> PAGE_SHIFT;
+ for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) {
+ pte_t pteval = *pte;
+ struct page *page;
+ int page_nid = -1;
+ int target_nid;
+ bool migrated = false;
+
+ if (!pte_present(pteval))
+ continue;
+ if (!pte_numa(pteval))
+ continue;
+ if (addr >= vma->vm_end) {
+ vma = find_vma(mm, addr);
+ /* there's a pte present so there must be a vma */
+ BUG_ON(!vma);
+ BUG_ON(addr < vma->vm_start);
+ }
+ if (pte_numa(pteval)) {
+ pteval = pte_mknonnuma(pteval);
+ set_pte_at(mm, addr, pte, pteval);
+ }
+ page = vm_normal_page(vma, addr, pteval);
+ if (unlikely(!page))
+ continue;
+ /* only check non-shared pages */
+ if (unlikely(page_mapcount(page) != 1))
+ continue;
+
+ page_nid = page_to_nid(page);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+ pte_unmap_unlock(pte, ptl);
+ if (target_nid != -1) {
+ migrated = migrate_misplaced_page(page, target_nid);
+ if (migrated)
+ page_nid = target_nid;
+ } else {
+ put_page(page);
+ }
+
+ if (page_nid != -1)
+ task_numa_fault(page_nid, 1, migrated);
+
+ pte = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ }
+ pte_unmap_unlock(orig_pte, ptl);
+
+ return 0;
+}
+#else
+static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr, pmd_t *pmdp)
+{
+ BUG();
+ return 0;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
@@ -3755,8 +3811,8 @@ retry:
}
}
- /* THP should already have been handled */
- BUG_ON(pmd_numa(*pmd));
+ if (pmd_numa(*pmd))
+ return do_pmd_numa_page(mm, vma, address, pmd);
/*
* Use __pte_alloc instead of pte_alloc_map, because we can't
@@ -4270,21 +4326,3 @@ void copy_user_huge_page(struct page *dst, struct page *src,
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */
-
-#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS
-bool ptlock_alloc(struct page *page)
-{
- spinlock_t *ptl;
-
- ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL);
- if (!ptl)
- return false;
- page->ptl = ptl;
- return true;
-}
-
-void ptlock_free(struct page *page)
-{
- kfree(page->ptl);
-}
-#endif
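
The do_pmd_numa_page walk added above starts at the PMD-aligned base address, skips forward to the VMA start, and then visits one PTE per page up to the end of the PMD. A minimal userspace sketch of that index arithmetic follows, assuming 4 KB pages and 2 MB PMDs; the addresses are made up.

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_SIZE  (1UL << PAGE_SHIFT)
    #define PMD_SHIFT  21                       /* 2 MB PMDs assumed */
    #define PMD_SIZE   (1UL << PMD_SHIFT)
    #define PMD_MASK   (~(PMD_SIZE - 1))

    int main(void)
    {
            uint64_t fault_addr = 0x7f0012345000ULL; /* hypothetical faulting address */
            uint64_t vma_start  = 0x7f0012280000ULL; /* hypothetical VMA start */

            uint64_t base   = fault_addr & PMD_MASK;               /* _addr */
            uint64_t lo     = vma_start > base ? vma_start : base; /* max(_addr, vm_start) */
            uint64_t offset = lo & ~PMD_MASK;
            unsigned long first_pte = offset >> PAGE_SHIFT;
            unsigned long npages = 0;

            /* One iteration per PTE between the VMA start and the end of the PMD. */
            for (uint64_t a = base + offset; a < base + PMD_SIZE; a += PAGE_SIZE)
                    npages++;

            printf("base=%#llx first_pte_index=%lu pages_scanned=%lu\n",
                   (unsigned long long)base, first_pte, npages);
            return 0;
    }
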
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 489f235..ed85fe3 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -31,7 +31,6 @@
#include <linux/firmware-map.h>
#include <linux/stop_machine.h>
#include <linux/hugetlb.h>
-#include <linux/memblock.h>
#include <asm/tlbflush.h>
@@ -366,7 +365,8 @@ out_fail:
static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
unsigned long end_pfn)
{
- unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat);
+ unsigned long old_pgdat_end_pfn =
+ pgdat->node_start_pfn + pgdat->node_spanned_pages;
if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
pgdat->node_start_pfn = start_pfn;
@@ -402,12 +402,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
static int __meminit __add_section(int nid, struct zone *zone,
unsigned long phys_start_pfn)
{
+ int nr_pages = PAGES_PER_SECTION;
int ret;
if (pfn_valid(phys_start_pfn))
return -EEXIST;
- ret = sparse_add_one_section(zone, phys_start_pfn);
+ ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
if (ret < 0)
return ret;
@@ -578,9 +579,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn,
static void shrink_pgdat_span(struct pglist_data *pgdat,
unsigned long start_pfn, unsigned long end_pfn)
{
- unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
- unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */
- unsigned long pgdat_end_pfn = p;
+ unsigned long pgdat_start_pfn = pgdat->node_start_pfn;
+ unsigned long pgdat_end_pfn =
+ pgdat->node_start_pfn + pgdat->node_spanned_pages;
unsigned long pfn;
struct mem_section *ms;
int nid = pgdat->node_id;
@@ -934,7 +935,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
arg.nr_pages = nr_pages;
node_states_check_changes_online(nr_pages, zone, &arg);
- nid = pfn_to_nid(pfn);
+ nid = page_to_nid(pfn_to_page(pfn));
ret = memory_notify(MEM_GOING_ONLINE, &arg);
ret = notifier_to_errno(ret);
@@ -1043,23 +1044,17 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
}
-/**
- * try_online_node - online a node if offlined
- *
+/*
* called by cpu_up() to online a node without onlined memory.
*/
-int try_online_node(int nid)
+int mem_online_node(int nid)
{
pg_data_t *pgdat;
int ret;
- if (node_online(nid))
- return 0;
-
lock_memory_hotplug();
pgdat = hotadd_new_pgdat(nid, 0);
if (!pgdat) {
- pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
goto out;
}
@@ -1067,12 +1062,6 @@ int try_online_node(int nid)
ret = register_one_node(nid);
BUG_ON(ret);
- if (pgdat->node_zonelists->_zonerefs->zone == NULL) {
- mutex_lock(&zonelists_mutex);
- build_all_zonelists(NULL, NULL);
- mutex_unlock(&zonelists_mutex);
- }
-
out:
unlock_memory_hotplug();
return ret;
@@ -1423,36 +1412,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
}
#endif /* CONFIG_MOVABLE_NODE */
-static int __init cmdline_parse_movable_node(char *p)
-{
-#ifdef CONFIG_MOVABLE_NODE
- /*
- * Memory used by the kernel cannot be hot-removed because Linux
- * cannot migrate the kernel pages. When memory hotplug is
- * enabled, we should prevent memblock from allocating memory
- * for the kernel.
- *
- * ACPI SRAT records all hotpluggable memory ranges. But before
- * SRAT is parsed, we don't know about it.
- *
- * The kernel image is loaded into memory at very early time. We
- * cannot prevent this anyway. So on NUMA system, we set any
- * node the kernel resides in as un-hotpluggable.
- *
- * Since on modern servers, one node could have double-digit
- * gigabytes memory, we can assume the memory around the kernel
- * image is also un-hotpluggable. So before SRAT is parsed, just
- * allocate memory near the kernel image to try the best to keep
- * the kernel away from hotpluggable memory.
- */
- memblock_set_bottom_up(true);
-#else
- pr_warn("movable_node option not supported\n");
-#endif
- return 0;
-}
-early_param("movable_node", cmdline_parse_movable_node);
-
/* check which state of node_states will be changed when offline memory */
static void node_states_check_changes_offline(unsigned long nr_pages,
struct zone *zone, struct memory_notify *arg)
@@ -1743,7 +1702,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn,
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-static int check_memblock_offlined_cb(struct memory_block *mem, void *arg)
+static int is_memblock_offlined_cb(struct memory_block *mem, void *arg)
{
int ret = !is_memblock_offlined(mem);
@@ -1895,7 +1854,7 @@ void __ref remove_memory(int nid, u64 start, u64 size)
* if this is not the case.
*/
ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL,
- check_memblock_offlined_cb);
+ is_memblock_offlined_cb);
if (ret) {
unlock_memory_hotplug();
BUG();
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index eca4a31..0472964 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -525,9 +525,8 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
#ifdef CONFIG_HUGETLB_PAGE
int nid;
struct page *page;
- spinlock_t *ptl;
- ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
+ spin_lock(&vma->vm_mm->page_table_lock);
page = pte_page(huge_ptep_get((pte_t *)pmd));
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
@@ -537,7 +536,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
isolate_huge_page(page, private);
unlock:
- spin_unlock(ptl);
+ spin_unlock(&vma->vm_mm->page_table_lock);
#else
BUG();
#endif
@@ -1126,7 +1125,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
tmp = *from;
while (!nodes_empty(tmp)) {
int s,d;
- int source = NUMA_NO_NODE;
+ int source = -1;
int dest = 0;
for_each_node_mask(s, tmp) {
@@ -1161,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (!node_isset(dest, tmp))
break;
}
- if (source == NUMA_NO_NODE)
+ if (source == -1)
break;
node_clear(source, tmp);
@@ -1680,30 +1679,6 @@ struct mempolicy *get_vma_policy(struct task_struct *task,
return pol;
}
-bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma)
-{
- struct mempolicy *pol = get_task_policy(task);
- if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy) {
- bool ret = false;
-
- pol = vma->vm_ops->get_policy(vma, vma->vm_start);
- if (pol && (pol->flags & MPOL_F_MOF))
- ret = true;
- mpol_cond_put(pol);
-
- return ret;
- } else if (vma->vm_policy) {
- pol = vma->vm_policy;
- }
- }
-
- if (!pol)
- return default_policy.flags & MPOL_F_MOF;
-
- return pol->flags & MPOL_F_MOF;
-}
-
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
{
enum zone_type dynamic_policy_zone = policy_zone;
@@ -1836,7 +1811,7 @@ static unsigned offset_il_node(struct mempolicy *pol,
unsigned nnodes = nodes_weight(pol->v.nodes);
unsigned target;
int c;
- int nid = NUMA_NO_NODE;
+ int nid = -1;
if (!nnodes)
return numa_node_id();
@@ -1873,11 +1848,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
/*
* Return the bit number of a random bit set in the nodemask.
- * (returns NUMA_NO_NODE if nodemask is empty)
+ * (returns -1 if nodemask is empty)
*/
int node_random(const nodemask_t *maskp)
{
- int w, bit = NUMA_NO_NODE;
+ int w, bit = -1;
w = nodes_weight(*maskp);
if (w)
@@ -2302,35 +2277,6 @@ static void sp_free(struct sp_node *n)
kmem_cache_free(sn_cache, n);
}
-#ifdef CONFIG_NUMA_BALANCING
-static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
- /* Never defer a private fault */
- if (cpupid_match_pid(p, last_cpupid))
- return false;
-
- if (p->numa_migrate_deferred) {
- p->numa_migrate_deferred--;
- return true;
- }
- return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
- p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
-}
-#else
-static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
-{
- return false;
-}
-
-static inline void defer_numa_migrate(struct task_struct *p)
-{
-}
-#endif /* CONFIG_NUMA_BALANCING */
-
/**
* mpol_misplaced - check whether current page node is valid in policy
*
@@ -2354,8 +2300,6 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
struct zone *zone;
int curnid = page_to_nid(page);
unsigned long pgoff;
- int thiscpu = raw_smp_processor_id();
- int thisnid = cpu_to_node(thiscpu);
int polnid = -1;
int ret = -1;
@@ -2404,11 +2348,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
/* Migrate the page towards the node whose CPU is referencing it */
if (pol->flags & MPOL_F_MORON) {
- int last_cpupid;
- int this_cpupid;
+ int last_nid;
- polnid = thisnid;
- this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid);
+ polnid = numa_node_id();
/*
* Multi-stage node selection is used in conjunction
@@ -2431,25 +2373,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long
* it less likely we act on an unlikely task<->page
* relation.
*/
- last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
- if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
-
- /* See sysctl_numa_balancing_migrate_deferred comment */
- if (!cpupid_match_pid(current, last_cpupid))
- defer_numa_migrate(current);
-
- goto out;
- }
-
- /*
- * The quadratic filter above reduces extraneous migration
- * of shared pages somewhat. This code reduces it even more,
- * reducing the overhead of page migrations of shared pages.
- * This makes workloads with shared pages rely more on
- * "move task near its memory", and less on "move memory
- * towards its task", which is exactly what we want.
- */
- if (numa_migrate_deferred(current, last_cpupid))
+ last_nid = page_nid_xchg_last(page, polnid);
+ if (last_nid != polnid)
goto out;
}
@@ -2915,45 +2840,62 @@ out:
* @maxlen: length of @buffer
* @pol: pointer to mempolicy to be formatted
*
- * Convert @pol into a string. If @buffer is too short, truncate the string.
- * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the
- * longest flag, "relative", and to display at least a few node ids.
+ * Convert a mempolicy into a string.
+ * Returns the number of characters in buffer (if positive)
+ * or an error (negative)
*/
-void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
{
char *p = buffer;
- nodemask_t nodes = NODE_MASK_NONE;
- unsigned short mode = MPOL_DEFAULT;
- unsigned short flags = 0;
+ int l;
+ nodemask_t nodes;
+ unsigned short mode;
+ unsigned short flags = pol ? pol->flags : 0;
- if (pol && pol != &default_policy) {
+ /*
+ * Sanity check: room for longest mode, flag and some nodes
+ */
+ VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
+
+ if (!pol || pol == &default_policy)
+ mode = MPOL_DEFAULT;
+ else
mode = pol->mode;
- flags = pol->flags;
- }
switch (mode) {
case MPOL_DEFAULT:
+ nodes_clear(nodes);
break;
+
case MPOL_PREFERRED:
+ nodes_clear(nodes);
if (flags & MPOL_F_LOCAL)
mode = MPOL_LOCAL;
else
node_set(pol->v.preferred_node, nodes);
break;
+
case MPOL_BIND:
+ /* Fall through */
case MPOL_INTERLEAVE:
nodes = pol->v.nodes;
break;
+
default:
- WARN_ON_ONCE(1);
- snprintf(p, maxlen, "unknown");
- return;
+ return -EINVAL;
}
- p += snprintf(p, maxlen, "%s", policy_modes[mode]);
+ l = strlen(policy_modes[mode]);
+ if (buffer + maxlen < p + l + 1)
+ return -ENOSPC;
+
+ strcpy(p, policy_modes[mode]);
+ p += l;
if (flags & MPOL_MODE_FLAGS) {
- p += snprintf(p, buffer + maxlen - p, "=");
+ if (buffer + maxlen < p + 2)
+ return -ENOSPC;
+ *p++ = '=';
/*
* Currently, the only defined flags are mutually exclusive
@@ -2965,7 +2907,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
}
if (!nodes_empty(nodes)) {
- p += snprintf(p, buffer + maxlen - p, ":");
+ if (buffer + maxlen < p + 2)
+ return -ENOSPC;
+ *p++ = ':';
p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
}
+ return p - buffer;
}
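
The mpol_to_str revert above goes back to manual bounds checks with strlen/strcpy instead of snprintf. Below is a small userspace sketch of that style of bounded appending; append() and the example mode/flag strings are hypothetical, and only the buffer + maxlen < p + l + 1 check follows the hunk.

    #include <stdio.h>
    #include <string.h>

    /* Append src to a fixed buffer, returning -1 when it would not fit
     * (including the terminating NUL). */
    static int append(char *buffer, int maxlen, char **p, const char *src)
    {
            size_t l = strlen(src);

            if (buffer + maxlen < *p + l + 1)
                    return -1;
            strcpy(*p, src);
            *p += l;
            return 0;
    }

    int main(void)
    {
            char buf[32];
            char *p = buf;

            if (append(buf, sizeof(buf), &p, "interleave") ||
                append(buf, sizeof(buf), &p, "=relative") ||
                append(buf, sizeof(buf), &p, ":0-3"))
                    return 1;
            printf("%s (%d chars)\n", buf, (int)(p - buf));
            return 0;
    }
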
diff --git a/mm/migrate.c b/mm/migrate.c
index bb94004..c046927 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
ptep = huge_pte_offset(mm, addr);
if (!ptep)
goto out;
- ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep);
+ ptl = &mm->page_table_lock;
} else {
pmd = mm_find_pmd(mm, addr);
if (!pmd)
@@ -249,10 +249,9 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd,
__migration_entry_wait(mm, ptep, ptl);
}
-void migration_entry_wait_huge(struct vm_area_struct *vma,
- struct mm_struct *mm, pte_t *pte)
+void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte)
{
- spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte);
+ spinlock_t *ptl = &(mm)->page_table_lock;
__migration_entry_wait(mm, pte, ptl);
}
@@ -442,60 +441,10 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
}
/*
- * Gigantic pages are so large that we do not guarantee that page++ pointer
- * arithmetic will work across the entire page. We need something more
- * specialized.
- */
-static void __copy_gigantic_page(struct page *dst, struct page *src,
- int nr_pages)
-{
- int i;
- struct page *dst_base = dst;
- struct page *src_base = src;
-
- for (i = 0; i < nr_pages; ) {
- cond_resched();
- copy_highpage(dst, src);
-
- i++;
- dst = mem_map_next(dst, dst_base, i);
- src = mem_map_next(src, src_base, i);
- }
-}
-
-static void copy_huge_page(struct page *dst, struct page *src)
-{
- int i;
- int nr_pages;
-
- if (PageHuge(src)) {
- /* hugetlbfs page */
- struct hstate *h = page_hstate(src);
- nr_pages = pages_per_huge_page(h);
-
- if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) {
- __copy_gigantic_page(dst, src, nr_pages);
- return;
- }
- } else {
- /* thp page */
- BUG_ON(!PageTransHuge(src));
- nr_pages = hpage_nr_pages(src);
- }
-
- for (i = 0; i < nr_pages; i++) {
- cond_resched();
- copy_highpage(dst + i, src + i);
- }
-}
-
-/*
* Copy the page to its new location
*/
void migrate_page_copy(struct page *newpage, struct page *page)
{
- int cpupid;
-
if (PageHuge(page) || PageTransHuge(page))
copy_huge_page(newpage, page);
else
@@ -532,13 +481,6 @@ void migrate_page_copy(struct page *newpage, struct page *page)
__set_page_dirty_nobuffers(newpage);
}
- /*
- * Copy NUMA information to the new page, to prevent over-eager
- * future migrations of this same page.
- */
- cpupid = page_cpupid_xchg_last(page, -1);
- page_cpupid_xchg_last(newpage, cpupid);
-
mlock_migrate_page(newpage, page);
ksm_migrate_page(newpage, page);
/*
@@ -1558,7 +1500,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
__GFP_NOWARN) &
~GFP_IOFS, 0);
if (newpage)
- page_cpupid_xchg_last(newpage, page_cpupid_last(page));
+ page_nid_xchg_last(newpage, page_nid_last(page));
return newpage;
}
@@ -1659,8 +1601,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
* node. Caller is expected to have an elevated reference count on
* the page that will be dropped by this function before returning.
*/
-int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
- int node)
+int migrate_misplaced_page(struct page *page, int node)
{
pg_data_t *pgdat = NODE_DATA(node);
int isolated;
@@ -1668,11 +1609,10 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
LIST_HEAD(migratepages);
/*
- * Don't migrate file pages that are mapped in multiple processes
- * with execute permissions as they are probably shared libraries.
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
*/
- if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
- (vma->vm_flags & VM_EXEC))
+ if (page_mapcount(page) != 1)
goto out;
/*
@@ -1715,7 +1655,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
unsigned long address,
struct page *page, int node)
{
- spinlock_t *ptl;
unsigned long haddr = address & HPAGE_PMD_MASK;
pg_data_t *pgdat = NODE_DATA(node);
int isolated = 0;
@@ -1724,6 +1663,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_cache(page);
/*
+ * Don't migrate pages that are mapped in multiple processes.
+ * TODO: Handle false sharing detection instead of this hammer
+ */
+ if (page_mapcount(page) != 1)
+ goto out_dropref;
+
+ /*
* Rate-limit the amount of data that is being migrated to a node.
* Optimal placement is no good if the memory bus is saturated and
* all the time is being spent migrating!
@@ -1736,7 +1682,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
if (!new_page)
goto out_fail;
- page_cpupid_xchg_last(new_page, page_cpupid_last(page));
+ page_nid_xchg_last(new_page, page_nid_last(page));
isolated = numamigrate_isolate_page(pgdat, page);
if (!isolated) {
@@ -1755,9 +1701,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
WARN_ON(PageLRU(new_page));
/* Recheck the target PMD */
- ptl = pmd_lock(mm, pmd);
+ spin_lock(&mm->page_table_lock);
if (unlikely(!pmd_same(*pmd, entry))) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
/* Reverse changes made by migrate_page_copy() */
if (TestClearPageActive(new_page))
@@ -1802,7 +1748,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
* before it's fully transferred to the new page.
*/
mem_cgroup_end_migration(memcg, page, new_page, true);
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
unlock_page(new_page);
unlock_page(page);
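
copy_huge_page, removed from migrate.c above, copies a huge page one base page at a time so that a scheduling point can sit between chunks, and the gigantic-page variant avoids relying on page++ arithmetic across the whole range. A userspace approximation with assumed sizes:

    #include <stdlib.h>
    #include <string.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL

    /* Copy a large region one page at a time; in kernel context a
     * cond_resched() call would sit between the chunks. */
    static void copy_in_pages(unsigned char *dst, const unsigned char *src,
                              unsigned long nr_pages)
    {
            for (unsigned long i = 0; i < nr_pages; i++) {
                    /* preemption point would go here */
                    memcpy(dst + i * PAGE_SIZE, src + i * PAGE_SIZE, PAGE_SIZE);
            }
    }

    int main(void)
    {
            unsigned long pages = 512;               /* a 2 MB "huge page" worth */
            unsigned char *src = malloc(pages * PAGE_SIZE);
            unsigned char *dst = malloc(pages * PAGE_SIZE);

            if (!src || !dst)
                    return 1;
            memset(src, 0xab, pages * PAGE_SIZE);
            copy_in_pages(dst, src, pages);
            printf("copies match: %d\n", !memcmp(dst, src, pages * PAGE_SIZE));
            free(src);
            free(dst);
            return 0;
    }
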
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 68562e9..633c088 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastnid %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
- LAST_CPUPID_WIDTH,
+ LAST_NID_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d\n",
+ "Section %d Node %d Zone %d Lastnid %d\n",
SECTIONS_SHIFT,
NODES_SHIFT,
ZONES_SHIFT,
- LAST_CPUPID_SHIFT);
+ LAST_NID_SHIFT);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
- "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
+ "Section %lu Node %lu Zone %lu Lastnid %lu\n",
(unsigned long)SECTIONS_PGSHIFT,
(unsigned long)NODES_PGSHIFT,
(unsigned long)ZONES_PGSHIFT,
- (unsigned long)LAST_CPUPID_PGSHIFT);
+ (unsigned long)LAST_NID_PGSHIFT);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
"Node/Zone ID: %lu -> %lu\n",
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void)
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
"Node not in page flags");
#endif
-#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS
+#ifdef LAST_NID_NOT_IN_PAGE_FLAGS
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags",
- "Last cpupid not in page flags");
+ "Last nid not in page flags");
#endif
if (SECTIONS_WIDTH) {
diff --git a/mm/mmap.c b/mm/mmap.c
index 834b2d7..9d54851 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -179,12 +179,14 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = vm_commit_limit();
+ allowed = (totalram_pages - hugetlb_total_pages())
+ * sysctl_overcommit_ratio / 100;
/*
* Reserve some for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover
@@ -1297,7 +1299,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
vm_flags &= ~VM_MAYEXEC;
}
- if (!file->f_op->mmap)
+ if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP))
return -EINVAL;
@@ -1854,7 +1856,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
struct vm_area_struct *vma;
struct vm_unmapped_area_info info;
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1863,14 +1865,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = 0;
info.length = len;
- info.low_limit = mm->mmap_base;
+ info.low_limit = TASK_UNMAPPED_BASE;
info.high_limit = TASK_SIZE;
info.align_mask = 0;
return vm_unmapped_area(&info);
@@ -1893,7 +1895,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
struct vm_unmapped_area_info info;
/* requested length too big for entire address space */
- if (len > TASK_SIZE - mmap_min_addr)
+ if (len > TASK_SIZE)
return -ENOMEM;
if (flags & MAP_FIXED)
@@ -1903,14 +1905,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
if (addr) {
addr = PAGE_ALIGN(addr);
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
+ if (TASK_SIZE - len >= addr &&
(!vma || addr + len <= vma->vm_start))
return addr;
}
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
- info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+ info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
info.align_mask = 0;
addr = vm_unmapped_area(&info);
@@ -1949,7 +1951,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
return -ENOMEM;
get_area = current->mm->get_unmapped_area;
- if (file && file->f_op->get_unmapped_area)
+ if (file && file->f_op && file->f_op->get_unmapped_area)
get_area = file->f_op->get_unmapped_area;
addr = get_area(file, addr, len, pgoff, flags);
if (IS_ERR_VALUE(addr))
@@ -2724,8 +2726,7 @@ void exit_mmap(struct mm_struct *mm)
}
vm_unacct_memory(nr_accounted);
- WARN_ON(atomic_long_read(&mm->nr_ptes) >
- (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+ WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
/* Insert vm structure into process list sorted by address
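
The __vm_enough_memory hunk above reverts to computing the commit limit inline: (totalram - hugetlb) * overcommit_ratio / 100, minus the root reserve, plus swap. A standalone sketch with invented numbers (the sysctl names in the comments are labels only, assuming 4 KB pages):

    #include <stdio.h>

    int main(void)
    {
            unsigned long totalram_pages   = 4UL << 18;   /* 4 GB of 4 KB pages */
            unsigned long hugetlb_pages    = 512;
            unsigned long total_swap_pages = 1UL << 18;   /* 1 GB of swap */
            unsigned long overcommit_ratio = 50;          /* sysctl_overcommit_ratio */
            unsigned long admin_reserve_kb = 8192;        /* sysctl_admin_reserve_kbytes */
            int cap_sys_admin = 0;

            unsigned long allowed =
                    (totalram_pages - hugetlb_pages) * overcommit_ratio / 100;

            if (!cap_sys_admin)
                    allowed -= admin_reserve_kb >> (12 - 10);  /* kbytes -> pages */
            allowed += total_swap_pages;

            printf("allowed commit: %lu pages\n", allowed);
            return 0;
    }
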
diff --git a/mm/mmzone.c b/mm/mmzone.c
index bf34fb8..2ac0afb 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec)
INIT_LIST_HEAD(&lruvec->lists[lru]);
}
-#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS)
-int page_cpupid_xchg_last(struct page *page, int cpupid)
+#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS)
+int page_nid_xchg_last(struct page *page, int nid)
{
unsigned long old_flags, flags;
- int last_cpupid;
+ int last_nid;
do {
old_flags = flags = page->flags;
- last_cpupid = page_cpupid_last(page);
+ last_nid = page_nid_last(page);
- flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT);
- flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT;
+ flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT);
+ flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT;
} while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags));
- return last_cpupid;
+ return last_nid;
}
#endif
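
page_nid_xchg_last in the mmzone.c hunk above updates a field packed into page->flags with a cmpxchg retry loop. The userspace sketch below reproduces that pattern with C11 atomics; the field position (NID_SHIFT, NID_MASK) is arbitrary, not the kernel's page-flags layout.

    #include <stdio.h>
    #include <stdatomic.h>

    #define NID_SHIFT 8
    #define NID_MASK  0xffUL

    /* Atomically swap the "last nid" field packed into a flags word and
     * return the previous value, retrying until the compare-exchange wins. */
    static int nid_xchg_last(_Atomic unsigned long *flags, int nid)
    {
            unsigned long old_flags, new_flags;
            int last_nid;

            do {
                    old_flags = atomic_load(flags);
                    last_nid  = (old_flags >> NID_SHIFT) & NID_MASK;
                    new_flags = old_flags & ~(NID_MASK << NID_SHIFT);
                    new_flags |= ((unsigned long)nid & NID_MASK) << NID_SHIFT;
            } while (!atomic_compare_exchange_weak(flags, &old_flags, new_flags));

            return last_nid;
    }

    int main(void)
    {
            _Atomic unsigned long flags = 3UL << NID_SHIFT;   /* last nid == 3 */

            printf("previous nid: %d\n", nid_xchg_last(&flags, 1));
            printf("current  nid: %d\n",
                   (int)((atomic_load(&flags) >> NID_SHIFT) & NID_MASK));
            return 0;
    }
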
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 2666797..412ba2b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -37,12 +37,14 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, unsigned long end, pgprot_t newprot,
- int dirty_accountable, int prot_numa)
+ int dirty_accountable, int prot_numa, bool *ret_all_same_node)
{
struct mm_struct *mm = vma->vm_mm;
pte_t *pte, oldpte;
spinlock_t *ptl;
unsigned long pages = 0;
+ bool all_same_node = true;
+ int last_nid = -1;
pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
arch_enter_lazy_mmu_mode();
@@ -61,7 +63,15 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = vm_normal_page(vma, addr, oldpte);
if (page) {
- if (!pte_numa(oldpte)) {
+ int this_nid = page_to_nid(page);
+ if (last_nid == -1)
+ last_nid = this_nid;
+ if (last_nid != this_nid)
+ all_same_node = false;
+
+ /* only check non-shared pages */
+ if (!pte_numa(oldpte) &&
+ page_mapcount(page) == 1) {
ptent = pte_mknuma(ptent);
updated = true;
}
@@ -94,17 +104,33 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (pte_swp_soft_dirty(oldpte))
newpte = pte_swp_mksoft_dirty(newpte);
set_pte_at(mm, addr, pte, newpte);
-
- pages++;
}
+ pages++;
}
} while (pte++, addr += PAGE_SIZE, addr != end);
arch_leave_lazy_mmu_mode();
pte_unmap_unlock(pte - 1, ptl);
+ *ret_all_same_node = all_same_node;
return pages;
}
+#ifdef CONFIG_NUMA_BALANCING
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ spin_lock(&mm->page_table_lock);
+ set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd));
+ spin_unlock(&mm->page_table_lock);
+}
+#else
+static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr,
+ pmd_t *pmd)
+{
+ BUG();
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pud_t *pud, unsigned long addr, unsigned long end,
pgprot_t newprot, int dirty_accountable, int prot_numa)
@@ -112,39 +138,36 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
pmd_t *pmd;
unsigned long next;
unsigned long pages = 0;
- unsigned long nr_huge_updates = 0;
+ bool all_same_node;
pmd = pmd_offset(pud, addr);
do {
- unsigned long this_pages;
-
next = pmd_addr_end(addr, end);
if (pmd_trans_huge(*pmd)) {
if (next - addr != HPAGE_PMD_SIZE)
split_huge_page_pmd(vma, addr, pmd);
- else {
- int nr_ptes = change_huge_pmd(vma, pmd, addr,
- newprot, prot_numa);
-
- if (nr_ptes) {
- if (nr_ptes == HPAGE_PMD_NR) {
- pages += HPAGE_PMD_NR;
- nr_huge_updates++;
- }
- continue;
- }
+ else if (change_huge_pmd(vma, pmd, addr, newprot,
+ prot_numa)) {
+ pages++;
+ continue;
}
/* fall through */
}
if (pmd_none_or_clear_bad(pmd))
continue;
- this_pages = change_pte_range(vma, pmd, addr, next, newprot,
- dirty_accountable, prot_numa);
- pages += this_pages;
+ pages += change_pte_range(vma, pmd, addr, next, newprot,
+ dirty_accountable, prot_numa, &all_same_node);
+
+ /*
+ * If we are changing protections for NUMA hinting faults then
+ * set pmd_numa if the examined pages were all on the same
+ * node. This allows a regular PMD to be handled as one fault
+ * and effectively batches the taking of the PTL
+ */
+ if (prot_numa && all_same_node)
+ change_pmd_protnuma(vma->vm_mm, addr, pmd);
} while (pmd++, addr = next, addr != end);
- if (nr_huge_updates)
- count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates);
return pages;
}
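
change_pte_range above records the node of the first page it examines and clears all_same_node as soon as a different node shows up, so the caller can batch the whole range behind a single pmd_numa marking. The check in isolation, as a small userspace sketch with made-up node ids:

    #include <stdio.h>
    #include <stdbool.h>

    /* Report whether all per-page node ids in the array match. */
    static bool all_same_node(const int *nid, int n)
    {
            int last_nid = -1;

            for (int i = 0; i < n; i++) {
                    if (last_nid == -1)
                            last_nid = nid[i];
                    if (last_nid != nid[i])
                            return false;
            }
            return true;
    }

    int main(void)
    {
            int a[] = { 1, 1, 1, 1 };
            int b[] = { 1, 1, 0, 1 };

            printf("%d %d\n", all_same_node(a, 4), all_same_node(b, 4));
            return 0;
    }
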
diff --git a/mm/nobootmem.c b/mm/nobootmem.c
index 2c254d3..61107cf 100644
--- a/mm/nobootmem.c
+++ b/mm/nobootmem.c
@@ -82,18 +82,27 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size)
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
- int order;
+ unsigned long i, start_aligned, end_aligned;
+ int order = ilog2(BITS_PER_LONG);
- while (start < end) {
- order = min(MAX_ORDER - 1UL, __ffs(start));
+ start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1);
+ end_aligned = end & ~(BITS_PER_LONG - 1);
- while (start + (1UL << order) > end)
- order--;
+ if (end_aligned <= start_aligned) {
+ for (i = start; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
- __free_pages_bootmem(pfn_to_page(start), order);
-
- start += (1UL << order);
+ return;
}
+
+ for (i = start; i < start_aligned; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
+
+ for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG)
+ __free_pages_bootmem(pfn_to_page(i), order);
+
+ for (i = end_aligned; i < end; i++)
+ __free_pages_bootmem(pfn_to_page(i), 0);
}
static unsigned long __init __free_memory_core(phys_addr_t start,
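
__free_pages_memory, as restored above, splits the PFN range into an unaligned head, a BITS_PER_LONG-aligned middle freed in larger batches, and an unaligned tail. Below is a minimal userspace sketch of that head/middle/tail split; CHUNK and process_one() stand in for BITS_PER_LONG and __free_pages_bootmem().

    #include <stdio.h>

    #define CHUNK 64UL   /* stands in for BITS_PER_LONG */

    static void process_one(unsigned long pfn, unsigned long count)
    {
            printf("free %lu page(s) starting at pfn %lu\n", count, pfn);
    }

    /* Handle the unaligned head and tail a page at a time and the aligned
     * middle in CHUNK-sized batches, as the reverted loop does. */
    static void free_range(unsigned long start, unsigned long end)
    {
            unsigned long start_aligned = (start + CHUNK - 1) & ~(CHUNK - 1);
            unsigned long end_aligned   = end & ~(CHUNK - 1);
            unsigned long i;

            if (end_aligned <= start_aligned) {
                    for (i = start; i < end; i++)
                            process_one(i, 1);
                    return;
            }
            for (i = start; i < start_aligned; i++)
                    process_one(i, 1);
            for (i = start_aligned; i < end_aligned; i += CHUNK)
                    process_one(i, CHUNK);
            for (i = end_aligned; i < end; i++)
                    process_one(i, 1);
    }

    int main(void)
    {
            free_range(100, 300);
            return 0;
    }
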
diff --git a/mm/nommu.c b/mm/nommu.c
index fec093a..ecd1f15 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -937,7 +937,7 @@ static int validate_mmap_request(struct file *file,
struct address_space *mapping;
/* files must support mmap */
- if (!file->f_op->mmap)
+ if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
/* work out if what we've got could possibly be shared
@@ -1948,12 +1948,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
goto error;
}
- allowed = vm_commit_limit();
+ allowed = totalram_pages * sysctl_overcommit_ratio / 100;
/*
* Reserve some 3% for root
*/
if (!cap_sys_admin)
allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10);
+ allowed += total_swap_pages;
/*
* Don't let a single process grow so big a user can't recover
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 1e4a600..6738c47 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
+ points = get_mm_rss(p->mm) + p->mm->nr_ptes +
get_mm_counter(p->mm, MM_SWAPENTS);
task_unlock(p);
@@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
- atomic_long_read(&task->mm->nr_ptes),
+ task->mm->nr_ptes,
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 580a5f0..dd886fa 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly;
void set_pageblock_migratetype(struct page *page, int migratetype)
{
- if (unlikely(page_group_by_mobility_disabled &&
- migratetype < MIGRATE_PCPTYPES))
+
+ if (unlikely(page_group_by_mobility_disabled))
migratetype = MIGRATE_UNMOVABLE;
set_pageblock_flags_group(page, (unsigned long)migratetype,
@@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page)
bad_page(page);
return 1;
}
- page_cpupid_reset_last(page);
+ page_nid_reset_last(page);
if (page->flags & PAGE_FLAGS_CHECK_AT_PREP)
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
return 0;
@@ -1027,10 +1027,6 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
{
int current_order = page_order(page);
- /*
- * When borrowing from MIGRATE_CMA, we need to release the excess
- * buddy pages to CMA itself.
- */
if (is_migrate_cma(fallback_type))
return fallback_type;
@@ -1095,11 +1091,21 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
list_del(&page->lru);
rmv_page_order(page);
+ /*
+ * Borrow the excess buddy pages as well, irrespective
+ * of whether we stole freepages, or took ownership of
+ * the pageblock or not.
+ *
+ * Exception: When borrowing from MIGRATE_CMA, release
+ * the excess buddy pages to CMA itself.
+ */
expand(zone, page, order, current_order, area,
- new_type);
+ is_migrate_cma(migratetype)
+ ? migratetype : start_migratetype);
- trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype, new_type);
+ trace_mm_page_alloc_extfrag(page, order,
+ current_order, start_migratetype, migratetype,
+ new_type == start_migratetype);
return page;
}
@@ -1705,7 +1711,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark,
* comments in mmzone.h. Reduces cache footprint of zonelist scans
* that have to skip over a lot of full or unallowed zones.
*
- * If the zonelist cache is present in the passed zonelist, then
+ * If the zonelist cache is present in the passed in zonelist, then
* returns a pointer to the allowed node mask (either the current
* tasks mems_allowed, or node_states[N_MEMORY].)
*
@@ -2587,7 +2593,7 @@ rebalance:
* running out of options and have to consider going OOM
*/
if (!did_some_progress) {
- if (oom_gfp_allowed(gfp_mask)) {
+ if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
if (oom_killer_disabled)
goto nopage;
/* Coredumps can quickly deplete all memory reserves */
@@ -3875,6 +3881,8 @@ static inline unsigned long wait_table_bits(unsigned long size)
return ffz(~size);
}
+#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
+
/*
* Check if a pageblock contains reserved pages
*/
@@ -4007,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
mminit_verify_page_links(page, zone, nid, pfn);
init_page_count(page);
page_mapcount_reset(page);
- page_cpupid_reset_last(page);
+ page_nid_reset_last(page);
SetPageReserved(page);
/*
* Mark the block movable so that blocks are reserved for
@@ -4258,7 +4266,7 @@ static __meminit void zone_pcp_init(struct zone *zone)
*/
zone->pageset = &boot_pageset;
- if (populated_zone(zone))
+ if (zone->present_pages)
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
zone->name, zone->present_pages,
zone_batchsize(zone));
@@ -5152,7 +5160,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) {
struct zone *zone = &pgdat->node_zones[zone_type];
- if (populated_zone(zone)) {
+ if (zone->present_pages) {
node_set_state(nid, N_HIGH_MEMORY);
if (N_NORMAL_MEMORY != N_HIGH_MEMORY &&
zone_type <= ZONE_NORMAL)
diff --git a/mm/percpu.c b/mm/percpu.c
index 0d10def..8c8e08f 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1706,9 +1706,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
out_free_areas:
for (group = 0; group < ai->nr_groups; group++)
- if (areas[group])
- free_fn(areas[group],
- ai->groups[group].nr_units * ai->unit_size);
+ free_fn(areas[group],
+ ai->groups[group].nr_units * ai->unit_size);
out_free:
pcpu_free_alloc_info(ai);
if (areas)
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index cbb3854..3929a40 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -151,14 +151,14 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address,
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
pgtable_t pgtable)
{
- assert_spin_locked(pmd_lockptr(mm, pmdp));
+ assert_spin_locked(&mm->page_table_lock);
/* FIFO */
- if (!pmd_huge_pte(mm, pmdp))
+ if (!mm->pmd_huge_pte)
INIT_LIST_HEAD(&pgtable->lru);
else
- list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
- pmd_huge_pte(mm, pmdp) = pgtable;
+ list_add(&pgtable->lru, &mm->pmd_huge_pte->lru);
+ mm->pmd_huge_pte = pgtable;
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
#endif
@@ -170,14 +170,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
pgtable_t pgtable;
- assert_spin_locked(pmd_lockptr(mm, pmdp));
+ assert_spin_locked(&mm->page_table_lock);
/* FIFO */
- pgtable = pmd_huge_pte(mm, pmdp);
+ pgtable = mm->pmd_huge_pte;
if (list_empty(&pgtable->lru))
- pmd_huge_pte(mm, pmdp) = NULL;
+ mm->pmd_huge_pte = NULL;
else {
- pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next,
+ mm->pmd_huge_pte = list_entry(pgtable->lru.next,
struct page, lru);
list_del(&pgtable->lru);
}
diff --git a/mm/readahead.c b/mm/readahead.c
index 7cdbb44..e4ed041 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -401,7 +401,6 @@ ondemand_readahead(struct address_space *mapping,
unsigned long req_size)
{
unsigned long max = max_sane_readahead(ra->ra_pages);
- pgoff_t prev_offset;
/*
* start of file
@@ -453,11 +452,8 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
- * trivial case: (offset - prev_offset) == 1
- * unaligned reads: (offset - prev_offset) == 0
*/
- prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT;
- if (offset - prev_offset <= 1UL)
+ if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL)
goto initial_readahead;
/*
@@ -573,7 +569,7 @@ static ssize_t
do_readahead(struct address_space *mapping, struct file *filp,
pgoff_t index, unsigned long nr)
{
- if (!mapping || !mapping->a_ops)
+ if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage)
return -EINVAL;
force_page_cache_readahead(mapping, filp, index, nr);
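The sequential cache miss check earlier in this file treats a page-offset difference of 0 (an unaligned re-read) or 1 (the next page) as sequential; with unsigned arithmetic any other difference, including a backwards seek, wraps to a large value and fails the comparison with 1. A small stand-alone sketch of that predicate (a PAGE_SHIFT of 12 is an assumption for the example, not taken from this tree):

#include <assert.h>
#include <stdint.h>

#define PAGE_SHIFT 12	/* assumed 4 KiB pages for the example */

/* Mirrors the check: previous position in bytes, current offset in pages. */
static int is_sequential_miss(unsigned long offset, uint64_t prev_pos)
{
	return offset - (unsigned long)(prev_pos >> PAGE_SHIFT) <= 1UL;
}

int main(void)
{
	assert(is_sequential_miss(101, 100UL << PAGE_SHIFT));  /* offset = prev + 1 */
	assert(is_sequential_miss(100, 100UL << PAGE_SHIFT));  /* unaligned re-read */
	assert(!is_sequential_miss(99, 100UL << PAGE_SHIFT));  /* backwards seek wraps to a huge value */
	assert(!is_sequential_miss(200, 100UL << PAGE_SHIFT)); /* random jump */
	return 0;
}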
diff --git a/mm/rmap.c b/mm/rmap.c
index 55c8b8d..fd3ee7a 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm,
if (unlikely(PageHuge(page))) {
pte = huge_pte_offset(mm, address);
- ptl = huge_pte_lockptr(page_hstate(page), mm, pte);
+ ptl = &mm->page_table_lock;
goto check;
}
@@ -665,23 +665,25 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
unsigned long *vm_flags)
{
struct mm_struct *mm = vma->vm_mm;
- spinlock_t *ptl;
int referenced = 0;
if (unlikely(PageTransHuge(page))) {
pmd_t *pmd;
+ spin_lock(&mm->page_table_lock);
/*
* rmap might return false positives; we must filter
* these out using page_check_address_pmd().
*/
pmd = page_check_address_pmd(page, mm, address,
- PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
- if (!pmd)
+ PAGE_CHECK_ADDRESS_PMD_FLAG);
+ if (!pmd) {
+ spin_unlock(&mm->page_table_lock);
goto out;
+ }
if (vma->vm_flags & VM_LOCKED) {
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
*mapcount = 0; /* break early from loop */
*vm_flags |= VM_LOCKED;
goto out;
@@ -690,9 +692,10 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
/* go ahead even if the pmd is pmd_trans_splitting() */
if (pmdp_clear_flush_young_notify(vma, address, pmd))
referenced++;
- spin_unlock(ptl);
+ spin_unlock(&mm->page_table_lock);
} else {
pte_t *pte;
+ spinlock_t *ptl;
/*
* rmap might return false positives; we must filter
diff --git a/mm/slab.c b/mm/slab.c
index eb043bf..2580db0 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -164,6 +164,72 @@
static bool pfmemalloc_active __read_mostly;
/*
+ * kmem_bufctl_t:
+ *
+ * Bufctls are used for linking objs within a slab via
+ * linked offsets.
+ *
+ * This implementation relies on "struct page" for locating the cache &
+ * slab an object belongs to.
+ * This allows the bufctl structure to be small (one int), but limits
+ * the number of objects a slab (not a cache) can contain when off-slab
+ * bufctls are used. The limit is the size of the largest general cache
+ * that does not use off-slab slabs.
+ * For 32bit archs with 4 kB pages, this is 56.
+ * This is not serious, as it is only for large objects, when it is unwise
+ * to have too many per slab.
+ * Note: This limit can be raised by introducing a general cache whose size
+ * is less than 512 (PAGE_SIZE<<3), but greater than 256.
+ */
+
+typedef unsigned int kmem_bufctl_t;
+#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0)
+#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1)
+#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2)
+#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3)
+
+/*
+ * struct slab_rcu
+ *
+ * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to
+ * arrange for kmem_freepages to be called via RCU. This is useful if
+ * we need to approach a kernel structure obliquely, from its address
+ * obtained without the usual locking. We can lock the structure to
+ * stabilize it and check it's still at the given address, only if we
+ * can be sure that the memory has not been meanwhile reused for some
+ * other kind of object (which our subsystem's lock might corrupt).
+ *
+ * rcu_read_lock before reading the address, then rcu_read_unlock after
+ * taking the spinlock within the structure expected at that address.
+ */
+struct slab_rcu {
+ struct rcu_head head;
+ struct kmem_cache *cachep;
+ void *addr;
+};
+
+/*
+ * struct slab
+ *
+ * Manages the objs in a slab. Placed either at the beginning of mem allocated
+ * for a slab, or allocated from a general cache.
+ * Slabs are chained into three lists: fully used, partial, fully free slabs.
+ */
+struct slab {
+ union {
+ struct {
+ struct list_head list;
+ unsigned long colouroff;
+ void *s_mem; /* including colour offset */
+ unsigned int inuse; /* num of objs active in slab */
+ kmem_bufctl_t free;
+ unsigned short nodeid;
+ };
+ struct slab_rcu __slab_cover_slab_rcu;
+ };
+};
+
+/*
* struct array_cache
*
* Purpose:
@@ -390,10 +456,18 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
return page->slab_cache;
}
-static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
+static inline struct slab *virt_to_slab(const void *obj)
+{
+ struct page *page = virt_to_head_page(obj);
+
+ VM_BUG_ON(!PageSlab(page));
+ return page->slab_page;
+}
+
+static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab,
unsigned int idx)
{
- return page->s_mem + cache->size * idx;
+ return slab->s_mem + cache->size * idx;
}
/*
@@ -403,9 +477,9 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page,
* reciprocal_divide(offset, cache->reciprocal_buffer_size)
*/
static inline unsigned int obj_to_index(const struct kmem_cache *cache,
- const struct page *page, void *obj)
+ const struct slab *slab, void *obj)
{
- u32 offset = (obj - page->s_mem);
+ u32 offset = (obj - slab->s_mem);
return reciprocal_divide(offset, cache->reciprocal_buffer_size);
}
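index_to_obj() and obj_to_index() above are inverses of each other: object i sits at s_mem + size * i, and the index is recovered by dividing the byte offset by the object size, with reciprocal_divide() standing in for a real division. A rough user-space sketch using simplified stand-in types rather than the kernel structures:

#include <assert.h>
#include <stddef.h>

struct toy_cache { size_t size; };	/* object size, analogous to cachep->size */
struct toy_slab { char *s_mem; };	/* first object, analogous to slabp->s_mem */

static void *index_to_obj(struct toy_cache *c, struct toy_slab *s, unsigned int idx)
{
	return s->s_mem + c->size * idx;
}

static unsigned int obj_to_index(struct toy_cache *c, struct toy_slab *s, void *obj)
{
	/* reciprocal_divide() is a multiply-shift; a plain division gives the same result */
	return (unsigned int)(((char *)obj - s->s_mem) / c->size);
}

int main(void)
{
	static char mem[8 * 64];
	struct toy_cache c = { .size = 64 };
	struct toy_slab s = { .s_mem = mem };

	for (unsigned int i = 0; i < 8; i++)
		assert(obj_to_index(&c, &s, index_to_obj(&c, &s, i)) == i);
	return 0;
}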
@@ -567,7 +641,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep)
static size_t slab_mgmt_size(size_t nr_objs, size_t align)
{
- return ALIGN(nr_objs * sizeof(unsigned int), align);
+ return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align);
}
/*
@@ -586,7 +660,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* on it. For the latter case, the memory allocated for a
* slab is used for:
*
- * - One unsigned int for each object
+ * - The struct slab
+ * - One kmem_bufctl_t for each object
* - Padding to respect alignment of @align
* - @buffer_size bytes for each object
*
@@ -599,6 +674,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
mgmt_size = 0;
nr_objs = slab_size / buffer_size;
+ if (nr_objs > SLAB_LIMIT)
+ nr_objs = SLAB_LIMIT;
} else {
/*
* Ignore padding for the initial guess. The padding
@@ -608,7 +685,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
* into the memory allocation when taking the padding
* into account.
*/
- nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int));
+ nr_objs = (slab_size - sizeof(struct slab)) /
+ (buffer_size + sizeof(kmem_bufctl_t));
/*
* This calculated number will be either the right
@@ -618,6 +696,9 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size,
> slab_size)
nr_objs--;
+ if (nr_objs > SLAB_LIMIT)
+ nr_objs = SLAB_LIMIT;
+
mgmt_size = slab_mgmt_size(nr_objs, align);
}
*num = nr_objs;
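To make the on-slab branch of cache_estimate() concrete: the first guess is (slab_size - sizeof(struct slab)) / (buffer_size + sizeof(kmem_bufctl_t)), and it is reduced by one when the guess plus its aligned management area no longer fits in the slab. A small sketch of that arithmetic (the 4096-byte slab, 48-byte struct slab and 4-byte bufctl are illustrative assumptions, not measured kernel values):

#include <stdio.h>

#define ALIGN(x, a)	(((x) + (a) - 1) & ~((a) - 1))

#define SLAB_SIZE	4096UL	/* one page handed to the cache */
#define SLAB_STRUCT	48UL	/* assumed sizeof(struct slab) */
#define BUFCTL_SIZE	4UL	/* assumed sizeof(kmem_bufctl_t) */

static unsigned long mgmt_size(unsigned long nr_objs, unsigned long align)
{
	return ALIGN(SLAB_STRUCT + nr_objs * BUFCTL_SIZE, align);
}

int main(void)
{
	unsigned long buffer_size = 256, align = 64;

	/* First guess ignores the alignment padding of the management area. */
	unsigned long nr = (SLAB_SIZE - SLAB_STRUCT) / (buffer_size + BUFCTL_SIZE);

	/* Back off by one if the padded management area no longer fits. */
	if (mgmt_size(nr, align) + nr * buffer_size > SLAB_SIZE)
		nr--;

	printf("%lu objects of %lu bytes fit, %lu bytes left over\n",
	       nr, buffer_size, SLAB_SIZE - mgmt_size(nr, align) - nr * buffer_size);
	return 0;
}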
@@ -748,8 +829,10 @@ static struct array_cache *alloc_arraycache(int node, int entries,
return nc;
}
-static inline bool is_slab_pfmemalloc(struct page *page)
+static inline bool is_slab_pfmemalloc(struct slab *slabp)
{
+ struct page *page = virt_to_page(slabp->s_mem);
+
return PageSlabPfmemalloc(page);
}
@@ -758,23 +841,23 @@ static void recheck_pfmemalloc_active(struct kmem_cache *cachep,
struct array_cache *ac)
{
struct kmem_cache_node *n = cachep->node[numa_mem_id()];
- struct page *page;
+ struct slab *slabp;
unsigned long flags;
if (!pfmemalloc_active)
return;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru)
- if (is_slab_pfmemalloc(page))
+ list_for_each_entry(slabp, &n->slabs_full, list)
+ if (is_slab_pfmemalloc(slabp))
goto out;
- list_for_each_entry(page, &n->slabs_partial, lru)
- if (is_slab_pfmemalloc(page))
+ list_for_each_entry(slabp, &n->slabs_partial, list)
+ if (is_slab_pfmemalloc(slabp))
goto out;
- list_for_each_entry(page, &n->slabs_free, lru)
- if (is_slab_pfmemalloc(page))
+ list_for_each_entry(slabp, &n->slabs_free, list)
+ if (is_slab_pfmemalloc(slabp))
goto out;
pfmemalloc_active = false;
@@ -814,8 +897,8 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac,
*/
n = cachep->node[numa_mem_id()];
if (!list_empty(&n->slabs_free) && force_refill) {
- struct page *page = virt_to_head_page(objp);
- ClearPageSlabPfmemalloc(page);
+ struct slab *slabp = virt_to_slab(objp);
+ ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem));
clear_obj_pfmemalloc(&objp);
recheck_pfmemalloc_active(cachep, ac);
return objp;
@@ -1016,7 +1099,8 @@ static void drain_alien_cache(struct kmem_cache *cachep,
static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
{
- int nodeid = page_to_nid(virt_to_page(objp));
+ struct slab *slabp = virt_to_slab(objp);
+ int nodeid = slabp->nodeid;
struct kmem_cache_node *n;
struct array_cache *alien = NULL;
int node;
@@ -1027,7 +1111,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp)
 * Make sure we are not freeing an object from another node to the array
* cache on this cpu.
*/
- if (likely(nodeid == node))
+ if (likely(slabp->nodeid == node))
return 0;
n = cachep->node[node];
@@ -1428,8 +1512,6 @@ void __init kmem_cache_init(void)
{
int i;
- BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) <
- sizeof(struct rcu_head));
kmem_cache = &kmem_cache_boot;
setup_node_pointer(kmem_cache);
@@ -1605,7 +1687,7 @@ static noinline void
slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
{
struct kmem_cache_node *n;
- struct page *page;
+ struct slab *slabp;
unsigned long flags;
int node;
@@ -1624,15 +1706,15 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
continue;
spin_lock_irqsave(&n->list_lock, flags);
- list_for_each_entry(page, &n->slabs_full, lru) {
+ list_for_each_entry(slabp, &n->slabs_full, list) {
active_objs += cachep->num;
active_slabs++;
}
- list_for_each_entry(page, &n->slabs_partial, lru) {
- active_objs += page->active;
+ list_for_each_entry(slabp, &n->slabs_partial, list) {
+ active_objs += slabp->inuse;
active_slabs++;
}
- list_for_each_entry(page, &n->slabs_free, lru)
+ list_for_each_entry(slabp, &n->slabs_free, list)
num_slabs++;
free_objects += n->free_objects;
@@ -1654,11 +1736,19 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid)
* did not request dmaable memory, we might get it, but that
* would be relatively rare and ignorable.
*/
-static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
- int nodeid)
+static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
{
struct page *page;
int nr_pages;
+ int i;
+
+#ifndef CONFIG_MMU
+ /*
+ * Nommu uses slabs for process anonymous memory allocations, and thus
+ * requires __GFP_COMP to properly refcount higher order allocations
+ */
+ flags |= __GFP_COMP;
+#endif
flags |= cachep->allocflags;
if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
@@ -1682,9 +1772,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
else
add_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_pages);
- __SetPageSlab(page);
- if (page->pfmemalloc)
- SetPageSlabPfmemalloc(page);
+ for (i = 0; i < nr_pages; i++) {
+ __SetPageSlab(page + i);
+
+ if (page->pfmemalloc)
+ SetPageSlabPfmemalloc(page + i);
+ }
memcg_bind_pages(cachep, cachep->gfporder);
if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
@@ -1696,15 +1789,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
kmemcheck_mark_unallocated_pages(page, nr_pages);
}
- return page;
+ return page_address(page);
}
/*
* Interface to system's page release.
*/
-static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
+static void kmem_freepages(struct kmem_cache *cachep, void *addr)
{
- const unsigned long nr_freed = (1 << cachep->gfporder);
+ unsigned long i = (1 << cachep->gfporder);
+ struct page *page = virt_to_page(addr);
+ const unsigned long nr_freed = i;
kmemcheck_free_shadow(page, cachep->gfporder);
@@ -1714,28 +1809,27 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
else
sub_zone_page_state(page_zone(page),
NR_SLAB_UNRECLAIMABLE, nr_freed);
-
- BUG_ON(!PageSlab(page));
- __ClearPageSlabPfmemalloc(page);
- __ClearPageSlab(page);
- page_mapcount_reset(page);
- page->mapping = NULL;
+ while (i--) {
+ BUG_ON(!PageSlab(page));
+ __ClearPageSlabPfmemalloc(page);
+ __ClearPageSlab(page);
+ page++;
+ }
memcg_release_pages(cachep, cachep->gfporder);
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += nr_freed;
- __free_memcg_kmem_pages(page, cachep->gfporder);
+ free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
}
static void kmem_rcu_free(struct rcu_head *head)
{
- struct kmem_cache *cachep;
- struct page *page;
-
- page = container_of(head, struct page, rcu_head);
- cachep = page->slab_cache;
+ struct slab_rcu *slab_rcu = (struct slab_rcu *)head;
+ struct kmem_cache *cachep = slab_rcu->cachep;
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, slab_rcu->addr);
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->slabp_cache, slab_rcu);
}
#if DEBUG
@@ -1884,19 +1978,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
/* Print some data about the neighboring objects, if they
* exist:
*/
- struct page *page = virt_to_head_page(objp);
+ struct slab *slabp = virt_to_slab(objp);
unsigned int objnr;
- objnr = obj_to_index(cachep, page, objp);
+ objnr = obj_to_index(cachep, slabp, objp);
if (objnr) {
- objp = index_to_obj(cachep, page, objnr - 1);
+ objp = index_to_obj(cachep, slabp, objnr - 1);
realobj = (char *)objp + obj_offset(cachep);
printk(KERN_ERR "Prev obj: start=%p, len=%d\n",
realobj, size);
print_objinfo(cachep, objp, 2);
}
if (objnr + 1 < cachep->num) {
- objp = index_to_obj(cachep, page, objnr + 1);
+ objp = index_to_obj(cachep, slabp, objnr + 1);
realobj = (char *)objp + obj_offset(cachep);
printk(KERN_ERR "Next obj: start=%p, len=%d\n",
realobj, size);
@@ -1907,12 +2001,11 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp)
#endif
#if DEBUG
-static void slab_destroy_debugcheck(struct kmem_cache *cachep,
- struct page *page)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, page, i);
+ void *objp = index_to_obj(cachep, slabp, i);
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
@@ -1937,8 +2030,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
}
}
#else
-static void slab_destroy_debugcheck(struct kmem_cache *cachep,
- struct page *page)
+static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp)
{
}
#endif
@@ -1952,34 +2044,23 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep,
* Before calling the slab must have been unlinked from the cache. The
* cache-lock is not held/needed.
*/
-static void slab_destroy(struct kmem_cache *cachep, struct page *page)
+static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp)
{
- void *freelist;
+ void *addr = slabp->s_mem - slabp->colouroff;
- freelist = page->freelist;
- slab_destroy_debugcheck(cachep, page);
+ slab_destroy_debugcheck(cachep, slabp);
if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) {
- struct rcu_head *head;
-
- /*
- * RCU free overloads the RCU head over the LRU.
- * slab_page has been overloaded over the LRU,
- * however it is not used from now on so that
- * we can use it safely.
- */
- head = (void *)&page->rcu_head;
- call_rcu(head, kmem_rcu_free);
+ struct slab_rcu *slab_rcu;
+ slab_rcu = (struct slab_rcu *)slabp;
+ slab_rcu->cachep = cachep;
+ slab_rcu->addr = addr;
+ call_rcu(&slab_rcu->head, kmem_rcu_free);
} else {
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, addr);
+ if (OFF_SLAB(cachep))
+ kmem_cache_free(cachep->slabp_cache, slabp);
}
-
- /*
- * From now on, we don't use freelist
- * although actual page can be freed in rcu context
- */
- if (OFF_SLAB(cachep))
- kmem_cache_free(cachep->freelist_cache, freelist);
}
/**
@@ -2016,8 +2097,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
* use off-slab slabs. Needed to avoid a possible
* looping condition in cache_grow().
*/
- offslab_limit = size;
- offslab_limit /= sizeof(unsigned int);
+ offslab_limit = size - sizeof(struct slab);
+ offslab_limit /= sizeof(kmem_bufctl_t);
if (num > offslab_limit)
break;
@@ -2139,7 +2220,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp)
int
__kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
{
- size_t left_over, freelist_size, ralign;
+ size_t left_over, slab_size, ralign;
gfp_t gfp;
int err;
size_t size = cachep->size;
@@ -2258,21 +2339,22 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (!cachep->num)
return -E2BIG;
- freelist_size =
- ALIGN(cachep->num * sizeof(unsigned int), cachep->align);
+ slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t)
+ + sizeof(struct slab), cachep->align);
/*
* If the slab has been placed off-slab, and we have enough space then
* move it on-slab. This is at the expense of any extra colouring.
*/
- if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) {
+ if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) {
flags &= ~CFLGS_OFF_SLAB;
- left_over -= freelist_size;
+ left_over -= slab_size;
}
if (flags & CFLGS_OFF_SLAB) {
/* really off slab. No need for manual alignment */
- freelist_size = cachep->num * sizeof(unsigned int);
+ slab_size =
+ cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab);
#ifdef CONFIG_PAGE_POISONING
/* If we're going to use the generic kernel_map_pages()
@@ -2289,16 +2371,16 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
if (cachep->colour_off < cachep->align)
cachep->colour_off = cachep->align;
cachep->colour = left_over / cachep->colour_off;
- cachep->freelist_size = freelist_size;
+ cachep->slab_size = slab_size;
cachep->flags = flags;
- cachep->allocflags = __GFP_COMP;
+ cachep->allocflags = 0;
if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA))
cachep->allocflags |= GFP_DMA;
cachep->size = size;
cachep->reciprocal_buffer_size = reciprocal_value(size);
if (flags & CFLGS_OFF_SLAB) {
- cachep->freelist_cache = kmalloc_slab(freelist_size, 0u);
+ cachep->slabp_cache = kmalloc_slab(slab_size, 0u);
/*
* This is a possibility for one of the malloc_sizes caches.
* But since we go off slab only for object size greater than
@@ -2306,7 +2388,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
* this should not happen at all.
* But leave a BUG_ON for some lucky dude.
*/
- BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache));
+ BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache));
}
err = setup_cpu_cache(cachep, gfp);
@@ -2412,7 +2494,7 @@ static int drain_freelist(struct kmem_cache *cache,
{
struct list_head *p;
int nr_freed;
- struct page *page;
+ struct slab *slabp;
nr_freed = 0;
while (nr_freed < tofree && !list_empty(&n->slabs_free)) {
@@ -2424,18 +2506,18 @@ static int drain_freelist(struct kmem_cache *cache,
goto out;
}
- page = list_entry(p, struct page, lru);
+ slabp = list_entry(p, struct slab, list);
#if DEBUG
- BUG_ON(page->active);
+ BUG_ON(slabp->inuse);
#endif
- list_del(&page->lru);
+ list_del(&slabp->list);
/*
* Safe to drop the lock. The slab is no longer linked
* to the cache.
*/
n->free_objects -= cache->num;
spin_unlock_irq(&n->list_lock);
- slab_destroy(cache, page);
+ slab_destroy(cache, slabp);
nr_freed++;
}
out:
@@ -2518,42 +2600,52 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
* descriptors in kmem_cache_create, we search through the malloc_sizes array.
* If we are creating a malloc_sizes cache here it would not be visible to
* kmem_find_general_cachep till the initialization is complete.
- * Hence we cannot have freelist_cache same as the original cache.
+ * Hence we cannot have slabp_cache same as the original cache.
*/
-static void *alloc_slabmgmt(struct kmem_cache *cachep,
- struct page *page, int colour_off,
- gfp_t local_flags, int nodeid)
+static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp,
+ int colour_off, gfp_t local_flags,
+ int nodeid)
{
- void *freelist;
- void *addr = page_address(page);
+ struct slab *slabp;
if (OFF_SLAB(cachep)) {
/* Slab management obj is off-slab. */
- freelist = kmem_cache_alloc_node(cachep->freelist_cache,
+ slabp = kmem_cache_alloc_node(cachep->slabp_cache,
local_flags, nodeid);
- if (!freelist)
+ /*
+ * If the first object in the slab is leaked (it's allocated
+ * but no one has a reference to it), we want to make sure
+ * kmemleak does not treat the ->s_mem pointer as a reference
+ * to the object. Otherwise we will not report the leak.
+ */
+ kmemleak_scan_area(&slabp->list, sizeof(struct list_head),
+ local_flags);
+ if (!slabp)
return NULL;
} else {
- freelist = addr + colour_off;
- colour_off += cachep->freelist_size;
+ slabp = objp + colour_off;
+ colour_off += cachep->slab_size;
}
- page->active = 0;
- page->s_mem = addr + colour_off;
- return freelist;
+ slabp->inuse = 0;
+ slabp->colouroff = colour_off;
+ slabp->s_mem = objp + colour_off;
+ slabp->nodeid = nodeid;
+ slabp->free = 0;
+ return slabp;
}
-static inline unsigned int *slab_freelist(struct page *page)
+static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp)
{
- return (unsigned int *)(page->freelist);
+ return (kmem_bufctl_t *) (slabp + 1);
}
static void cache_init_objs(struct kmem_cache *cachep,
- struct page *page)
+ struct slab *slabp)
{
int i;
for (i = 0; i < cachep->num; i++) {
- void *objp = index_to_obj(cachep, page, i);
+ void *objp = index_to_obj(cachep, slabp, i);
#if DEBUG
/* need to poison the objs? */
if (cachep->flags & SLAB_POISON)
@@ -2589,8 +2681,9 @@ static void cache_init_objs(struct kmem_cache *cachep,
if (cachep->ctor)
cachep->ctor(objp);
#endif
- slab_freelist(page)[i] = i;
+ slab_bufctl(slabp)[i] = i + 1;
}
+ slab_bufctl(slabp)[i - 1] = BUFCTL_END;
}
static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
@@ -2603,41 +2696,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags)
}
}
-static void *slab_get_obj(struct kmem_cache *cachep, struct page *page,
+static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp,
int nodeid)
{
- void *objp;
+ void *objp = index_to_obj(cachep, slabp, slabp->free);
+ kmem_bufctl_t next;
- objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]);
- page->active++;
+ slabp->inuse++;
+ next = slab_bufctl(slabp)[slabp->free];
#if DEBUG
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+ WARN_ON(slabp->nodeid != nodeid);
#endif
+ slabp->free = next;
return objp;
}
-static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
+static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp,
void *objp, int nodeid)
{
- unsigned int objnr = obj_to_index(cachep, page, objp);
-#if DEBUG
- unsigned int i;
+ unsigned int objnr = obj_to_index(cachep, slabp, objp);
+#if DEBUG
/* Verify that the slab belongs to the intended node */
- WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid);
+ WARN_ON(slabp->nodeid != nodeid);
- /* Verify double free bug */
- for (i = page->active; i < cachep->num; i++) {
- if (slab_freelist(page)[i] == objnr) {
- printk(KERN_ERR "slab: double free detected in cache "
- "'%s', objp %p\n", cachep->name, objp);
- BUG();
- }
+ if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) {
+ printk(KERN_ERR "slab: double free detected in cache "
+ "'%s', objp %p\n", cachep->name, objp);
+ BUG();
}
#endif
- page->active--;
- slab_freelist(page)[page->active] = objnr;
+ slab_bufctl(slabp)[objnr] = slabp->free;
+ slabp->free = objnr;
+ slabp->inuse--;
}
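The bufctl array restored in this file is an index-based free list: cache_init_objs() chains entry i to i + 1 and terminates the chain with BUFCTL_END, slab_get_obj() pops slabp->free, and slab_put_obj() pushes the freed index back. A compact user-space sketch of that free-list discipline with toy types (object payloads omitted):

#include <assert.h>

#define BUFCTL_END	((unsigned int)~0U)
#define NUM_OBJS	4

struct toy_slab {
	unsigned int free;		/* index of the first free object */
	unsigned int inuse;
	unsigned int bufctl[NUM_OBJS];	/* bufctl[i] = next free index after i */
};

static void init_objs(struct toy_slab *s)	/* like cache_init_objs() */
{
	for (unsigned int i = 0; i < NUM_OBJS; i++)
		s->bufctl[i] = i + 1;
	s->bufctl[NUM_OBJS - 1] = BUFCTL_END;
	s->free = 0;
	s->inuse = 0;
}

static unsigned int get_obj(struct toy_slab *s)	/* like slab_get_obj() */
{
	unsigned int idx = s->free;

	s->free = s->bufctl[idx];
	s->inuse++;
	return idx;
}

static void put_obj(struct toy_slab *s, unsigned int idx)	/* like slab_put_obj() */
{
	s->bufctl[idx] = s->free;
	s->free = idx;
	s->inuse--;
}

int main(void)
{
	struct toy_slab s;

	init_objs(&s);
	unsigned int first = get_obj(&s);	/* 0: allocation walks the chain in order */
	unsigned int second = get_obj(&s);	/* 1 */
	assert(first == 0 && second == 1);

	put_obj(&s, first);
	unsigned int reused = get_obj(&s);

	assert(reused == first);		/* a freed index is handed out again first */
	assert(s.inuse == 2);
	return 0;
}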
/*
@@ -2645,11 +2738,23 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page,
* for the slab allocator to be able to lookup the cache and slab of a
* virtual address for kfree, ksize, and slab debugging.
*/
-static void slab_map_pages(struct kmem_cache *cache, struct page *page,
- void *freelist)
+static void slab_map_pages(struct kmem_cache *cache, struct slab *slab,
+ void *addr)
{
- page->slab_cache = cache;
- page->freelist = freelist;
+ int nr_pages;
+ struct page *page;
+
+ page = virt_to_page(addr);
+
+ nr_pages = 1;
+ if (likely(!PageCompound(page)))
+ nr_pages <<= cache->gfporder;
+
+ do {
+ page->slab_cache = cache;
+ page->slab_page = slab;
+ page++;
+ } while (--nr_pages);
}
/*
@@ -2657,9 +2762,9 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page,
* kmem_cache_alloc() when there are no active objs left in a cache.
*/
static int cache_grow(struct kmem_cache *cachep,
- gfp_t flags, int nodeid, struct page *page)
+ gfp_t flags, int nodeid, void *objp)
{
- void *freelist;
+ struct slab *slabp;
size_t offset;
gfp_t local_flags;
struct kmem_cache_node *n;
@@ -2700,20 +2805,20 @@ static int cache_grow(struct kmem_cache *cachep,
* Get mem for the objs. Attempt to allocate a physical page from
* 'nodeid'.
*/
- if (!page)
- page = kmem_getpages(cachep, local_flags, nodeid);
- if (!page)
+ if (!objp)
+ objp = kmem_getpages(cachep, local_flags, nodeid);
+ if (!objp)
goto failed;
/* Get slab management. */
- freelist = alloc_slabmgmt(cachep, page, offset,
+ slabp = alloc_slabmgmt(cachep, objp, offset,
local_flags & ~GFP_CONSTRAINT_MASK, nodeid);
- if (!freelist)
+ if (!slabp)
goto opps1;
- slab_map_pages(cachep, page, freelist);
+ slab_map_pages(cachep, slabp, objp);
- cache_init_objs(cachep, page);
+ cache_init_objs(cachep, slabp);
if (local_flags & __GFP_WAIT)
local_irq_disable();
@@ -2721,13 +2826,13 @@ static int cache_grow(struct kmem_cache *cachep,
spin_lock(&n->list_lock);
/* Make slab active. */
- list_add_tail(&page->lru, &(n->slabs_free));
+ list_add_tail(&slabp->list, &(n->slabs_free));
STATS_INC_GROWN(cachep);
n->free_objects += cachep->num;
spin_unlock(&n->list_lock);
return 1;
opps1:
- kmem_freepages(cachep, page);
+ kmem_freepages(cachep, objp);
failed:
if (local_flags & __GFP_WAIT)
local_irq_disable();
@@ -2775,8 +2880,9 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj)
static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
- unsigned int objnr;
struct page *page;
+ unsigned int objnr;
+ struct slab *slabp;
BUG_ON(virt_to_cache(objp) != cachep);
@@ -2784,6 +2890,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
kfree_debugcheck(objp);
page = virt_to_head_page(objp);
+ slabp = page->slab_page;
+
if (cachep->flags & SLAB_RED_ZONE) {
verify_redzone_free(cachep, objp);
*dbg_redzone1(cachep, objp) = RED_INACTIVE;
@@ -2792,11 +2900,14 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
if (cachep->flags & SLAB_STORE_USER)
*dbg_userword(cachep, objp) = (void *)caller;
- objnr = obj_to_index(cachep, page, objp);
+ objnr = obj_to_index(cachep, slabp, objp);
BUG_ON(objnr >= cachep->num);
- BUG_ON(objp != index_to_obj(cachep, page, objnr));
+ BUG_ON(objp != index_to_obj(cachep, slabp, objnr));
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ slab_bufctl(slabp)[objnr] = BUFCTL_FREE;
+#endif
if (cachep->flags & SLAB_POISON) {
#ifdef CONFIG_DEBUG_PAGEALLOC
if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) {
@@ -2813,9 +2924,33 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp,
return objp;
}
+static void check_slabp(struct kmem_cache *cachep, struct slab *slabp)
+{
+ kmem_bufctl_t i;
+ int entries = 0;
+
+ /* Check slab's freelist to see if this obj is there. */
+ for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
+ entries++;
+ if (entries > cachep->num || i >= cachep->num)
+ goto bad;
+ }
+ if (entries != cachep->num - slabp->inuse) {
+bad:
+ printk(KERN_ERR "slab: Internal list corruption detected in "
+ "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n",
+ cachep->name, cachep->num, slabp, slabp->inuse,
+ print_tainted());
+ print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp,
+ sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t),
+ 1);
+ BUG();
+ }
+}
#else
#define kfree_debugcheck(x) do { } while(0)
#define cache_free_debugcheck(x,objp,z) (objp)
+#define check_slabp(x,y) do { } while(0)
#endif
static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags,
@@ -2854,7 +2989,7 @@ retry:
while (batchcount > 0) {
struct list_head *entry;
- struct page *page;
+ struct slab *slabp;
/* Get slab alloc is to come from. */
entry = n->slabs_partial.next;
if (entry == &n->slabs_partial) {
@@ -2864,7 +2999,8 @@ retry:
goto must_grow;
}
- page = list_entry(entry, struct page, lru);
+ slabp = list_entry(entry, struct slab, list);
+ check_slabp(cachep, slabp);
check_spinlock_acquired(cachep);
/*
@@ -2872,23 +3008,24 @@ retry:
* there must be at least one object available for
* allocation.
*/
- BUG_ON(page->active >= cachep->num);
+ BUG_ON(slabp->inuse >= cachep->num);
- while (page->active < cachep->num && batchcount--) {
+ while (slabp->inuse < cachep->num && batchcount--) {
STATS_INC_ALLOCED(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- ac_put_obj(cachep, ac, slab_get_obj(cachep, page,
+ ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp,
node));
}
+ check_slabp(cachep, slabp);
/* move slabp to correct slabp list: */
- list_del(&page->lru);
- if (page->active == cachep->num)
- list_add(&page->list, &n->slabs_full);
+ list_del(&slabp->list);
+ if (slabp->free == BUFCTL_END)
+ list_add(&slabp->list, &n->slabs_full);
else
- list_add(&page->list, &n->slabs_partial);
+ list_add(&slabp->list, &n->slabs_partial);
}
must_grow:
@@ -2960,6 +3097,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep,
*dbg_redzone1(cachep, objp) = RED_ACTIVE;
*dbg_redzone2(cachep, objp) = RED_ACTIVE;
}
+#ifdef CONFIG_DEBUG_SLAB_LEAK
+ {
+ struct slab *slabp;
+ unsigned objnr;
+
+ slabp = virt_to_head_page(objp)->slab_page;
+ objnr = (unsigned)(objp - slabp->s_mem) / cachep->size;
+ slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE;
+ }
+#endif
objp += obj_offset(cachep);
if (cachep->ctor && cachep->flags & SLAB_POISON)
cachep->ctor(objp);
@@ -3101,20 +3248,18 @@ retry:
* We may trigger various forms of reclaim on the allowed
* set and go into memory reserves if necessary.
*/
- struct page *page;
-
if (local_flags & __GFP_WAIT)
local_irq_enable();
kmem_flagcheck(cache, flags);
- page = kmem_getpages(cache, local_flags, numa_mem_id());
+ obj = kmem_getpages(cache, local_flags, numa_mem_id());
if (local_flags & __GFP_WAIT)
local_irq_disable();
- if (page) {
+ if (obj) {
/*
* Insert into the appropriate per node queues
*/
- nid = page_to_nid(page);
- if (cache_grow(cache, flags, nid, page)) {
+ nid = page_to_nid(virt_to_page(obj));
+ if (cache_grow(cache, flags, nid, obj)) {
obj = ____cache_alloc_node(cache,
flags | GFP_THISNODE, nid);
if (!obj)
@@ -3143,7 +3288,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags,
int nodeid)
{
struct list_head *entry;
- struct page *page;
+ struct slab *slabp;
struct kmem_cache_node *n;
void *obj;
int x;
@@ -3163,24 +3308,26 @@ retry:
goto must_grow;
}
- page = list_entry(entry, struct page, lru);
+ slabp = list_entry(entry, struct slab, list);
check_spinlock_acquired_node(cachep, nodeid);
+ check_slabp(cachep, slabp);
STATS_INC_NODEALLOCS(cachep);
STATS_INC_ACTIVE(cachep);
STATS_SET_HIGH(cachep);
- BUG_ON(page->active == cachep->num);
+ BUG_ON(slabp->inuse == cachep->num);
- obj = slab_get_obj(cachep, page, nodeid);
+ obj = slab_get_obj(cachep, slabp, nodeid);
+ check_slabp(cachep, slabp);
n->free_objects--;
/* move slabp to correct slabp list: */
- list_del(&page->lru);
+ list_del(&slabp->list);
- if (page->active == cachep->num)
- list_add(&page->lru, &n->slabs_full);
+ if (slabp->free == BUFCTL_END)
+ list_add(&slabp->list, &n->slabs_full);
else
- list_add(&page->lru, &n->slabs_partial);
+ list_add(&slabp->list, &n->slabs_partial);
spin_unlock(&n->list_lock);
goto done;
@@ -3330,21 +3477,23 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
for (i = 0; i < nr_objects; i++) {
void *objp;
- struct page *page;
+ struct slab *slabp;
clear_obj_pfmemalloc(&objpp[i]);
objp = objpp[i];
- page = virt_to_head_page(objp);
+ slabp = virt_to_slab(objp);
n = cachep->node[node];
- list_del(&page->lru);
+ list_del(&slabp->list);
check_spinlock_acquired_node(cachep, node);
- slab_put_obj(cachep, page, objp, node);
+ check_slabp(cachep, slabp);
+ slab_put_obj(cachep, slabp, objp, node);
STATS_DEC_ACTIVE(cachep);
n->free_objects++;
+ check_slabp(cachep, slabp);
/* fixup slab chains */
- if (page->active == 0) {
+ if (slabp->inuse == 0) {
if (n->free_objects > n->free_limit) {
n->free_objects -= cachep->num;
/* No need to drop any previously held
@@ -3353,16 +3502,16 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects,
* a different cache, refer to comments before
* alloc_slabmgmt.
*/
- slab_destroy(cachep, page);
+ slab_destroy(cachep, slabp);
} else {
- list_add(&page->lru, &n->slabs_free);
+ list_add(&slabp->list, &n->slabs_free);
}
} else {
/* Unconditionally move a slab to the end of the
* partial list on free - maximum time for the
* other objects to be freed, too.
*/
- list_add_tail(&page->lru, &n->slabs_partial);
+ list_add_tail(&slabp->list, &n->slabs_partial);
}
}
}
@@ -3402,10 +3551,10 @@ free_done:
p = n->slabs_free.next;
while (p != &(n->slabs_free)) {
- struct page *page;
+ struct slab *slabp;
- page = list_entry(p, struct page, lru);
- BUG_ON(page->active);
+ slabp = list_entry(p, struct slab, list);
+ BUG_ON(slabp->inuse);
i++;
p = p->next;
@@ -3833,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
VM_BUG_ON(!mutex_is_locked(&slab_mutex));
for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(cachep, i);
+ c = cache_from_memcg(cachep, i);
if (c)
/* return value determined by the parent cache only */
__do_tune_cpucache(c, limit, batchcount, shared, gfp);
@@ -4009,7 +4158,7 @@ out:
#ifdef CONFIG_SLABINFO
void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
{
- struct page *page;
+ struct slab *slabp;
unsigned long active_objs;
unsigned long num_objs;
unsigned long active_slabs = 0;
@@ -4029,23 +4178,23 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
check_irq_on();
spin_lock_irq(&n->list_lock);
- list_for_each_entry(page, &n->slabs_full, lru) {
- if (page->active != cachep->num && !error)
+ list_for_each_entry(slabp, &n->slabs_full, list) {
+ if (slabp->inuse != cachep->num && !error)
error = "slabs_full accounting error";
active_objs += cachep->num;
active_slabs++;
}
- list_for_each_entry(page, &n->slabs_partial, lru) {
- if (page->active == cachep->num && !error)
- error = "slabs_partial accounting error";
- if (!page->active && !error)
- error = "slabs_partial accounting error";
- active_objs += page->active;
+ list_for_each_entry(slabp, &n->slabs_partial, list) {
+ if (slabp->inuse == cachep->num && !error)
+ error = "slabs_partial inuse accounting error";
+ if (!slabp->inuse && !error)
+ error = "slabs_partial/inuse accounting error";
+ active_objs += slabp->inuse;
active_slabs++;
}
- list_for_each_entry(page, &n->slabs_free, lru) {
- if (page->active && !error)
- error = "slabs_free accounting error";
+ list_for_each_entry(slabp, &n->slabs_free, list) {
+ if (slabp->inuse && !error)
+ error = "slabs_free/inuse accounting error";
num_slabs++;
}
free_objects += n->free_objects;
@@ -4197,27 +4346,15 @@ static inline int add_caller(unsigned long *n, unsigned long v)
return 1;
}
-static void handle_slab(unsigned long *n, struct kmem_cache *c,
- struct page *page)
+static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s)
{
void *p;
- int i, j;
-
+ int i;
if (n[0] == n[1])
return;
- for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) {
- bool active = true;
-
- for (j = page->active; j < c->num; j++) {
- /* Skip freed item */
- if (slab_freelist(page)[j] == i) {
- active = false;
- break;
- }
- }
- if (!active)
+ for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) {
+ if (slab_bufctl(s)[i] != BUFCTL_ACTIVE)
continue;
-
if (!add_caller(n, (unsigned long)*dbg_userword(c, p)))
return;
}
@@ -4242,7 +4379,7 @@ static void show_symbol(struct seq_file *m, unsigned long address)
static int leaks_show(struct seq_file *m, void *p)
{
struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list);
- struct page *page;
+ struct slab *slabp;
struct kmem_cache_node *n;
const char *name;
unsigned long *x = m->private;
@@ -4266,10 +4403,10 @@ static int leaks_show(struct seq_file *m, void *p)
check_irq_on();
spin_lock_irq(&n->list_lock);
- list_for_each_entry(page, &n->slabs_full, lru)
- handle_slab(x, cachep, page);
- list_for_each_entry(page, &n->slabs_partial, lru)
- handle_slab(x, cachep, page);
+ list_for_each_entry(slabp, &n->slabs_full, list)
+ handle_slab(x, cachep, slabp);
+ list_for_each_entry(slabp, &n->slabs_partial, list)
+ handle_slab(x, cachep, slabp);
spin_unlock_irq(&n->list_lock);
}
name = cachep->name;
diff --git a/mm/slab.h b/mm/slab.h
index 0859c42..a535033 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -160,8 +160,7 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
+static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
{
if (!s->memcg_params)
return NULL;
@@ -205,8 +204,7 @@ static inline const char *cache_name(struct kmem_cache *s)
return s->name;
}
-static inline struct kmem_cache *
-cache_from_memcg_idx(struct kmem_cache *s, int idx)
+static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
{
return NULL;
}
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 0b7bb39..e2e98af 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -571,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
return;
for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
+ c = cache_from_memcg(s, i);
if (!c)
continue;
diff --git a/mm/slub.c b/mm/slub.c
index 545a170..c3eb3d3 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -155,7 +155,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s)
/*
* Maximum number of desirable partial slabs.
* The existence of more partial slabs makes kmem_cache_shrink
- * sort the partial list by the number of objects in use.
+ * sort the partial list by the number of objects in use.
*/
#define MAX_PARTIAL 10
@@ -933,16 +933,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
* Hooks for other subsystems that check memory allocations. In a typical
* production configuration these hooks all should produce no code at all.
*/
-static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- kmemleak_alloc(ptr, size, 1, flags);
-}
-
-static inline void kfree_hook(const void *x)
-{
- kmemleak_free(x);
-}
-
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{
flags &= gfp_allowed_mask;
@@ -965,7 +955,7 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
kmemleak_free_recursive(x, s->flags);
/*
- * Trouble is that we may no longer disable interrupts in the fast path
+ * Trouble is that we may no longer disable interrupts in the fast path
* So in order to make the debug calls that expect irqs to be
* disabled we need to disable interrupts temporarily.
*/
@@ -1227,8 +1217,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size,
/*
* Enable debugging if selected on the kernel commandline.
*/
- if (slub_debug && (!slub_debug_slabs || (name &&
- !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))))
+ if (slub_debug && (!slub_debug_slabs ||
+ !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))
flags |= slub_debug;
return flags;
@@ -1270,30 +1260,13 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
-{
- kmemleak_alloc(ptr, size, 1, flags);
-}
-
-static inline void kfree_hook(const void *x)
-{
- kmemleak_free(x);
-}
-
static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
{ return 0; }
static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- void *object)
-{
- kmemleak_alloc_recursive(object, s->object_size, 1, s->flags,
- flags & gfp_allowed_mask);
-}
+ void *object) {}
-static inline void slab_free_hook(struct kmem_cache *s, void *x)
-{
- kmemleak_free_recursive(x, s->flags);
-}
+static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
#endif /* CONFIG_SLUB_DEBUG */
@@ -2856,8 +2829,8 @@ static struct kmem_cache *kmem_cache_node;
* slab on the node for this slabcache. There are no concurrent accesses
* possible.
*
- * Note that this function only works on the kmem_cache_node
- * when allocating for the kmem_cache_node. This is used for bootstrapping
+ * Note that this function only works on the kmalloc_node_cache
+ * when allocating for the kmalloc_node_cache. This is used for bootstrapping
* memory on a fresh node that has no slab structures yet.
*/
static void early_kmem_cache_node_alloc(int node)
@@ -3299,7 +3272,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
if (page)
ptr = page_address(page);
- kmalloc_large_node_hook(ptr, size, flags);
+ kmemleak_alloc(ptr, size, 1, flags);
return ptr;
}
@@ -3363,7 +3336,7 @@ void kfree(const void *x)
page = virt_to_head_page(x);
if (unlikely(!PageSlab(page))) {
BUG_ON(!PageCompound(page));
- kfree_hook(x);
+ kmemleak_free(x);
__free_memcg_kmem_pages(page, compound_order(page));
return;
}
@@ -5010,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
* through the descendants with best-effort propagation.
*/
for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
+ struct kmem_cache *c = cache_from_memcg(s, i);
if (c)
attribute->store(c, buf, len);
}
diff --git a/mm/sparse.c b/mm/sparse.c
index 8cc7be0..4ac1d7e 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -590,32 +590,33 @@ void __init sparse_init(void)
#ifdef CONFIG_MEMORY_HOTPLUG
#ifdef CONFIG_SPARSEMEM_VMEMMAP
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+ unsigned long nr_pages)
{
/* This will make the necessary allocations eventually. */
return sparse_mem_map_populate(pnum, nid);
}
-static void __kfree_section_memmap(struct page *memmap)
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ unsigned long end = (unsigned long)(memmap + nr_pages);
vmemmap_free(start, end);
}
#ifdef CONFIG_MEMORY_HOTREMOVE
-static void free_map_bootmem(struct page *memmap)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long start = (unsigned long)memmap;
- unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION);
+ unsigned long end = (unsigned long)(memmap + nr_pages);
vmemmap_free(start, end);
}
#endif /* CONFIG_MEMORY_HOTREMOVE */
#else
-static struct page *__kmalloc_section_memmap(void)
+static struct page *__kmalloc_section_memmap(unsigned long nr_pages)
{
struct page *page, *ret;
- unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION;
+ unsigned long memmap_size = sizeof(struct page) * nr_pages;
page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size));
if (page)
@@ -633,30 +634,28 @@ got_map_ptr:
return ret;
}
-static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid)
+static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid,
+ unsigned long nr_pages)
{
- return __kmalloc_section_memmap();
+ return __kmalloc_section_memmap(nr_pages);
}
-static void __kfree_section_memmap(struct page *memmap)
+static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages)
{
if (is_vmalloc_addr(memmap))
vfree(memmap);
else
free_pages((unsigned long)memmap,
- get_order(sizeof(struct page) * PAGES_PER_SECTION));
+ get_order(sizeof(struct page) * nr_pages));
}
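For the non-vmemmap path above, a section's memmap is sized as sizeof(struct page) * nr_pages and allocated or freed at the matching page order. A quick arithmetic sketch (the 64-byte struct page and 32768 pages per section are assumed values for illustration; both vary with the configuration):

#include <stdio.h>

#define PAGE_SIZE		4096UL
#define PAGES_PER_SECTION	32768UL	/* assumed, e.g. 128 MiB sections with 4 KiB pages */
#define STRUCT_PAGE_SIZE	64UL	/* assumed sizeof(struct page) */

/* Smallest order such that (PAGE_SIZE << order) covers size, like get_order(). */
static unsigned int get_order(unsigned long size)
{
	unsigned int order = 0;

	while ((PAGE_SIZE << order) < size)
		order++;
	return order;
}

int main(void)
{
	unsigned long memmap_size = STRUCT_PAGE_SIZE * PAGES_PER_SECTION;

	printf("memmap per section: %lu KiB, allocation order %u\n",
	       memmap_size / 1024, get_order(memmap_size));
	return 0;
}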
#ifdef CONFIG_MEMORY_HOTREMOVE
-static void free_map_bootmem(struct page *memmap)
+static void free_map_bootmem(struct page *memmap, unsigned long nr_pages)
{
unsigned long maps_section_nr, removing_section_nr, i;
- unsigned long magic, nr_pages;
+ unsigned long magic;
struct page *page = virt_to_page(memmap);
- nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
- >> PAGE_SHIFT;
-
for (i = 0; i < nr_pages; i++, page++) {
magic = (unsigned long) page->lru.next;
@@ -685,7 +684,8 @@ static void free_map_bootmem(struct page *memmap)
* set. If this is <=0, then that means that the passed-in
* map was not consumed and must be freed.
*/
-int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
+int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn,
+ int nr_pages)
{
unsigned long section_nr = pfn_to_section_nr(start_pfn);
struct pglist_data *pgdat = zone->zone_pgdat;
@@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
ret = sparse_index_init(section_nr, pgdat->node_id);
if (ret < 0 && ret != -EEXIST)
return ret;
- memmap = kmalloc_section_memmap(section_nr, pgdat->node_id);
+ memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages);
if (!memmap)
return -ENOMEM;
usemap = __kmalloc_section_usemap();
if (!usemap) {
- __kfree_section_memmap(memmap);
+ __kfree_section_memmap(memmap, nr_pages);
return -ENOMEM;
}
@@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn)
goto out;
}
- memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION);
+ memset(memmap, 0, sizeof(struct page) * nr_pages);
ms->section_mem_map |= SECTION_MARKED_PRESENT;
@@ -729,7 +729,7 @@ out:
pgdat_resize_unlock(pgdat, &flags);
if (ret <= 0) {
kfree(usemap);
- __kfree_section_memmap(memmap);
+ __kfree_section_memmap(memmap, nr_pages);
}
return ret;
}
@@ -759,6 +759,7 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
static void free_section_usemap(struct page *memmap, unsigned long *usemap)
{
struct page *usemap_page;
+ unsigned long nr_pages;
if (!usemap)
return;
@@ -770,7 +771,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
if (PageSlab(usemap_page) || PageCompound(usemap_page)) {
kfree(usemap);
if (memmap)
- __kfree_section_memmap(memmap);
+ __kfree_section_memmap(memmap, PAGES_PER_SECTION);
return;
}
@@ -779,8 +780,12 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap)
* on the section which has pgdat at boot time. Just keep it as is now.
*/
- if (memmap)
- free_map_bootmem(memmap);
+ if (memmap) {
+ nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page))
+ >> PAGE_SHIFT;
+
+ free_map_bootmem(memmap, nr_pages);
+ }
}
void sparse_remove_one_section(struct zone *zone, struct mem_section *ms)
diff --git a/mm/swap.c b/mm/swap.c
index 84b26aa..759c3ca 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -82,6 +82,19 @@ static void __put_compound_page(struct page *page)
static void put_compound_page(struct page *page)
{
+ /*
+ * hugetlbfs pages cannot be split from under us. If this is a
+ * hugetlbfs page, check refcount on head page and release the page if
+ * the refcount becomes zero.
+ */
+ if (PageHuge(page)) {
+ page = compound_head(page);
+ if (put_page_testzero(page))
+ __put_compound_page(page);
+
+ return;
+ }
+
if (unlikely(PageTail(page))) {
/* __split_huge_page_refcount can run under us */
struct page *page_head = compound_trans_head(page);
@@ -98,31 +111,14 @@ static void put_compound_page(struct page *page)
* still hot on arches that do not support
* this_cpu_cmpxchg_double().
*/
- if (PageSlab(page_head) || PageHeadHuge(page_head)) {
- if (likely(PageTail(page))) {
- /*
- * __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON(!PageHead(page_head));
- atomic_dec(&page->_mapcount);
+ if (PageSlab(page_head)) {
+ if (PageTail(page)) {
if (put_page_testzero(page_head))
VM_BUG_ON(1);
- if (put_page_testzero(page_head))
- __put_compound_page(page_head);
- return;
+
+ atomic_dec(&page->_mapcount);
+ goto skip_lock_tail;
} else
- /*
- * __split_huge_page_refcount
- * run before us, "page" was a
- * THP tail. The split
- * page_head has been freed
- * and reallocated as slab or
- * hugetlbfs page of smaller
- * order (only possible if
- * reallocated as slab on
- * x86).
- */
goto skip_lock;
}
/*
@@ -136,27 +132,8 @@ static void put_compound_page(struct page *page)
/* __split_huge_page_refcount run before us */
compound_unlock_irqrestore(page_head, flags);
skip_lock:
- if (put_page_testzero(page_head)) {
- /*
- * The head page may have been
- * freed and reallocated as a
- * compound page of smaller
- * order and then freed again.
- * All we know is that it
- * cannot have become: a THP
- * page, a compound page of
- * higher order, a tail page.
- * That is because we still
- * hold the refcount of the
- * split THP tail and
- * page_head was the THP head
- * before the split.
- */
- if (PageHead(page_head))
- __put_compound_page(page_head);
- else
- __put_single_page(page_head);
- }
+ if (put_page_testzero(page_head))
+ __put_single_page(page_head);
out_put_single:
if (put_page_testzero(page))
__put_single_page(page);
@@ -178,6 +155,7 @@ out_put_single:
VM_BUG_ON(atomic_read(&page->_count) != 0);
compound_unlock_irqrestore(page_head, flags);
+skip_lock_tail:
if (put_page_testzero(page_head)) {
if (PageHead(page_head))
__put_compound_page(page_head);
@@ -220,52 +198,51 @@ bool __get_page_tail(struct page *page)
* proper PT lock that already serializes against
* split_huge_page().
*/
- unsigned long flags;
bool got = false;
- struct page *page_head = compound_trans_head(page);
+ struct page *page_head;
+
+ /*
+ * If this is a hugetlbfs page it cannot be split under us. Simply
+ * increment refcount for the head page.
+ */
+ if (PageHuge(page)) {
+ page_head = compound_head(page);
+ atomic_inc(&page_head->_count);
+ got = true;
+ } else {
+ unsigned long flags;
- if (likely(page != page_head && get_page_unless_zero(page_head))) {
- /* Ref to put_compound_page() comment. */
- if (PageSlab(page_head) || PageHeadHuge(page_head)) {
+ page_head = compound_trans_head(page);
+ if (likely(page != page_head &&
+ get_page_unless_zero(page_head))) {
+
+ /* Ref to put_compound_page() comment. */
+ if (PageSlab(page_head)) {
+ if (likely(PageTail(page))) {
+ __get_page_tail_foll(page, false);
+ return true;
+ } else {
+ put_page(page_head);
+ return false;
+ }
+ }
+
+ /*
+ * page_head wasn't a dangling pointer but it
+ * may not be a head page anymore by the time
+ * we obtain the lock. That is ok as long as it
+ * can't be freed from under us.
+ */
+ flags = compound_lock_irqsave(page_head);
+ /* here __split_huge_page_refcount won't run anymore */
if (likely(PageTail(page))) {
- /*
- * This is a hugetlbfs page or a slab
- * page. __split_huge_page_refcount
- * cannot race here.
- */
- VM_BUG_ON(!PageHead(page_head));
__get_page_tail_foll(page, false);
- return true;
- } else {
- /*
- * __split_huge_page_refcount run
- * before us, "page" was a THP
- * tail. The split page_head has been
- * freed and reallocated as slab or
- * hugetlbfs page of smaller order
- * (only possible if reallocated as
- * slab on x86).
- */
- put_page(page_head);
- return false;
+ got = true;
}
+ compound_unlock_irqrestore(page_head, flags);
+ if (unlikely(!got))
+ put_page(page_head);
}
-
- /*
- * page_head wasn't a dangling pointer but it
- * may not be a head page anymore by the time
- * we obtain the lock. That is ok as long as it
- * can't be freed from under us.
- */
- flags = compound_lock_irqsave(page_head);
- /* here __split_huge_page_refcount won't run anymore */
- if (likely(PageTail(page))) {
- __get_page_tail_foll(page, false);
- got = true;
- }
- compound_unlock_irqrestore(page_head, flags);
- if (unlikely(!got))
- put_page(page_head);
}
return got;
}
@@ -957,8 +934,7 @@ void __init swap_setup(void)
#ifdef CONFIG_SWAP
int i;
- if (bdi_init(swapper_spaces[0].backing_dev_info))
- panic("Failed to init swap bdi");
+ bdi_init(swapper_spaces[0].backing_dev_info);
for (i = 0; i < MAX_SWAPFILES; i++) {
spin_lock_init(&swapper_spaces[i].tree_lock);
INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 612a7c9..de7c904 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -707,7 +707,7 @@ noswap:
return (swp_entry_t) {0};
}
-/* The only caller of this function is now suspend routine */
+/* The only caller of this function is now suspend routine */
swp_entry_t get_swap_page_of_type(int type)
{
struct swap_info_struct *si;
@@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
}
/*
- * Caller has made sure that the swap device corresponding to entry
+ * Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
*/
void swap_free(swp_entry_t entry)
@@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page)
* original page might be freed under memory pressure, then
* later read back in from swap, now with the wrong data.
*
- * Hibernation suspends storage while it is writing the image
+ * Hibernation suspends storage while it is writing the image
* to disk so check that here.
*/
if (pm_suspended_storage())
@@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
* some architectures (e.g. x86_32 with PAE) we might catch a glimpse
* of unmatched parts which look like swp_pte, so unuse_pte must
* recheck under pte lock. Scanning without pte lock lets it be
- * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
+ * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE.
*/
pte = pte_offset_map(pmd, addr);
do {
@@ -1924,17 +1924,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
p->cluster_info = NULL;
p->flags = 0;
frontswap_map = frontswap_map_get(p);
+ frontswap_map_set(p, NULL);
spin_unlock(&p->lock);
spin_unlock(&swap_lock);
frontswap_invalidate_area(type);
- frontswap_map_set(p, NULL);
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
vfree(swap_map);
vfree(cluster_info);
vfree(frontswap_map);
- /* Destroy swap account information */
+ /* Destroy swap account information */
swap_cgroup_swapoff(type);
inode = mapping->host;
@@ -2786,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
/*
* We are fortunate that although vmalloc_to_page uses pte_offset_map,
- * no architecture is using highmem pages for kernel page tables: so it
- * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps.
+ * no architecture is using highmem pages for kernel pagetables: so it
+ * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps.
*/
head = vmalloc_to_page(si->swap_map + offset);
offset &= ~PAGE_MASK;
diff --git a/mm/util.c b/mm/util.c
index f7bc209..eaf63fc2 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -7,9 +7,6 @@
#include <linux/security.h>
#include <linux/swap.h>
#include <linux/swapops.h>
-#include <linux/mman.h>
-#include <linux/hugetlb.h>
-
#include <asm/uaccess.h>
#include "internal.h"
@@ -401,16 +398,6 @@ struct address_space *page_mapping(struct page *page)
return mapping;
}
-/*
- * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used
- */
-unsigned long vm_commit_limit(void)
-{
- return ((totalram_pages - hugetlb_total_pages())
- * sysctl_overcommit_ratio / 100) + total_swap_pages;
-}
-
-
/* Tracepoints definitions. */
EXPORT_TRACEPOINT_SYMBOL(kmalloc);
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 0fdf968..1074543 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -359,12 +359,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size,
if (unlikely(!va))
return ERR_PTR(-ENOMEM);
- /*
- * Only scan the relevant parts containing pointers to other objects
- * to avoid false negatives.
- */
- kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK);
-
retry:
spin_lock(&vmap_area_lock);
/*
@@ -1552,7 +1546,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
gfp_t gfp_mask, pgprot_t prot,
int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
- pgprot_t prot, int node)
+ pgprot_t prot, int node, const void *caller)
{
const int order = 0;
struct page **pages;
@@ -1566,12 +1560,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
- PAGE_KERNEL, node, area->caller);
+ PAGE_KERNEL, node, caller);
area->flags |= VM_VPAGES;
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
area->pages = pages;
+ area->caller = caller;
if (!area->pages) {
remove_vm_area(area->addr);
kfree(area);
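The recursion noted above only triggers once the pages[] metadata array itself no longer fits in a single page and has to come from __vmalloc_node() instead of kmalloc_node(). A quick back-of-the-envelope check, assuming 4 KiB pages and 8-byte pointers (both assumptions, not taken from this diff):

#include <stdio.h>

int main(void)
{
    unsigned long page_size = 4096;             /* assumed PAGE_SIZE */
    unsigned long ptr_size = sizeof(void *);    /* 8 on a 64-bit build */
    unsigned long threshold = page_size / ptr_size;

    printf("pages[] outgrows one page beyond %lu mapped pages (%lu bytes of mapping)\n",
           threshold, threshold * page_size);   /* 512 pages, i.e. a 2 MiB area */
    return 0;
}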
@@ -1582,7 +1577,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
struct page *page;
gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
- if (node == NUMA_NO_NODE)
+ if (node < 0)
page = alloc_page(tmp_mask);
else
page = alloc_pages_node(node, tmp_mask, order);
@@ -1639,9 +1634,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!area)
goto fail;
- addr = __vmalloc_area_node(area, gfp_mask, prot, node);
+ addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
if (!addr)
- return NULL;
+ goto fail;
/*
* In this function, newly allocated vm_struct has VM_UNINITIALIZED
@@ -1651,11 +1646,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
clear_vm_uninitialized_flag(area);
/*
- * A ref_count = 2 is needed because vm_struct allocated in
- * __get_vm_area_node() contains a reference to the virtual address of
- * the vmalloc'ed block.
+ * A ref_count = 3 is needed because the vm_struct and vmap_area
+ * structures allocated in the __get_vm_area_node() function contain
+ * references to the virtual address of the vmalloc'ed block.
*/
- kmemleak_alloc(addr, real_size, 2, gfp_mask);
+ kmemleak_alloc(addr, real_size, 3, gfp_mask);
return addr;
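The ref_count passed to kmemleak_alloc() above is the minimum number of references kmemleak must find during a scan before it stops flagging the block; using 3 discounts the pointers that vm_struct and vmap_area always hold. The toy model below is not the kmemleak API, only an illustration of that threshold idea with hypothetical names.

#include <stdio.h>

/* Count how many scanned "root" pointers reference the block. */
static int count_refs(const void *block, const void **roots, int n)
{
    int found = 0;

    for (int i = 0; i < n; i++)
        if (roots[i] == block)
            found++;
    return found;
}

int main(void)
{
    int block;                                           /* stands in for the vmalloc'ed area */
    const void *internal_only[] = { &block, &block };    /* vm_struct + vmap_area references */
    const void *with_caller[] = { &block, &block, &block };
    int min_count = 3;                                   /* the value in the + line above */

    printf("caller keeps its pointer: %d refs -> %s\n",
           count_refs(&block, with_caller, 3),
           count_refs(&block, with_caller, 3) < min_count ? "reported" : "not a leak");
    printf("caller drops its pointer: %d refs -> %s\n",
           count_refs(&block, internal_only, 2),
           count_refs(&block, internal_only, 2) < min_count ? "reported" : "not a leak");
    return 0;
}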
@@ -2568,11 +2563,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
- if (v->flags & VM_UNINITIALIZED)
- return;
-
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr++)
@@ -2589,15 +2579,23 @@ static int s_show(struct seq_file *m, void *p)
struct vmap_area *va = p;
struct vm_struct *v;
- /*
- * s_show can encounter a race with remove_vm_area(): !VM_VM_AREA means
- * the vmap area is being torn down or is a vm_map_ram allocation.
- */
- if (!(va->flags & VM_VM_AREA))
+ if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
return 0;
+ if (!(va->flags & VM_VM_AREA)) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ return 0;
+ }
+
v = va->vm;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
+ if (v->flags & VM_UNINITIALIZED)
+ return 0;
+
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7249614..9bb3145 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -812,7 +812,6 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_NUMA_BALANCING
"numa_pte_updates",
- "numa_huge_pte_updates",
"numa_hint_faults",
"numa_hint_faults_local",
"numa_pages_migrated",
@@ -1230,20 +1229,6 @@ static void start_cpu_timer(int cpu)
schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
}
-static void vmstat_cpu_dead(int node)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (cpu_to_node(cpu) == node)
- goto end;
-
- node_clear_state(node, N_CPU);
-end:
- put_online_cpus();
-}
-
/*
* Use the cpu notifier to ensure that the thresholds are recalculated
* when necessary.
@@ -1273,7 +1258,6 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
case CPU_DEAD:
case CPU_DEAD_FROZEN:
refresh_zone_stat_thresholds();
- vmstat_cpu_dead(cpu_to_node(cpu));
break;
default:
break;
@@ -1292,12 +1276,8 @@ static int __init setup_vmstat(void)
register_cpu_notifier(&vmstat_notifier);
- get_online_cpus();
- for_each_online_cpu(cpu) {
+ for_each_online_cpu(cpu)
start_cpu_timer(cpu);
- node_set_state(cpu_to_node(cpu), N_CPU);
- }
- put_online_cpus();
#endif
#ifdef CONFIG_PROC_FS
proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
diff --git a/mm/zswap.c b/mm/zswap.c
index 5a63f78..d93510c 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -217,7 +217,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
if (!entry)
return NULL;
entry->refcount = 1;
- RB_CLEAR_NODE(&entry->rbnode);
return entry;
}
@@ -226,6 +225,19 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
kmem_cache_free(zswap_entry_cache, entry);
}
+/* caller must hold the tree lock */
+static void zswap_entry_get(struct zswap_entry *entry)
+{
+ entry->refcount++;
+}
+
+/* caller must hold the tree lock */
+static int zswap_entry_put(struct zswap_entry *entry)
+{
+ entry->refcount--;
+ return entry->refcount;
+}
+
/*********************************
* rbtree functions
**********************************/
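The two helpers added above are a bare reference count whose callers serialize on the tree spinlock; zswap_entry_put() returns the remaining count so the caller can decide whether to free. A minimal userspace sketch (pthreads, hypothetical names) of pinning such an entry across a lock drop and releasing it later:

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
    int refcount;
};

static pthread_mutex_t tree_lock = PTHREAD_MUTEX_INITIALIZER;

/* Both helpers assume tree_lock is held, mirroring the comments above. */
static void entry_get(struct entry *e) { e->refcount++; }
static int entry_put(struct entry *e)  { return --e->refcount; }

int main(void)
{
    struct entry *e = calloc(1, sizeof(*e));

    e->refcount = 1;                 /* the reference from creating the entry */

    pthread_mutex_lock(&tree_lock);
    entry_get(e);                    /* pin it before dropping the lock */
    pthread_mutex_unlock(&tree_lock);

    /* ... work on e here without holding tree_lock ... */

    pthread_mutex_lock(&tree_lock);
    entry_put(e);                    /* drop the pin: back to the creation ref */
    if (entry_put(e) == 0) {         /* drop the creation reference too */
        printf("last reference dropped, freeing\n");
        free(e);
    }
    pthread_mutex_unlock(&tree_lock);
    return 0;
}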
@@ -273,61 +285,6 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
return 0;
}
-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
-{
- if (!RB_EMPTY_NODE(&entry->rbnode)) {
- rb_erase(&entry->rbnode, root);
- RB_CLEAR_NODE(&entry->rbnode);
- }
-}
-
-/*
- * Carries out the common pattern of freeing an entry's zsmalloc allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
-static void zswap_free_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- zbud_free(tree->pool, entry->handle);
- zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
-}
-
-/* caller must hold the tree lock */
-static void zswap_entry_get(struct zswap_entry *entry)
-{
- entry->refcount++;
-}
-
-/* caller must hold the tree lock
-* remove from the tree and free it, if nobody references the entry
-*/
-static void zswap_entry_put(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- int refcount = --entry->refcount;
-
- BUG_ON(refcount < 0);
- if (refcount == 0) {
- zswap_rb_erase(&tree->rbroot, entry);
- zswap_free_entry(tree, entry);
- }
-}
-
-/* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
- pgoff_t offset)
-{
- struct zswap_entry *entry = NULL;
-
- entry = zswap_rb_search(root, offset);
- if (entry)
- zswap_entry_get(entry);
-
- return entry;
-}
-
/*********************************
* per-cpu code
**********************************/
@@ -411,6 +368,18 @@ static bool zswap_is_full(void)
zswap_pool_pages);
}
+/*
+ * Carries out the common pattern of freeing an entry's zsmalloc allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
+{
+ zbud_free(tree->pool, entry->handle);
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ zswap_pool_pages = zbud_get_pool_size(tree->pool);
+}
+
/*********************************
* writeback code
**********************************/
@@ -418,7 +387,7 @@ static bool zswap_is_full(void)
enum zswap_get_swap_ret {
ZSWAP_SWAPCACHE_NEW,
ZSWAP_SWAPCACHE_EXIST,
- ZSWAP_SWAPCACHE_FAIL,
+ ZSWAP_SWAPCACHE_NOMEM
};
/*
@@ -432,10 +401,9 @@ enum zswap_get_swap_ret {
* added to the swap cache, and returned in retpage.
*
* If success, the swap cache page is returned in retpage
- * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
- * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
- * the new page is added to swapcache and locked
- * Returns ZSWAP_SWAPCACHE_FAIL on error
+ * Returns 0 if page was already in the swap cache, page is not locked
+ * Returns 1 if the new page needs to be populated, page is locked
+ * Returns <0 on error
*/
static int zswap_get_swap_cache_page(swp_entry_t entry,
struct page **retpage)
@@ -507,7 +475,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
if (new_page)
page_cache_release(new_page);
if (!found_page)
- return ZSWAP_SWAPCACHE_FAIL;
+ return ZSWAP_SWAPCACHE_NOMEM;
*retpage = found_page;
return ZSWAP_SWAPCACHE_EXIST;
}
@@ -534,7 +502,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
struct page *page;
u8 *src, *dst;
unsigned int dlen;
- int ret;
+ int ret, refcount;
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
};
@@ -549,22 +517,23 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
/* find and ref zswap entry */
spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_rb_search(&tree->rbroot, offset);
if (!entry) {
/* entry was invalidated */
spin_unlock(&tree->lock);
return 0;
}
+ zswap_entry_get(entry);
spin_unlock(&tree->lock);
BUG_ON(offset != entry->offset);
/* try to allocate swap cache page */
switch (zswap_get_swap_cache_page(swpentry, &page)) {
- case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+ case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
ret = -ENOMEM;
goto fail;
- case ZSWAP_SWAPCACHE_EXIST:
+ case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
/* page is already in the swap cache, ignore for now */
page_cache_release(page);
ret = -EEXIST;
@@ -587,44 +556,43 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
SetPageUptodate(page);
}
- /* move it to the tail of the inactive list after end_writeback */
- SetPageReclaim(page);
-
/* start writeback */
__swap_writepage(page, &wbc, end_swap_bio_write);
page_cache_release(page);
zswap_written_back_pages++;
spin_lock(&tree->lock);
+
/* drop local reference */
- zswap_entry_put(tree, entry);
+ zswap_entry_put(entry);
+ /* drop the initial reference from entry creation */
+ refcount = zswap_entry_put(entry);
/*
- * There are two possible situations for entry here:
- * (1) refcount is 1 (normal case), entry is valid and on the tree
- * (2) refcount is 0, entry is freed and not on the tree
- * because invalidate happened during writeback
- * search the tree and free the entry if it is found
- */
- if (entry == zswap_rb_search(&tree->rbroot, offset))
- zswap_entry_put(tree, entry);
+ * There are three possible values for refcount here:
+ * (1) refcount is 1, load is in progress, unlink from rbtree,
+ * load will free
+ * (2) refcount is 0, (normal case) entry is valid,
+ * remove from rbtree and free entry
+ * (3) refcount is -1, invalidate happened during writeback,
+ * free entry
+ */
+ if (refcount >= 0) {
+ /* no invalidate yet, remove from rbtree */
+ rb_erase(&entry->rbnode, &tree->rbroot);
+ }
spin_unlock(&tree->lock);
+ if (refcount <= 0) {
+ /* free the entry */
+ zswap_free_entry(tree, entry);
+ return 0;
+ }
+ return -EAGAIN;
- goto end;
-
- /*
- * if we get here due to ZSWAP_SWAPCACHE_EXIST
- * a load may be happening concurrently;
- * it is safe and okay to not free the entry,
- * and if we free the entry in the following put
- * it is still okay to return !0
- */
fail:
spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
+ zswap_entry_put(entry);
spin_unlock(&tree->lock);
-
-end:
return ret;
}
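The refcount handling in the writeback path above reduces to a small decision table: after the local and the creation references are dropped, the remaining count determines who unlinks the entry and who frees it. The snippet below only enumerates those outcomes with assumed values; it is illustrative, not the kernel logic itself.

#include <stdio.h>

static const char *writeback_outcome(int refcount_after_puts)
{
    if (refcount_after_puts >= 1)
        return "load in progress: unlink from rbtree here, the load path frees";
    if (refcount_after_puts == 0)
        return "normal case: unlink from rbtree and free here";
    return "invalidate raced with writeback: already unlinked, free here";
}

int main(void)
{
    for (int r = 1; r >= -1; r--)
        printf("refcount %d -> %s\n", r, writeback_outcome(r));
    return 0;
}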
@@ -708,8 +676,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
if (ret == -EEXIST) {
zswap_duplicate_entry++;
/* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, dupentry);
- zswap_entry_put(tree, dupentry);
+ rb_erase(&dupentry->rbnode, &tree->rbroot);
+ if (!zswap_entry_put(dupentry)) {
+ /* free */
+ zswap_free_entry(tree, dupentry);
+ }
}
} while (ret == -EEXIST);
spin_unlock(&tree->lock);
@@ -738,16 +709,17 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
struct zswap_entry *entry;
u8 *src, *dst;
unsigned int dlen;
- int ret;
+ int refcount, ret;
/* find */
spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_rb_search(&tree->rbroot, offset);
if (!entry) {
/* entry was written back */
spin_unlock(&tree->lock);
return -1;
}
+ zswap_entry_get(entry);
spin_unlock(&tree->lock);
/* decompress */
@@ -762,9 +734,22 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
BUG_ON(ret);
spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
+ refcount = zswap_entry_put(entry);
+ if (likely(refcount)) {
+ spin_unlock(&tree->lock);
+ return 0;
+ }
spin_unlock(&tree->lock);
+ /*
+ * We don't have to unlink from the rbtree because
+ * zswap_writeback_entry() or zswap_frontswap_invalidate_page()
+ * has already done this for us if we are the last reference.
+ */
+ /* free */
+
+ zswap_free_entry(tree, entry);
+
return 0;
}
@@ -773,6 +758,7 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
+ int refcount;
/* find */
spin_lock(&tree->lock);
@@ -784,12 +770,20 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
}
/* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, entry);
+ rb_erase(&entry->rbnode, &tree->rbroot);
/* drop the initial reference from entry creation */
- zswap_entry_put(tree, entry);
+ refcount = zswap_entry_put(entry);
spin_unlock(&tree->lock);
+
+ if (refcount) {
+ /* writeback in progress, writeback will free */
+ return;
+ }
+
+ /* free */
+ zswap_free_entry(tree, entry);
}
/* frees all zswap entries for the given swap type */
@@ -803,8 +797,11 @@ static void zswap_frontswap_invalidate_area(unsigned type)
/* walk the tree and free everything */
spin_lock(&tree->lock);
- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(tree, entry);
+ rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) {
+ zbud_free(tree->pool, entry->handle);
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ }
tree->rbroot = RB_ROOT;
spin_unlock(&tree->lock);
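rbtree_postorder_for_each_entry_safe() visits both children of a node before the node itself, which is what makes freeing every entry during the walk above safe. The toy program below demonstrates the same post-order teardown on a hand-rolled binary tree; it is a standalone illustration, not the kernel rbtree implementation.

#include <stdio.h>
#include <stdlib.h>

struct node {
    int offset;
    struct node *left, *right;
};

static void free_postorder(struct node *n)
{
    if (!n)
        return;
    free_postorder(n->left);           /* children first ... */
    free_postorder(n->right);
    printf("freeing entry %d\n", n->offset);
    free(n);                           /* ... the node itself last */
}

int main(void)
{
    struct node *root = calloc(1, sizeof(*root));

    root->offset = 2;
    root->left = calloc(1, sizeof(*root));
    root->left->offset = 1;
    root->right = calloc(1, sizeof(*root));
    root->right->offset = 3;

    free_postorder(root);              /* prints 1, 3, then 2 */
    return 0;
}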