summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig9
-rw-r--r--mm/backing-dev.c4
-rw-r--r--mm/filemap.c13
-rw-r--r--mm/huge_memory.c15
-rw-r--r--mm/hugetlb.c13
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memory.c14
-rw-r--r--mm/mempolicy.c14
-rw-r--r--mm/migrate.c2
-rw-r--r--mm/mmap.c96
-rw-r--r--mm/mprotect.c6
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page_alloc.c2
-rw-r--r--mm/pgtable-generic.c8
-rw-r--r--mm/slab.c12
-rw-r--r--mm/slab.h1
-rw-r--r--mm/slab_common.c1
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c38
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/vmstat.c70
21 files changed, 196 insertions, 132 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 97a4e06..03cbfa0 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
bool
config DEFERRED_STRUCT_PAGE_INIT
- bool "Defer initialisation of struct pages to kswapd"
+ bool "Defer initialisation of struct pages to kthreads"
default n
depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
depends on MEMORY_HOTPLUG
@@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
a subset of memmap at boot and then initialise the rest in parallel
- when kswapd starts. This has a potential performance impact on
- processes running early in the lifetime of the systemm until kswapd
- finishes the initialisation.
+ by starting one-off "pgdatinitX" kernel thread for each node X. This
+ has a potential performance impact on processes running early in the
+ lifetime of the system until these kthreads finish the
+ initialisation.
config IDLE_PAGE_TRACKING
bool "Enable idle page tracking"
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index cc5d29d..c554d17 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -328,7 +328,7 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
return 0;
out_destroy_stat:
- while (--i)
+ while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
out_put_cong:
@@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
* here rather than calling cond_resched().
*/
if (current->flags & PF_WQ_WORKER)
- schedule_timeout(1);
+ schedule_timeout_uninterruptible(1);
else
cond_resched();
diff --git a/mm/filemap.c b/mm/filemap.c
index bc94386..3461d97 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -446,7 +446,8 @@ int filemap_write_and_wait(struct address_space *mapping)
{
int err = 0;
- if (mapping->nrpages) {
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
err = filemap_fdatawrite(mapping);
/*
* Even if the above returned error, the pages may be
@@ -482,13 +483,8 @@ int filemap_write_and_wait_range(struct address_space *mapping,
{
int err = 0;
- if (dax_mapping(mapping) && mapping->nrexceptional) {
- err = dax_writeback_mapping_range(mapping, lstart, lend);
- if (err)
- return err;
- }
-
- if (mapping->nrpages) {
+ if ((!dax_mapping(mapping) && mapping->nrpages) ||
+ (dax_mapping(mapping) && mapping->nrexceptional)) {
err = __filemap_fdatawrite_range(mapping, lstart, lend,
WB_SYNC_ALL);
/* See comment of filemap_write_and_wait() */
@@ -1890,6 +1886,7 @@ EXPORT_SYMBOL(generic_file_read_iter);
* page_cache_read - adds requested page to the page cache if not already there
* @file: file to read
* @offset: page index
+ * @gfp_mask: memory allocation flags
*
* This adds the requested page to the page cache if it isn't already there,
* and schedules an I/O to read in its contents from disk.
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 36c0701..e10a4fe 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1700,7 +1700,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+ if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
+ vma_is_anonymous(vma)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -2835,6 +2836,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t _pmd;
bool young, write, dirty;
+ unsigned long addr;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
@@ -2860,10 +2862,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
young = pmd_young(*pmd);
dirty = pmd_dirty(*pmd);
+ pmdp_huge_split_prepare(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
pte_t entry, *pte;
/*
* Note that NUMA hinting access restrictions are not
@@ -2884,9 +2887,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
if (dirty)
SetPageDirty(page + i);
- pte = pte_offset_map(&_pmd, haddr);
+ pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
+ set_pte_at(mm, addr, pte, entry);
atomic_inc(&page[i]._mapcount);
pte_unmap(pte);
}
@@ -2936,7 +2939,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pmd_populate(mm, pmd, pgtable);
if (freeze) {
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
page_remove_rmap(page + i, false);
put_page(page + i);
}
@@ -3482,7 +3485,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
spin_lock_irqsave(&pgdata->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &list) {
+ list_for_each_safe(pos, next, &pgdata->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
if (get_page_unless_zero(page)) {
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 12908dc..01f2b48 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
((node = hstate_next_node_to_free(hs, mask)) || 1); \
nr_nodes--)
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
static void destroy_compound_gigantic_page(struct page *page,
unsigned int order)
{
@@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page)
set_page_private(page, 0);
page->mapping = NULL;
- BUG_ON(page_count(page));
- BUG_ON(page_mapcount(page));
+ VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(page_mapcount(page), page);
restore_reserve = PagePrivate(page);
ClearPagePrivate(page);
@@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
set_page_count(p, 0);
set_compound_head(p, page);
}
+ atomic_set(compound_mapcount_ptr(page), -1);
}
/*
@@ -2629,8 +2630,10 @@ static int __init hugetlb_init(void)
hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
}
default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
- if (default_hstate_max_huge_pages)
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ if (default_hstate_max_huge_pages) {
+ if (!default_hstate.max_huge_pages)
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ }
hugetlb_init_hstates();
gather_bootmem_prealloc();
diff --git a/mm/memblock.c b/mm/memblock.c
index d2ed81e..dd79899 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
* Remaining API functions
*/
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
{
return memblock.memory.total_size;
}
diff --git a/mm/memory.c b/mm/memory.c
index 635451a..8132787 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3404,8 +3404,18 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (unlikely(pmd_none(*pmd)) &&
unlikely(__pte_alloc(mm, vma, pmd, address)))
return VM_FAULT_OOM;
- /* if an huge pmd materialized from under us just retry later */
- if (unlikely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd)))
+ /*
+ * If a huge pmd materialized under us just retry later. Use
+ * pmd_trans_unstable() instead of pmd_trans_huge() to ensure the pmd
+ * didn't become pmd_trans_huge under us and then back to pmd_none, as
+ * a result of MADV_DONTNEED running immediately after a huge pmd fault
+ * in a different thread of this mm, in turn leading to a misleading
+ * pmd_trans_huge() retval. All we have to ensure is that it is a
+ * regular pmd that we can walk with pte_offset_map() and we can do that
+ * through an atomic read in C, which is what pmd_trans_unstable()
+ * provides.
+ */
+ if (unlikely(pmd_trans_unstable(pmd) || pmd_devmap(*pmd)))
return 0;
/*
* A regular pmd is established and it can't morph into a huge pmd
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 27d1354..4c4187c 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -548,8 +548,7 @@ retry:
goto retry;
}
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, qp->pagelist, flags);
+ migrate_page_add(page, qp->pagelist, flags);
}
pte_unmap_unlock(pte - 1, ptl);
cond_resched();
@@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
unsigned long endvma = vma->vm_end;
unsigned long flags = qp->flags;
- if (vma->vm_flags & VM_PFNMAP)
+ if (!vma_migratable(vma))
return 1;
if (endvma > end)
@@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
if (flags & MPOL_MF_LAZY) {
/* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma_migratable(vma) &&
- vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
change_prot_numa(vma, start, endvma);
return 1;
}
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma)))
- /* queue pages from current vma */
+ /* queue pages from current vma */
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
return 0;
return 1;
}
diff --git a/mm/migrate.c b/mm/migrate.c
index b1034f9..3ad0fea 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1582,7 +1582,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
(GFP_HIGHUSER_MOVABLE |
__GFP_THISNODE | __GFP_NOMEMALLOC |
__GFP_NORETRY | __GFP_NOWARN) &
- ~(__GFP_IO | __GFP_FS), 0);
+ ~__GFP_RECLAIM, 0);
return newpage;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index cfc0cdc..76d1ec2 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -390,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
}
#ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
{
+ struct rb_root *root = &mm->mm_rb;
int i = 0, j, bug = 0;
struct rb_node *nd, *pn = NULL;
unsigned long prev = 0, pend = 0;
@@ -414,12 +415,14 @@ static int browse_rb(struct rb_root *root)
vma->vm_start, vma->vm_end);
bug = 1;
}
+ spin_lock(&mm->page_table_lock);
if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
pr_emerg("free gap %lx, correct %lx\n",
vma->rb_subtree_gap,
vma_compute_subtree_gap(vma));
bug = 1;
}
+ spin_unlock(&mm->page_table_lock);
i++;
pn = nd;
prev = vma->vm_start;
@@ -456,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
struct vm_area_struct *vma = mm->mmap;
while (vma) {
+ struct anon_vma *anon_vma = vma->anon_vma;
struct anon_vma_chain *avc;
- vma_lock_anon_vma(vma);
- list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
- anon_vma_interval_tree_verify(avc);
- vma_unlock_anon_vma(vma);
+ if (anon_vma) {
+ anon_vma_lock_read(anon_vma);
+ list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+ anon_vma_interval_tree_verify(avc);
+ anon_vma_unlock_read(anon_vma);
+ }
+
highest_address = vma->vm_end;
vma = vma->vm_next;
i++;
@@ -475,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
mm->highest_vm_end, highest_address);
bug = 1;
}
- i = browse_rb(&mm->mm_rb);
+ i = browse_rb(mm);
if (i != mm->map_count) {
if (i != -1)
pr_emerg("map_count %d rb %d\n", mm->map_count, i);
@@ -2142,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
struct mm_struct *mm = vma->vm_mm;
- int error;
+ int error = 0;
if (!(vma->vm_flags & VM_GROWSUP))
return -EFAULT;
- /*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
- */
+ /* Guard against wrapping around to address 0. */
+ if (address < PAGE_ALIGN(address+4))
+ address = PAGE_ALIGN(address+4);
+ else
+ return -ENOMEM;
+
+ /* We must make sure the anon_vma is allocated. */
if (unlikely(anon_vma_prepare(vma)))
return -ENOMEM;
- vma_lock_anon_vma(vma);
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
- * Also guard against wrapping around to address 0.
*/
- if (address < PAGE_ALIGN(address+4))
- address = PAGE_ALIGN(address+4);
- else {
- vma_unlock_anon_vma(vma);
- return -ENOMEM;
- }
- error = 0;
+ anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address > vma->vm_end) {
@@ -2185,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
- * vma_lock_anon_vma() doesn't help here, as
+ * anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
@@ -2208,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
}
}
}
- vma_unlock_anon_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
@@ -2224,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
struct mm_struct *mm = vma->vm_mm;
int error;
- /*
- * We must make sure the anon_vma is allocated
- * so that the anon_vma locking is not a noop.
- */
- if (unlikely(anon_vma_prepare(vma)))
- return -ENOMEM;
-
address &= PAGE_MASK;
error = security_mmap_addr(address);
if (error)
return error;
- vma_lock_anon_vma(vma);
+ /* We must make sure the anon_vma is allocated. */
+ if (unlikely(anon_vma_prepare(vma)))
+ return -ENOMEM;
/*
* vma->vm_start/vm_end cannot change under us because the caller
* is required to hold the mmap_sem in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
+ anon_vma_lock_write(vma->anon_vma);
/* Somebody else might have raced and expanded it already */
if (address < vma->vm_start) {
@@ -2260,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
* updates, but we only hold a shared mmap_sem
* lock here, so we need to protect against
* concurrent vma expansions.
- * vma_lock_anon_vma() doesn't help here, as
+ * anon_vma_lock_write() doesn't help here, as
* we don't guarantee that all growable vmas
* in a mm share the same root anon vma.
* So, we reuse mm->page_table_lock to guard
@@ -2281,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
}
}
}
- vma_unlock_anon_vma(vma);
+ anon_vma_unlock_write(vma->anon_vma);
khugepaged_enter_vma_merge(vma, vma->vm_flags);
validate_mm(mm);
return error;
@@ -2666,12 +2664,29 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (!vma || !(vma->vm_flags & VM_SHARED))
goto out;
- if (start < vma->vm_start || start + size > vma->vm_end)
+ if (start < vma->vm_start)
goto out;
- if (pgoff == linear_page_index(vma, start)) {
- ret = 0;
- goto out;
+ if (start + size > vma->vm_end) {
+ struct vm_area_struct *next;
+
+ for (next = vma->vm_next; next; next = next->vm_next) {
+ /* hole between vmas ? */
+ if (next->vm_start != next->vm_prev->vm_end)
+ goto out;
+
+ if (next->vm_file != vma->vm_file)
+ goto out;
+
+ if (next->vm_flags != vma->vm_flags)
+ goto out;
+
+ if (start + size <= next->vm_end)
+ break;
+ }
+
+ if (!next)
+ goto out;
}
prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
@@ -2681,9 +2696,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags &= MAP_NONBLOCK;
flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
if (vma->vm_flags & VM_LOCKED) {
+ struct vm_area_struct *tmp;
flags |= MAP_LOCKED;
+
/* drop PG_Mlocked flag for over-mapped range */
- munlock_vma_pages_range(vma, start, start + size);
+ for (tmp = vma; tmp->vm_start >= start + size;
+ tmp = tmp->vm_next) {
+ munlock_vma_pages_range(tmp,
+ max(tmp->vm_start, start),
+ min(tmp->vm_end, start + size));
+ }
}
file = get_file(vma->vm_file);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 8eb7bb4..f7cb3d4 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -160,9 +160,11 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
}
if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) {
- if (next - addr != HPAGE_PMD_SIZE)
+ if (next - addr != HPAGE_PMD_SIZE) {
split_huge_pmd(vma, pmd, addr);
- else {
+ if (pmd_none(*pmd))
+ continue;
+ } else {
int nr_ptes = change_huge_pmd(vma, pmd, addr,
newprot, prot_numa);
diff --git a/mm/mremap.c b/mm/mremap.c
index d77946a..8eeba02 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -210,6 +210,8 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
}
}
split_huge_pmd(vma, old_pmd, old_addr);
+ if (pmd_none(*old_pmd))
+ continue;
VM_BUG_ON(pmd_trans_huge(*old_pmd));
}
if (pmd_none(*new_pmd) && __pte_alloc(new_vma->vm_mm, new_vma,
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index ea2c4d3..838ca8bb 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6620,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page)
return !has_unmovable_pages(zone, page, 0, true);
}
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
static unsigned long pfn_max_align_down(unsigned long pfn)
{
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 9d47676..06a005b 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -90,9 +90,9 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
* ARCHes with special requirements for evicting THP backing TLB entries can
* implement this. Otherwise also, it can help optimize normal TLB flush in
* THP regime. stock flush_tlb_range() typically has optimization to nuke the
- * entire TLB TLB if flush span is greater than a threshhold, which will
+ * entire TLB if flush span is greater than a threshold, which will
* likely be true for a single huge page. Thus a single thp flush will
- * invalidate the entire TLB which is not desitable.
+ * invalidate the entire TLB which is not desirable.
* e.g. see arch/arc: flush_pmd_tlb_range
*/
#define flush_pmd_tlb_range(vma, addr, end) flush_tlb_range(vma, addr, end)
@@ -195,7 +195,9 @@ pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
VM_BUG_ON(pmd_trans_huge(*pmdp));
pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
- flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
+
+ /* collapse entails shooting down ptes not pmd */
+ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return pmd;
}
#endif
diff --git a/mm/slab.c b/mm/slab.c
index 6ecc697..621fbcb 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2275,7 +2275,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
err = setup_cpu_cache(cachep, gfp);
if (err) {
- __kmem_cache_shutdown(cachep);
+ __kmem_cache_release(cachep);
return err;
}
@@ -2414,12 +2414,13 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
+ return __kmem_cache_shrink(cachep, false);
+}
+
+void __kmem_cache_release(struct kmem_cache *cachep)
+{
int i;
struct kmem_cache_node *n;
- int rc = __kmem_cache_shrink(cachep, false);
-
- if (rc)
- return rc;
free_percpu(cachep->cpu_cache);
@@ -2430,7 +2431,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
kfree(n);
cachep->node[i] = NULL;
}
- return 0;
}
/*
diff --git a/mm/slab.h b/mm/slab.h
index 834ad24..2eedace 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -140,6 +140,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *, bool);
void slab_kmem_cache_release(struct kmem_cache *);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index b50aef0..065b7bd 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -693,6 +693,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,
void slab_kmem_cache_release(struct kmem_cache *s)
{
+ __kmem_cache_release(s);
destroy_memcg_params(s);
kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
diff --git a/mm/slob.c b/mm/slob.c
index 17e8f8c..5ec1580 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -630,6 +630,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
{
return 0;
diff --git a/mm/slub.c b/mm/slub.c
index 2e1355a..d8fbd4a 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1592,18 +1592,12 @@ static inline void add_partial(struct kmem_cache_node *n,
__add_partial(n, page, tail);
}
-static inline void
-__remove_partial(struct kmem_cache_node *n, struct page *page)
-{
- list_del(&page->lru);
- n->nr_partial--;
-}
-
static inline void remove_partial(struct kmem_cache_node *n,
struct page *page)
{
lockdep_assert_held(&n->list_lock);
- __remove_partial(n, page);
+ list_del(&page->lru);
+ n->nr_partial--;
}
/*
@@ -3184,6 +3178,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
}
}
+void __kmem_cache_release(struct kmem_cache *s)
+{
+ free_percpu(s->cpu_slab);
+ free_kmem_cache_nodes(s);
+}
+
static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
@@ -3443,28 +3443,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
/*
* Attempt to free all partial slabs on a node.
- * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because sysfs file might still access partial list after the shutdowning.
*/
static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
{
struct page *page, *h;
+ BUG_ON(irqs_disabled());
+ spin_lock_irq(&n->list_lock);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- __remove_partial(n, page);
+ remove_partial(n, page);
discard_slab(s, page);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on kmem_cache_close()");
+ "Objects remaining in %s on __kmem_cache_shutdown()");
}
}
+ spin_unlock_irq(&n->list_lock);
}
/*
* Release all resources used by a slab cache.
*/
-static inline int kmem_cache_close(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
{
int node;
struct kmem_cache_node *n;
@@ -3476,16 +3479,9 @@ static inline int kmem_cache_close(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
- free_percpu(s->cpu_slab);
- free_kmem_cache_nodes(s);
return 0;
}
-int __kmem_cache_shutdown(struct kmem_cache *s)
-{
- return kmem_cache_close(s);
-}
-
/********************************************************************
* Kmalloc subsystem
*******************************************************************/
@@ -3980,7 +3976,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
if (err)
- kmem_cache_close(s);
+ __kmem_cache_release(s);
return err;
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index eb3dd37..71b1c29 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page)
int ret = -EBUSY;
VM_BUG_ON_PAGE(!page_count(page), page);
- VM_BUG_ON_PAGE(PageTail(page), page);
+ WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
if (PageLRU(page)) {
struct zone *zone = page_zone(page);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 40b2c74..084c672 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w)
* Counters were updated so we expect more updates
* to occur in the future. Keep on running the
* update worker thread.
+ * If we were marked on cpu_stat_off clear the flag
+ * so that vmstat_shepherd doesn't schedule us again.
*/
- queue_delayed_work_on(smp_processor_id(), vmstat_wq,
- this_cpu_ptr(&vmstat_work),
- round_jiffies_relative(sysctl_stat_interval));
+ if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+ cpu_stat_off)) {
+ queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+ this_cpu_ptr(&vmstat_work),
+ round_jiffies_relative(sysctl_stat_interval));
+ }
} else {
/*
* We did not update any counters so the app may be in
@@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w)
* until the diffs stay at zero. The function is used by NOHZ and can only be
* invoked when tick processing is not active.
*/
-void quiet_vmstat(void)
-{
- if (system_state != SYSTEM_RUNNING)
- return;
-
- do {
- if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
- cancel_delayed_work(this_cpu_ptr(&vmstat_work));
-
- } while (refresh_cpu_vm_stats(false));
-}
-
/*
* Check if the diffs for a certain cpu indicate that
* an update is needed.
@@ -1452,6 +1445,30 @@ static bool need_update(int cpu)
return false;
}
+void quiet_vmstat(void)
+{
+ if (system_state != SYSTEM_RUNNING)
+ return;
+
+ /*
+ * If we are already in hands of the shepherd then there
+ * is nothing for us to do here.
+ */
+ if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+ return;
+
+ if (!need_update(smp_processor_id()))
+ return;
+
+ /*
+ * Just refresh counters and do not care about the pending delayed
+ * vmstat_update. It doesn't fire that often to matter and canceling
+ * it would be too expensive from this path.
+ * vmstat_shepherd will take care about that for us.
+ */
+ refresh_cpu_vm_stats(false);
+}
+
/*
* Shepherd worker thread that checks the
@@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w)
get_online_cpus();
/* Check processors whose vmstat worker threads have been disabled */
- for_each_cpu(cpu, cpu_stat_off)
- if (need_update(cpu) &&
- cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-
- queue_delayed_work_on(cpu, vmstat_wq,
- &per_cpu(vmstat_work, cpu), 0);
+ for_each_cpu(cpu, cpu_stat_off) {
+ struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+ if (need_update(cpu)) {
+ if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+ queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+ } else {
+ /*
+ * Cancel the work if quiet_vmstat has put this
+ * cpu on cpu_stat_off because the work item might
+ * be still scheduled
+ */
+ cancel_delayed_work(dw);
+ }
+ }
put_online_cpus();
schedule_delayed_work(&shepherd,
round_jiffies_relative(sysctl_stat_interval));
-
}
static void __init start_shepherd_timer(void)
@@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void)
int cpu;
for_each_possible_cpu(cpu)
- INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))