author | Scott Wood <scottwood@freescale.com> | 2014-04-07 23:49:35 (GMT)
committer | Scott Wood <scottwood@freescale.com> | 2014-04-07 23:49:35 (GMT)
commit | 62b8c978ee6b8d135d9e7953221de58000dba986 (patch)
tree | 683b04b2e627f6710c22c151b23c8cc9a165315e /mm
parent | 78fd82238d0e5716578c326404184a27ba67fd6e (diff)
download | linux-fsl-qoriq-62b8c978ee6b8d135d9e7953221de58000dba986.tar.xz
Rewind v3.13-rc3+ (78fd82238d0e5716) to v3.12
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 23
-rw-r--r-- | mm/bootmem.c | 8
-rw-r--r-- | mm/compaction.c | 7
-rw-r--r-- | mm/filemap.c | 10
-rw-r--r-- | mm/filemap_xip.c | 2
-rw-r--r-- | mm/huge_memory.c | 334
-rw-r--r-- | mm/hugetlb.c | 161
-rw-r--r-- | mm/kmemleak.c | 4
-rw-r--r-- | mm/ksm.c | 4
-rw-r--r-- | mm/memblock.c | 124
-rw-r--r-- | mm/memcontrol.c | 179
-rw-r--r-- | mm/memory-failure.c | 38
-rw-r--r-- | mm/memory.c | 178
-rw-r--r-- | mm/memory_hotplug.c | 65
-rw-r--r-- | mm/mempolicy.c | 149
-rw-r--r-- | mm/migrate.c | 92
-rw-r--r-- | mm/mm_init.c | 18
-rw-r--r-- | mm/mmap.c | 23
-rw-r--r-- | mm/mmzone.c | 14
-rw-r--r-- | mm/mprotect.c | 69
-rw-r--r-- | mm/nobootmem.c | 25
-rw-r--r-- | mm/nommu.c | 5
-rw-r--r-- | mm/oom_kill.c | 6
-rw-r--r-- | mm/page_alloc.c | 38
-rw-r--r-- | mm/percpu.c | 5
-rw-r--r-- | mm/pgtable-generic.c | 16
-rw-r--r-- | mm/readahead.c | 8
-rw-r--r-- | mm/rmap.c | 15
-rw-r--r-- | mm/slab.c | 573
-rw-r--r-- | mm/slab.h | 6
-rw-r--r-- | mm/slab_common.c | 2
-rw-r--r-- | mm/slub.c | 49
-rw-r--r-- | mm/sparse.c | 53
-rw-r--r-- | mm/swap.c | 146
-rw-r--r-- | mm/swapfile.c | 16
-rw-r--r-- | mm/util.c | 13
-rw-r--r-- | mm/vmalloc.c | 48
-rw-r--r-- | mm/vmstat.c | 22
-rw-r--r-- | mm/zswap.c | 195
39 files changed, 1247 insertions, 1496 deletions
@@ -20,7 +20,7 @@ config FLATMEM_MANUAL Some users of more advanced features like NUMA and memory hotplug may have different options here. - DISCONTIGMEM is a more mature, better tested system, + DISCONTIGMEM is an more mature, better tested system, but is incompatible with memory hotplug and may suffer decreased performance over SPARSEMEM. If unsure between "Sparse Memory" and "Discontiguous Memory", choose @@ -153,18 +153,11 @@ config MOVABLE_NODE help Allow a node to have only movable memory. Pages used by the kernel, such as direct mapping pages cannot be migrated. So the corresponding - memory device cannot be hotplugged. This option allows the following - two things: - - When the system is booting, node full of hotpluggable memory can - be arranged to have only movable memory so that the whole node can - be hot-removed. (need movable_node boot option specified). - - After the system is up, the option allows users to online all the - memory of a node as movable memory so that the whole node can be - hot-removed. - - Users who don't use the memory hotplug feature are fine with this - option on since they don't specify movable_node boot option or they - don't online memory as movable. + memory device cannot be hotplugged. This option allows users to + online all the memory of a node as movable memory so that the whole + node can be hotplugged. Users who don't use the memory hotplug + feature are fine with this option on since they don't online memory + as movable. Say Y here if you want to hotplug a whole node. Say N here if you want kernel to use memory on all nodes evenly. @@ -218,11 +211,9 @@ config SPLIT_PTLOCK_CPUS int default "999999" if ARM && !CPU_CACHE_VIPT default "999999" if PARISC && !PA20 + default "999999" if DEBUG_SPINLOCK || DEBUG_LOCK_ALLOC default "4" -config ARCH_ENABLE_SPLIT_PMD_PTLOCK - boolean - # # support for memory balloon compaction config BALLOON_COMPACTION diff --git a/mm/bootmem.c b/mm/bootmem.c index 90bd350..6ab7744 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -172,12 +172,11 @@ void __init free_bootmem_late(unsigned long physaddr, unsigned long size) static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) { struct page *page; - unsigned long *map, start, end, pages, count = 0; + unsigned long start, end, pages, count = 0; if (!bdata->node_bootmem_map) return 0; - map = bdata->node_bootmem_map; start = bdata->node_min_pfn; end = bdata->node_low_pfn; @@ -185,9 +184,10 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) bdata - bootmem_node_data, start, end); while (start < end) { - unsigned long idx, vec; + unsigned long *map, idx, vec; unsigned shift; + map = bdata->node_bootmem_map; idx = start - bdata->node_min_pfn; shift = idx & (BITS_PER_LONG - 1); /* @@ -784,7 +784,7 @@ void * __init __alloc_bootmem_node_high(pg_data_t *pgdat, unsigned long size, return kzalloc_node(size, GFP_NOWAIT, pgdat->node_id); /* update goal according ...MAX_DMA32_PFN */ - end_pfn = pgdat_end_pfn(pgdat); + end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; if (end_pfn > MAX_DMA32_PFN + (128 >> (20 - PAGE_SHIFT)) && (goal >> PAGE_SHIFT) < MAX_DMA32_PFN) { diff --git a/mm/compaction.c b/mm/compaction.c index 805165b..b5326b1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -235,9 +235,10 @@ static bool suitable_migration_target(struct page *page) } /* - * Isolate free pages onto a private freelist. 
If @strict is true, will abort - * returning 0 on any invalid PFNs or non-free pages inside of the pageblock - * (even though it may still end up isolating some pages). + * Isolate free pages onto a private freelist. Caller must hold zone->lock. + * If @strict is true, will abort returning 0 on any invalid PFNs or non-free + * pages inside of the pageblock (even though it may still end up isolating + * some pages). */ static unsigned long isolate_freepages_block(struct compact_control *cc, unsigned long blockpfn, diff --git a/mm/filemap.c b/mm/filemap.c index b7749a9..ae4846f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1090,6 +1090,7 @@ static void shrink_readahead_size_eio(struct file *filp, * @filp: the file to read * @ppos: current file position * @desc: read_descriptor + * @actor: read method * * This is a generic file read routine, and uses the * mapping->a_ops->readpage() function for the actual low-level stuff. @@ -1098,7 +1099,7 @@ static void shrink_readahead_size_eio(struct file *filp, * of the logic when it comes to error handling etc. */ static void do_generic_file_read(struct file *filp, loff_t *ppos, - read_descriptor_t *desc) + read_descriptor_t *desc, read_actor_t actor) { struct address_space *mapping = filp->f_mapping; struct inode *inode = mapping->host; @@ -1199,14 +1200,13 @@ page_ok: * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... * - * The file_read_actor routine returns how many bytes were - * actually used.. + * The actor routine returns how many bytes were actually used.. * NOTE! This may not be the same as how much of a user buffer * we filled up (we may be padding etc), so we can only update * "pos" here (the actor routine has to update the user buffer * pointers and the remaining count). */ - ret = file_read_actor(desc, page, offset, nr); + ret = actor(desc, page, offset, nr); offset += ret; index += offset >> PAGE_CACHE_SHIFT; offset &= ~PAGE_CACHE_MASK; @@ -1479,7 +1479,7 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov, if (desc.count == 0) continue; desc.error = 0; - do_generic_file_read(filp, ppos, &desc); + do_generic_file_read(filp, ppos, &desc, file_read_actor); retval += desc.written; if (desc.error) { retval = retval ?: desc.error; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index d8d9fe3..28fe26b 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -26,7 +26,7 @@ * of ZERO_PAGE(), such as /dev/zero */ static DEFINE_MUTEX(xip_sparse_mutex); -static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq); +static seqcount_t xip_sparse_seq = SEQCNT_ZERO; static struct page *__xip_sparse_page; /* called under xip_sparse_mutex */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bccd5a6..cca80d9 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -27,12 +27,11 @@ #include "internal.h" /* - * By default transparent hugepage support is disabled in order that avoid - * to risk increase the memory footprint of applications without a guaranteed - * benefit. When transparent hugepage support is enabled, is for all mappings, - * and khugepaged scans all mappings. - * Defrag is invoked by khugepaged hugepage allocations and by page faults - * for all hugepage allocations. + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. 
*/ unsigned long transparent_hugepage_flags __read_mostly = #ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS @@ -710,7 +709,6 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct page *page) { pgtable_t pgtable; - spinlock_t *ptl; VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); @@ -725,9 +723,9 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, */ __SetPageUptodate(page); - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_none(*pmd))) { - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); @@ -739,8 +737,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); - atomic_long_inc(&mm->nr_ptes); - spin_unlock(ptl); + mm->nr_ptes++; + spin_unlock(&mm->page_table_lock); } return 0; @@ -760,7 +758,14 @@ static inline struct page *alloc_hugepage_vma(int defrag, HPAGE_PMD_ORDER, vma, haddr, nd); } -/* Caller must hold page table lock. */ +#ifndef CONFIG_NUMA +static inline struct page *alloc_hugepage(int defrag) +{ + return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), + HPAGE_PMD_ORDER); +} +#endif + static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, struct page *zero_page) @@ -773,7 +778,7 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = pmd_mkhuge(entry); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - atomic_long_inc(&mm->nr_ptes); + mm->nr_ptes++; return true; } @@ -792,7 +797,6 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (!(flags & FAULT_FLAG_WRITE) && transparent_hugepage_use_zero_page()) { - spinlock_t *ptl; pgtable_t pgtable; struct page *zero_page; bool set; @@ -805,10 +809,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); set = set_huge_zero_page(pgtable, mm, vma, haddr, pmd, zero_page); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); if (!set) { pte_free(mm, pgtable); put_huge_zero_page(); @@ -841,7 +845,6 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma) { - spinlock_t *dst_ptl, *src_ptl; struct page *src_page; pmd_t pmd; pgtable_t pgtable; @@ -852,9 +855,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (unlikely(!pgtable)) goto out; - dst_ptl = pmd_lock(dst_mm, dst_pmd); - src_ptl = pmd_lockptr(src_mm, src_pmd); - spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + spin_lock(&dst_mm->page_table_lock); + spin_lock_nested(&src_mm->page_table_lock, SINGLE_DEPTH_NESTING); ret = -EAGAIN; pmd = *src_pmd; @@ -863,7 +865,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_unlock; } /* - * When page table lock is held, the huge zero pmd should not be + * mm->page_table_lock is enough to be sure that huge zero pmd is not * under splitting since we don't split the page itself, only pmd to * a page table. 
*/ @@ -884,8 +886,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, } if (unlikely(pmd_trans_splitting(pmd))) { /* split huge page running from under us */ - spin_unlock(src_ptl); - spin_unlock(dst_ptl); + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); pte_free(dst_mm, pgtable); wait_split_huge_page(vma->anon_vma, src_pmd); /* src_vma */ @@ -901,12 +903,12 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd = pmd_mkold(pmd_wrprotect(pmd)); pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - atomic_long_inc(&dst_mm->nr_ptes); + dst_mm->nr_ptes++; ret = 0; out_unlock: - spin_unlock(src_ptl); - spin_unlock(dst_ptl); + spin_unlock(&src_mm->page_table_lock); + spin_unlock(&dst_mm->page_table_lock); out: return ret; } @@ -917,11 +919,10 @@ void huge_pmd_set_accessed(struct mm_struct *mm, pmd_t *pmd, pmd_t orig_pmd, int dirty) { - spinlock_t *ptl; pmd_t entry; unsigned long haddr; - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto unlock; @@ -931,14 +932,13 @@ void huge_pmd_set_accessed(struct mm_struct *mm, update_mmu_cache_pmd(vma, address, pmd); unlock: - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); } static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr) { - spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; struct page *page; @@ -965,7 +965,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_page; @@ -992,7 +992,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, } smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); put_huge_zero_page(); inc_mm_counter(mm, MM_ANONPAGES); @@ -1002,7 +1002,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, out: return ret; out_free_page: - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_page(page); put_page(page); @@ -1016,7 +1016,6 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { - spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; int ret = 0, i; @@ -1063,7 +1062,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON(!PageHead(page)); @@ -1089,7 +1088,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, smp_wmb(); /* make pte visible before pmd */ pmd_populate(mm, pmd, pgtable); page_remove_rmap(page); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); @@ -1100,7 +1099,7 @@ out: return ret; out_free_pages: - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1115,19 +1114,17 @@ out_free_pages: int 
do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, pmd_t orig_pmd) { - spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ - ptl = pmd_lockptr(mm, pmd); VM_BUG_ON(!vma->anon_vma); haddr = address & HPAGE_PMD_MASK; if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(ptl); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, orig_pmd))) goto out_unlock; @@ -1143,7 +1140,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out_unlock; } get_page(page); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) @@ -1190,11 +1187,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - spin_lock(ptl); + spin_lock(&mm->page_table_lock); if (page) put_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mem_cgroup_uncharge_page(new_page); put_page(new_page); goto out_mn; @@ -1216,13 +1213,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); out_mn: mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); return ret; } @@ -1234,7 +1231,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, struct mm_struct *mm = vma->vm_mm; struct page *page = NULL; - assert_spin_locked(pmd_lockptr(mm, pmd)); + assert_spin_locked(&mm->page_table_lock); if (flags & FOLL_WRITE && !pmd_write(*pmd)) goto out; @@ -1281,37 +1278,23 @@ out: int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t pmd, pmd_t *pmdp) { - spinlock_t *ptl; struct anon_vma *anon_vma = NULL; struct page *page; unsigned long haddr = addr & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); - int target_nid, last_cpupid = -1; + int target_nid; bool page_locked; bool migrated = false; - int flags = 0; - ptl = pmd_lock(mm, pmdp); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) goto out_unlock; page = pmd_page(pmd); - BUG_ON(is_huge_zero_page(page)); page_nid = page_to_nid(page); - last_cpupid = page_cpupid_last(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == this_nid) { + if (page_nid == this_nid) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - flags |= TNF_FAULT_LOCAL; - } - - /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. - */ - if (!pmd_write(pmd)) - flags |= TNF_NO_GROUP; /* * Acquire the page lock to serialise THP migrations but avoid dropping @@ -1329,7 +1312,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * relock and check_same as the page may no longer be mapped. * As the fault is being retried, do not account for it. 
*/ - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1337,13 +1320,13 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Page is misplaced, serialise migrations and parallel THP splits */ get_page(page); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); if (!page_locked) lock_page(page); anon_vma = page_lock_anon_vma_read(page); - /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(ptl); + /* Confirm the PTE did not while locked */ + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(pmd, *pmdp))) { unlock_page(page); put_page(page); @@ -1355,13 +1338,11 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, * Migrate the THP to the requested node, returns with page unlocked * and pmd_numa cleared. */ - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); migrated = migrate_misplaced_transhuge_page(mm, vma, pmdp, pmd, addr, page, target_nid); - if (migrated) { - flags |= TNF_MIGRATED; + if (migrated) page_nid = target_nid; - } goto out; clear_pmdnuma: @@ -1372,14 +1353,14 @@ clear_pmdnuma: update_mmu_cache_pmd(vma, addr, pmdp); unlock_page(page); out_unlock: - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, flags); + task_numa_fault(page_nid, HPAGE_PMD_NR, migrated); return 0; } @@ -1387,10 +1368,9 @@ out: int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr) { - spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (__pmd_trans_huge_lock(pmd, vma) == 1) { struct page *page; pgtable_t pgtable; pmd_t orig_pmd; @@ -1404,8 +1384,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_remove_pmd_tlb_entry(tlb, pmd, addr); pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { - atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(ptl); + tlb->mm->nr_ptes--; + spin_unlock(&tlb->mm->page_table_lock); put_huge_zero_page(); } else { page = pmd_page(orig_pmd); @@ -1413,8 +1393,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, VM_BUG_ON(page_mapcount(page) < 0); add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); VM_BUG_ON(!PageHead(page)); - atomic_long_dec(&tlb->mm->nr_ptes); - spin_unlock(ptl); + tlb->mm->nr_ptes--; + spin_unlock(&tlb->mm->page_table_lock); tlb_remove_page(tlb, page); } pte_free(tlb->mm, pgtable); @@ -1427,15 +1407,14 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec) { - spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (__pmd_trans_huge_lock(pmd, vma) == 1) { /* * All logical pages in the range are present * if backed by a huge page. */ - spin_unlock(ptl); + spin_unlock(&vma->vm_mm->page_table_lock); memset(vec, 1, (end - addr) >> PAGE_SHIFT); ret = 1; } @@ -1448,7 +1427,6 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, unsigned long new_addr, unsigned long old_end, pmd_t *old_pmd, pmd_t *new_pmd) { - spinlock_t *old_ptl, *new_ptl; int ret = 0; pmd_t pmd; @@ -1469,69 +1447,41 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, goto out; } - /* - * We don't have to worry about the ordering of src and dst - * ptlocks because exclusive mmap_sem prevents deadlock. 
- */ - ret = __pmd_trans_huge_lock(old_pmd, vma, &old_ptl); + ret = __pmd_trans_huge_lock(old_pmd, vma); if (ret == 1) { - new_ptl = pmd_lockptr(mm, new_pmd); - if (new_ptl != old_ptl) - spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); - if (new_ptl != old_ptl) - spin_unlock(new_ptl); - spin_unlock(old_ptl); + spin_unlock(&mm->page_table_lock); } out: return ret; } -/* - * Returns - * - 0 if PMD could not be locked - * - 1 if PMD was locked but protections unchange and TLB flush unnecessary - * - HPAGE_PMD_NR is protections changed and TLB flush necessary - */ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, pgprot_t newprot, int prot_numa) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (__pmd_trans_huge_lock(pmd, vma) == 1) { pmd_t entry; - ret = 1; + entry = pmdp_get_and_clear(mm, addr, pmd); if (!prot_numa) { - entry = pmdp_get_and_clear(mm, addr, pmd); entry = pmd_modify(entry, newprot); - ret = HPAGE_PMD_NR; BUG_ON(pmd_write(entry)); } else { struct page *page = pmd_page(*pmd); - /* - * Do not trap faults against the zero page. The - * read-only data is likely to be read-cached on the - * local CPU cache and it is less useful to know about - * local vs remote hits on the zero page. - */ - if (!is_huge_zero_page(page) && + /* only check non-shared pages */ + if (page_mapcount(page) == 1 && !pmd_numa(*pmd)) { - entry = pmdp_get_and_clear(mm, addr, pmd); entry = pmd_mknuma(entry); - ret = HPAGE_PMD_NR; } } - - /* Set PMD if cleared earlier */ - if (ret == HPAGE_PMD_NR) - set_pmd_at(mm, addr, pmd, entry); - - spin_unlock(ptl); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + ret = 1; } return ret; @@ -1544,13 +1494,12 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns 1, this routine returns without unlocking page * table locks. So callers must unlock them. */ -int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - *ptl = pmd_lock(vma->vm_mm, pmd); + spin_lock(&vma->vm_mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(*ptl); + spin_unlock(&vma->vm_mm->page_table_lock); wait_split_huge_page(vma->anon_vma, pmd); return -1; } else { @@ -1559,37 +1508,27 @@ int __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, return 1; } } - spin_unlock(*ptl); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } -/* - * This function returns whether a given @page is mapped onto the @address - * in the virtual space of @mm. - * - * When it's true, this function returns *pmd with holding the page table lock - * and passing it back to the caller via @ptl. - * If it's false, returns NULL without holding the page table lock. 
- */ pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, - enum page_check_address_pmd_flag flag, - spinlock_t **ptl) + enum page_check_address_pmd_flag flag) { - pmd_t *pmd; + pmd_t *pmd, *ret = NULL; if (address & ~HPAGE_PMD_MASK) - return NULL; + goto out; pmd = mm_find_pmd(mm, address); if (!pmd) - return NULL; - *ptl = pmd_lock(mm, pmd); + goto out; if (pmd_none(*pmd)) - goto unlock; + goto out; if (pmd_page(*pmd) != page) - goto unlock; + goto out; /* * split_vma() may create temporary aliased mappings. There is * no risk as long as all huge pmd are found and have their @@ -1599,15 +1538,14 @@ pmd_t *page_check_address_pmd(struct page *page, */ if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && pmd_trans_splitting(*pmd)) - goto unlock; + goto out; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); - return pmd; + ret = pmd; } -unlock: - spin_unlock(*ptl); - return NULL; +out: + return ret; } static int __split_huge_page_splitting(struct page *page, @@ -1615,7 +1553,6 @@ static int __split_huge_page_splitting(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; pmd_t *pmd; int ret = 0; /* For mmu_notifiers */ @@ -1623,8 +1560,9 @@ static int __split_huge_page_splitting(struct page *page, const unsigned long mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); + spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, &ptl); + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); if (pmd) { /* * We can't temporarily set the pmd to null in order @@ -1635,8 +1573,8 @@ static int __split_huge_page_splitting(struct page *page, */ pmdp_splitting_flush(vma, address, pmd); ret = 1; - spin_unlock(ptl); } + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return ret; @@ -1724,7 +1662,7 @@ static void __split_huge_page_refcount(struct page *page, page_tail->mapping = page->mapping; page_tail->index = page->index + i; - page_cpupid_xchg_last(page_tail, page_cpupid_last(page)); + page_nid_xchg_last(page_tail, page_nid_last(page)); BUG_ON(!PageAnon(page_tail)); BUG_ON(!PageUptodate(page_tail)); @@ -1767,14 +1705,14 @@ static int __split_huge_page_map(struct page *page, unsigned long address) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; pmd_t *pmd, _pmd; int ret = 0, i; pgtable_t pgtable; unsigned long haddr; + spin_lock(&mm->page_table_lock); pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, &ptl); + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -1829,8 +1767,8 @@ static int __split_huge_page_map(struct page *page, pmdp_invalidate(vma, address, pmd); pmd_populate(mm, pmd, pgtable); ret = 1; - spin_unlock(ptl); } + spin_unlock(&mm->page_table_lock); return ret; } @@ -2227,34 +2165,7 @@ static void khugepaged_alloc_sleep(void) msecs_to_jiffies(khugepaged_alloc_sleep_millisecs)); } -static int khugepaged_node_load[MAX_NUMNODES]; - #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) -{ - static int last_khugepaged_target_node = NUMA_NO_NODE; - int nid, target_node = 0, max_value = 0; - - /* find first node with max normal pages hit */ - for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = 
khugepaged_node_load[nid]; - target_node = nid; - } - - /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { - target_node = nid; - break; - } - - last_khugepaged_target_node = target_node; - return target_node; -} - static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) { if (IS_ERR(*hpage)) { @@ -2288,8 +2199,9 @@ static struct page * mmap_sem in read mode is good idea also to allow greater * scalability. */ - *hpage = alloc_pages_exact_node(node, alloc_hugepage_gfpmask( - khugepaged_defrag(), __GFP_OTHER_NODE), HPAGE_PMD_ORDER); + *hpage = alloc_hugepage_vma(khugepaged_defrag(), vma, address, + node, __GFP_OTHER_NODE); + /* * After allocating the hugepage, release the mmap_sem read lock in * preparation for taking it in write mode. @@ -2305,17 +2217,6 @@ static struct page return *hpage; } #else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_hugepage(int defrag) -{ - return alloc_pages(alloc_hugepage_gfpmask(defrag, 0), - HPAGE_PMD_ORDER); -} - static struct page *khugepaged_alloc_hugepage(bool *wait) { struct page *hpage; @@ -2382,7 +2283,7 @@ static void collapse_huge_page(struct mm_struct *mm, pte_t *pte; pgtable_t pgtable; struct page *new_page; - spinlock_t *pmd_ptl, *pte_ptl; + spinlock_t *ptl; int isolated; unsigned long hstart, hend; unsigned long mmun_start; /* For mmu_notifiers */ @@ -2425,12 +2326,12 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_lock_write(vma->anon_vma); pte = pte_offset_map(pmd, address); - pte_ptl = pte_lockptr(mm, pmd); + ptl = pte_lockptr(mm, pmd); mmun_start = address; mmun_end = address + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - pmd_ptl = pmd_lock(mm, pmd); /* probably unnecessary */ + spin_lock(&mm->page_table_lock); /* probably unnecessary */ /* * After this gup_fast can't run anymore. This also removes * any huge TLB entry from the CPU so we won't allow @@ -2438,16 +2339,16 @@ static void collapse_huge_page(struct mm_struct *mm, * to avoid the risk of CPU bugs in that area. */ _pmd = pmdp_clear_flush(vma, address, pmd); - spin_unlock(pmd_ptl); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - spin_lock(pte_ptl); + spin_lock(ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); - spin_unlock(pte_ptl); + spin_unlock(ptl); if (unlikely(!isolated)) { pte_unmap(pte); - spin_lock(pmd_ptl); + spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); /* * We can only use set_pmd_at when establishing @@ -2455,7 +2356,7 @@ static void collapse_huge_page(struct mm_struct *mm, * points to regular pagetables. 
Use pmd_populate for that */ pmd_populate(mm, pmd, pmd_pgtable(_pmd)); - spin_unlock(pmd_ptl); + spin_unlock(&mm->page_table_lock); anon_vma_unlock_write(vma->anon_vma); goto out; } @@ -2466,7 +2367,7 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl); + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); @@ -2481,13 +2382,13 @@ static void collapse_huge_page(struct mm_struct *mm, */ smp_wmb(); - spin_lock(pmd_ptl); + spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - spin_unlock(pmd_ptl); + spin_unlock(&mm->page_table_lock); *hpage = NULL; @@ -2522,7 +2423,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (pmd_trans_huge(*pmd)) goto out; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { @@ -2539,13 +2439,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (unlikely(!page)) goto out_unmap; /* - * Record which node the original page is from and save this - * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max - * hit record. + * Chose the node of the first page. This could + * be more sophisticated and look at more pages, + * but isn't for now. */ - node = page_to_nid(page); - khugepaged_node_load[node]++; + if (node == NUMA_NO_NODE) + node = page_to_nid(page); VM_BUG_ON(PageCompound(page)); if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) goto out_unmap; @@ -2560,11 +2459,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); + if (ret) /* collapse_huge_page will return with the mmap_sem released */ collapse_huge_page(mm, address, hpage, vma, node); - } out: return ret; } @@ -2816,7 +2713,6 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, pmd_t *pmd) { - spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; unsigned long haddr = address & HPAGE_PMD_MASK; @@ -2829,22 +2725,22 @@ void __split_huge_page_pmd(struct vm_area_struct *vma, unsigned long address, mmun_end = haddr + HPAGE_PMD_SIZE; again: mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_trans_huge(*pmd))) { - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } if (is_huge_zero_pmd(*pmd)) { __split_huge_zero_page_pmd(vma, haddr, pmd); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); return; } page = pmd_page(*pmd); VM_BUG_ON(!page_count(page)); get_page(page); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); split_huge_page(page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index dee6cf4..0b7656e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -476,6 +476,40 @@ static int vma_has_reserves(struct vm_area_struct *vma, 
long chg) return 0; } +static void copy_gigantic_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); @@ -702,23 +736,6 @@ int PageHuge(struct page *page) } EXPORT_SYMBOL_GPL(PageHuge); -/* - * PageHeadHuge() only returns true for hugetlbfs head page, but not for - * normal or transparent huge pages. - */ -int PageHeadHuge(struct page *page_head) -{ - compound_page_dtor *dtor; - - if (!PageHead(page_head)) - return 0; - - dtor = get_compound_page_dtor(page_head); - - return dtor == free_huge_page; -} -EXPORT_SYMBOL_GPL(PageHeadHuge); - pgoff_t __basepage_index(struct page *page) { struct page *page_head = compound_head(page); @@ -2359,7 +2376,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; for (addr = vma->vm_start; addr < vma->vm_end; addr += sz) { - spinlock_t *src_ptl, *dst_ptl; src_pte = huge_pte_offset(src, addr); if (!src_pte) continue; @@ -2371,9 +2387,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, if (dst_pte == src_pte) continue; - dst_ptl = huge_pte_lock(h, dst, dst_pte); - src_ptl = huge_pte_lockptr(h, src, src_pte); - spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); + spin_lock(&dst->page_table_lock); + spin_lock_nested(&src->page_table_lock, SINGLE_DEPTH_NESTING); if (!huge_pte_none(huge_ptep_get(src_pte))) { if (cow) huge_ptep_set_wrprotect(src, addr, src_pte); @@ -2383,8 +2398,8 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, page_dup_rmap(ptepage); set_huge_pte_at(dst, addr, dst_pte, entry); } - spin_unlock(src_ptl); - spin_unlock(dst_ptl); + spin_unlock(&src->page_table_lock); + spin_unlock(&dst->page_table_lock); } return 0; @@ -2427,7 +2442,6 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long address; pte_t *ptep; pte_t pte; - spinlock_t *ptl; struct page *page; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); @@ -2441,25 +2455,25 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); again: + spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); if (!ptep) continue; - ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) - goto unlock; + continue; pte = huge_ptep_get(ptep); if (huge_pte_none(pte)) - goto unlock; + continue; /* * HWPoisoned hugepage is already unmapped and dropped reference */ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { huge_pte_clear(mm, address, ptep); - goto unlock; + continue; } page = pte_page(pte); @@ -2470,7 +2484,7 @@ again: */ if (ref_page) { if (page != ref_page) - goto unlock; + 
continue; /* * Mark the VMA as having unmapped its page so that @@ -2487,18 +2501,13 @@ again: page_remove_rmap(page); force_flush = !__tlb_remove_page(tlb, page); - if (force_flush) { - spin_unlock(ptl); + if (force_flush) break; - } /* Bail out after unmapping reference page if supplied */ - if (ref_page) { - spin_unlock(ptl); + if (ref_page) break; - } -unlock: - spin_unlock(ptl); } + spin_unlock(&mm->page_table_lock); /* * mmu_gather ran out of room to batch pages, we break out of * the PTE lock to avoid doing the potential expensive TLB invalidate @@ -2604,7 +2613,7 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, */ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, pte_t pte, - struct page *pagecache_page, spinlock_t *ptl) + struct page *pagecache_page) { struct hstate *h = hstate_vma(vma); struct page *old_page, *new_page; @@ -2638,8 +2647,8 @@ retry_avoidcopy: page_cache_get(old_page); - /* Drop page table lock as buddy allocator may be called */ - spin_unlock(ptl); + /* Drop page_table_lock as buddy allocator may be called */ + spin_unlock(&mm->page_table_lock); new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { @@ -2657,13 +2666,13 @@ retry_avoidcopy: BUG_ON(huge_pte_none(pte)); if (unmap_ref_private(mm, vma, old_page, address)) { BUG_ON(huge_pte_none(pte)); - spin_lock(ptl); + spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) goto retry_avoidcopy; /* - * race occurs while re-acquiring page table - * lock, and our job is done. + * race occurs while re-acquiring page_table_lock, and + * our job is done. */ return 0; } @@ -2671,7 +2680,7 @@ retry_avoidcopy: } /* Caller expects lock to be held */ - spin_lock(ptl); + spin_lock(&mm->page_table_lock); if (err == -ENOMEM) return VM_FAULT_OOM; else @@ -2686,7 +2695,7 @@ retry_avoidcopy: page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(ptl); + spin_lock(&mm->page_table_lock); return VM_FAULT_OOM; } @@ -2698,10 +2707,10 @@ retry_avoidcopy: mmun_end = mmun_start + huge_page_size(h); mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); /* - * Retake the page table lock to check for racing updates + * Retake the page_table_lock to check for racing updates * before the page tables are altered */ - spin_lock(ptl); + spin_lock(&mm->page_table_lock); ptep = huge_pte_offset(mm, address & huge_page_mask(h)); if (likely(pte_same(huge_ptep_get(ptep), pte))) { ClearPagePrivate(new_page); @@ -2715,13 +2724,13 @@ retry_avoidcopy: /* Make the old page be freed below */ new_page = old_page; } - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); page_cache_release(new_page); page_cache_release(old_page); /* Caller expects lock to be held */ - spin_lock(ptl); + spin_lock(&mm->page_table_lock); return 0; } @@ -2769,7 +2778,6 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; struct address_space *mapping; pte_t new_pte; - spinlock_t *ptl; /* * Currently, we are forced to kill the process in the event the @@ -2856,8 +2864,7 @@ retry: goto backout_unlocked; } - ptl = huge_pte_lockptr(h, mm, ptep); - spin_lock(ptl); + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) goto backout; @@ -2878,16 +2885,16 @@ retry: if 
((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page, ptl); + ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); unlock_page(page); out: return ret; backout: - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); backout_unlocked: unlock_page(page); put_page(page); @@ -2899,7 +2906,6 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, { pte_t *ptep; pte_t entry; - spinlock_t *ptl; int ret; struct page *page = NULL; struct page *pagecache_page = NULL; @@ -2912,7 +2918,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { - migration_entry_wait_huge(vma, mm, ptep); + migration_entry_wait_huge(mm, ptep); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | @@ -2968,18 +2974,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (page != pagecache_page) lock_page(page); - ptl = huge_pte_lockptr(h, mm, ptep); - spin_lock(ptl); + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) - goto out_ptl; + goto out_page_table_lock; if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, - pagecache_page, ptl); - goto out_ptl; + pagecache_page); + goto out_page_table_lock; } entry = huge_pte_mkdirty(entry); } @@ -2988,8 +2993,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, ptep); -out_ptl: - spin_unlock(ptl); +out_page_table_lock: + spin_unlock(&mm->page_table_lock); if (pagecache_page) { unlock_page(pagecache_page); @@ -3015,9 +3020,9 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long remainder = *nr_pages; struct hstate *h = hstate_vma(vma); + spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; - spinlock_t *ptl = NULL; int absent; struct page *page; @@ -3025,12 +3030,8 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * Some archs (sparc64, sh*) have multiple pte_ts to * each hugepage. We have to make sure we get the * first, for the page indexing below to work. - * - * Note that page table lock is not held when pte is null. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); - if (pte) - ptl = huge_pte_lock(h, mm, pte); absent = !pte || huge_pte_none(huge_ptep_get(pte)); /* @@ -3042,8 +3043,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, */ if (absent && (flags & FOLL_DUMP) && !hugetlbfs_pagecache_present(h, vma, vaddr)) { - if (pte) - spin_unlock(ptl); remainder = 0; break; } @@ -3063,10 +3062,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, !huge_pte_write(huge_ptep_get(pte)))) { int ret; - if (pte) - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); ret = hugetlb_fault(mm, vma, vaddr, (flags & FOLL_WRITE) ? 
FAULT_FLAG_WRITE : 0); + spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; @@ -3097,8 +3096,8 @@ same_page: */ goto same_page; } - spin_unlock(ptl); } + spin_unlock(&mm->page_table_lock); *nr_pages = remainder; *position = vaddr; @@ -3119,15 +3118,13 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, address, end); mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); + spin_lock(&mm->page_table_lock); for (; address < end; address += huge_page_size(h)) { - spinlock_t *ptl; ptep = huge_pte_offset(mm, address); if (!ptep) continue; - ptl = huge_pte_lock(h, mm, ptep); if (huge_pmd_unshare(mm, &address, ptep)) { pages++; - spin_unlock(ptl); continue; } if (!huge_pte_none(huge_ptep_get(ptep))) { @@ -3137,8 +3134,8 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, set_huge_pte_at(mm, address, ptep, pte); pages++; } - spin_unlock(ptl); } + spin_unlock(&mm->page_table_lock); /* * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare * may have cleared our pud entry and done put_page on the page table: @@ -3301,7 +3298,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) unsigned long saddr; pte_t *spte = NULL; pte_t *pte; - spinlock_t *ptl; if (!vma_shareable(vma, addr)) return (pte_t *)pmd_alloc(mm, pud, addr); @@ -3324,14 +3320,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!spte) goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte); - spin_lock(ptl); + spin_lock(&mm->page_table_lock); if (pud_none(*pud)) pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); else put_page(virt_to_page(spte)); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); mutex_unlock(&mapping->i_mmap_mutex); @@ -3345,7 +3340,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * called with page table lock held. + * called with vma->vm_mm->page_table_lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 31f01c5..e126b0e 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -753,9 +753,7 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) } spin_lock_irqsave(&object->lock, flags); - if (size == SIZE_MAX) { - size = object->pointer + object->size - ptr; - } else if (ptr + size > object->pointer + object->size) { + if (ptr + size > object->pointer + object->size) { kmemleak_warn("Scan area larger than object 0x%08lx\n", ptr); dump_object_info(object); kmem_cache_free(scan_area_cache, area); @@ -2309,8 +2309,8 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj, * Allocate stable and unstable together: * MAXSMP NODES_SHIFT 10 will use 16kB. 
*/ - buf = kcalloc(nr_node_ids + nr_node_ids, sizeof(*buf), - GFP_KERNEL); + buf = kcalloc(nr_node_ids + nr_node_ids, + sizeof(*buf), GFP_KERNEL | __GFP_ZERO); /* Let us assume that RB_ROOT is NULL is zero */ if (!buf) err = -ENOMEM; diff --git a/mm/memblock.c b/mm/memblock.c index 53e477b..0ac412a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -20,8 +20,6 @@ #include <linux/seq_file.h> #include <linux/memblock.h> -#include <asm-generic/sections.h> - static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock; @@ -34,7 +32,6 @@ struct memblock memblock __initdata_memblock = { .reserved.cnt = 1, /* empty dummy entry */ .reserved.max = INIT_MEMBLOCK_REGIONS, - .bottom_up = false, .current_limit = MEMBLOCK_ALLOC_ANYWHERE, }; @@ -85,73 +82,6 @@ static long __init_memblock memblock_overlaps_region(struct memblock_type *type, return (i < type->cnt) ? i : -1; } -/* - * __memblock_find_range_bottom_up - find free area utility in bottom-up - * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} - * @size: size of free area to find - * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node - * - * Utility called from memblock_find_in_range_node(), find free area bottom-up. - * - * RETURNS: - * Found address on success, 0 on failure. - */ -static phys_addr_t __init_memblock -__memblock_find_range_bottom_up(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid) -{ - phys_addr_t this_start, this_end, cand; - u64 i; - - for_each_free_mem_range(i, nid, &this_start, &this_end, NULL) { - this_start = clamp(this_start, start, end); - this_end = clamp(this_end, start, end); - - cand = round_up(this_start, align); - if (cand < this_end && this_end - cand >= size) - return cand; - } - - return 0; -} - -/** - * __memblock_find_range_top_down - find free area utility, in top-down - * @start: start of candidate range - * @end: end of candidate range, can be %MEMBLOCK_ALLOC_{ANYWHERE|ACCESSIBLE} - * @size: size of free area to find - * @align: alignment of free area to find - * @nid: nid of the free area to find, %MAX_NUMNODES for any node - * - * Utility called from memblock_find_in_range_node(), find free area top-down. - * - * RETURNS: - * Found address on success, 0 on failure. - */ -static phys_addr_t __init_memblock -__memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, - phys_addr_t size, phys_addr_t align, int nid) -{ - phys_addr_t this_start, this_end, cand; - u64 i; - - for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { - this_start = clamp(this_start, start, end); - this_end = clamp(this_end, start, end); - - if (this_end < size) - continue; - - cand = round_down(this_end - size, align); - if (cand >= this_start) - return cand; - } - - return 0; -} - /** * memblock_find_in_range_node - find free area in given range and node * @start: start of candidate range @@ -162,23 +92,15 @@ __memblock_find_range_top_down(phys_addr_t start, phys_addr_t end, * * Find @size free area aligned to @align in the specified range and node. * - * When allocation direction is bottom-up, the @start should be greater - * than the end of the kernel image. Otherwise, it will be trimmed. 
The - * reason is that we want the bottom-up allocation just near the kernel - * image so it is highly likely that the allocated memory and the kernel - * will reside in the same node. - * - * If bottom-up allocation failed, will try to allocate memory top-down. - * * RETURNS: - * Found address on success, 0 on failure. + * Found address on success, %0 on failure. */ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid) { - int ret; - phys_addr_t kernel_end; + phys_addr_t this_start, this_end, cand; + u64 i; /* pump up @end */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) @@ -187,39 +109,19 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, /* avoid allocating the first page */ start = max_t(phys_addr_t, start, PAGE_SIZE); end = max(start, end); - kernel_end = __pa_symbol(_end); - /* - * try bottom-up allocation only when bottom-up mode - * is set and @end is above the kernel image. - */ - if (memblock_bottom_up() && end > kernel_end) { - phys_addr_t bottom_up_start; - - /* make sure we will allocate above the kernel */ - bottom_up_start = max(start, kernel_end); + for_each_free_mem_range_reverse(i, nid, &this_start, &this_end, NULL) { + this_start = clamp(this_start, start, end); + this_end = clamp(this_end, start, end); - /* ok, try bottom-up allocation first */ - ret = __memblock_find_range_bottom_up(bottom_up_start, end, - size, align, nid); - if (ret) - return ret; + if (this_end < size) + continue; - /* - * we always limit bottom-up allocation above the kernel, - * but top-down allocation doesn't have the limit, so - * retrying top-down allocation may succeed when bottom-up - * allocation failed. - * - * bottom-up allocation is expected to be fail very rarely, - * so we use WARN_ONCE() here to see the stack trace if - * fail happens. - */ - WARN_ONCE(1, "memblock: bottom-up allocation failed, " - "memory hotunplug may be affected\n"); + cand = round_down(this_end - size, align); + if (cand >= this_start) + return cand; } - - return __memblock_find_range_top_down(start, end, size, align, nid); + return 0; } /** @@ -232,7 +134,7 @@ phys_addr_t __init_memblock memblock_find_in_range_node(phys_addr_t start, * Find @size free area aligned to @align in the specified range. * * RETURNS: - * Found address on success, 0 on failure. + * Found address on success, %0 on failure. */ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f1a0ae6..13b9d0f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -59,7 +59,6 @@ #include <net/sock.h> #include <net/ip.h> #include <net/tcp_memcontrol.h> -#include "slab.h" #include <asm/uaccess.h> @@ -313,7 +312,7 @@ struct mem_cgroup { atomic_t dead_count; #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET) - struct cg_proto tcp_mem; + struct tcp_memcontrol tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) /* analogous to slab_common's slab_caches list. per-memcg */ @@ -500,29 +499,6 @@ static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) return (memcg == root_mem_cgroup); } -/* - * We restrict the id in the range of [1, 65535], so it can fit into - * an unsigned short. - */ -#define MEM_CGROUP_ID_MAX USHRT_MAX - -static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) -{ - /* - * The ID of the root cgroup is 0, but memcg treat 0 as an - * invalid ID, so we return (cgroup_id + 1). 
- */ - return memcg->css.cgroup->id + 1; -} - -static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) -{ - struct cgroup_subsys_state *css; - - css = css_from_id(id - 1, &mem_cgroup_subsys); - return mem_cgroup_from_css(css); -} - /* Writing them here to avoid exposing memcg's inner layout */ #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) @@ -575,13 +551,13 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) if (!memcg || mem_cgroup_is_root(memcg)) return NULL; - return &memcg->tcp_mem; + return &memcg->tcp_mem.cg_proto; } EXPORT_SYMBOL(tcp_proto_cgroup); static void disarm_sock_keys(struct mem_cgroup *memcg) { - if (!memcg_proto_activated(&memcg->tcp_mem)) + if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) return; static_key_slow_dec(&memcg_socket_limit_enabled); } @@ -594,11 +570,16 @@ static void disarm_sock_keys(struct mem_cgroup *memcg) #ifdef CONFIG_MEMCG_KMEM /* * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. - * The main reason for not using cgroup id for this: - * this works better in sparse environments, where we have a lot of memcgs, - * but only a few kmem-limited. Or also, if we have, for instance, 200 - * memcgs, and none but the 200th is kmem-limited, we'd have to have a - * 200 entry array for that. + * There are two main reasons for not using the css_id for this: + * 1) this works better in sparse environments, where we have a lot of memcgs, + * but only a few kmem-limited. Or also, if we have, for instance, 200 + * memcgs, and none but the 200th is kmem-limited, we'd have to have a + * 200 entry array for that. + * + * 2) In order not to violate the cgroup API, we would like to do all memory + * allocation in ->create(). At that point, we haven't yet allocated the + * css_id. Having a separate index prevents us from messing with the cgroup + * core for this * * The current size of the caches array is stored in * memcg_limited_groups_array_size. It will double each time we have to @@ -613,14 +594,14 @@ int memcg_limited_groups_array_size; * cgroups is a reasonable guess. In the future, it could be a parameter or * tunable, but that is strictly not necessary. * - * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get + * MAX_SIZE should be as large as the number of css_ids. Ideally, we could get * this constant directly from cgroup, but it is understandable that this is * better kept as an internal representation in cgroup.c. In any case, the - * cgrp_id space is not getting any smaller, and we don't have to necessarily + * css_id space is not getting any smaller, and we don't have to necessarily * increase ours as well if it increases. 
*/ #define MEMCG_CACHES_MIN_SIZE 4 -#define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX +#define MEMCG_CACHES_MAX_SIZE 65535 /* * A lot of the calls to the cache allocation functions are expected to be @@ -1427,7 +1408,7 @@ bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return true; if (!root_memcg->use_hierarchy || !memcg) return false; - return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); + return css_is_ancestor(&memcg->css, &root_memcg->css); } static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, @@ -2845,10 +2826,15 @@ static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg, */ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) { + struct cgroup_subsys_state *css; + /* ID 0 is unused ID */ if (!id) return NULL; - return mem_cgroup_from_id(id); + css = css_lookup(&mem_cgroup_subsys, id); + if (!css) + return NULL; + return mem_cgroup_from_css(css); } struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) @@ -2969,7 +2955,7 @@ static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) VM_BUG_ON(p->is_root_cache); cachep = p->root_cache; - return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); + return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)]; } #ifdef CONFIG_SLABINFO @@ -2998,14 +2984,21 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) struct res_counter *fail_res; struct mem_cgroup *_memcg; int ret = 0; + bool may_oom; ret = res_counter_charge(&memcg->kmem, size, &fail_res); if (ret) return ret; + /* + * Conditions under which we can wait for the oom_killer. Those are + * the same conditions tested by the core page allocator + */ + may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); + _memcg = memcg; ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, - &_memcg, oom_gfp_allowed(gfp)); + &_memcg, may_oom); if (ret == -EINTR) { /* @@ -3145,7 +3138,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; - VM_BUG_ON(!is_root_cache(s)); + VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache); if (num_groups > memcg_limited_groups_array_size) { int i; @@ -3406,7 +3399,7 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, idx = memcg_cache_id(memcg); mutex_lock(&memcg_cache_mutex); - new_cachep = cache_from_memcg_idx(cachep, idx); + new_cachep = cachep->memcg_params->memcg_caches[idx]; if (new_cachep) { css_put(&memcg->css); goto out; @@ -3452,8 +3445,8 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s) * we'll take the set_limit_mutex to protect ourselves against this. */ mutex_lock(&set_limit_mutex); - for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); + for (i = 0; i < memcg_limited_groups_array_size; i++) { + c = s->memcg_params->memcg_caches[i]; if (!c) continue; @@ -3586,8 +3579,8 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * code updating memcg_caches will issue a write barrier to match this. */ read_barrier_depends(); - if (likely(cache_from_memcg_idx(cachep, idx))) { - cachep = cache_from_memcg_idx(cachep, idx); + if (likely(cachep->memcg_params->memcg_caches[idx])) { + cachep = cachep->memcg_params->memcg_caches[idx]; goto out; } @@ -4357,7 +4350,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) * css_get() was called in uncharge(). 
*/ if (do_swap_account && swapout && memcg) - swap_cgroup_record(ent, mem_cgroup_id(memcg)); + swap_cgroup_record(ent, css_id(&memcg->css)); } #endif @@ -4409,8 +4402,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, { unsigned short old_id, new_id; - old_id = mem_cgroup_id(from); - new_id = mem_cgroup_id(to); + old_id = css_id(&from->css); + new_id = css_id(&to->css); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { mem_cgroup_swap_statistics(from, false); @@ -5383,50 +5376,45 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, static int memcg_numa_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, struct seq_file *m) { - struct numa_stat { - const char *name; - unsigned int lru_mask; - }; - - static const struct numa_stat stats[] = { - { "total", LRU_ALL }, - { "file", LRU_ALL_FILE }, - { "anon", LRU_ALL_ANON }, - { "unevictable", BIT(LRU_UNEVICTABLE) }, - }; - const struct numa_stat *stat; int nid; - unsigned long nr; + unsigned long total_nr, file_nr, anon_nr, unevictable_nr; + unsigned long node_nr; struct mem_cgroup *memcg = mem_cgroup_from_css(css); - for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); - seq_printf(m, "%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = mem_cgroup_node_nr_lru_pages(memcg, nid, - stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } - seq_putc(m, '\n'); - } - - for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { - struct mem_cgroup *iter; - - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); - seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); - for_each_node_state(nid, N_MEMORY) { - nr = 0; - for_each_mem_cgroup_tree(iter, memcg) - nr += mem_cgroup_node_nr_lru_pages( - iter, nid, stat->lru_mask); - seq_printf(m, " N%d=%lu", nid, nr); - } - seq_putc(m, '\n'); + total_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL); + seq_printf(m, "total=%lu", total_nr); + for_each_node_state(nid, N_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL); + seq_printf(m, " N%d=%lu", nid, node_nr); } + seq_putc(m, '\n'); + file_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_FILE); + seq_printf(m, "file=%lu", file_nr); + for_each_node_state(nid, N_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + LRU_ALL_FILE); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + anon_nr = mem_cgroup_nr_lru_pages(memcg, LRU_ALL_ANON); + seq_printf(m, "anon=%lu", anon_nr); + for_each_node_state(nid, N_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + LRU_ALL_ANON); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); + + unevictable_nr = mem_cgroup_nr_lru_pages(memcg, BIT(LRU_UNEVICTABLE)); + seq_printf(m, "unevictable=%lu", unevictable_nr); + for_each_node_state(nid, N_MEMORY) { + node_nr = mem_cgroup_node_nr_lru_pages(memcg, nid, + BIT(LRU_UNEVICTABLE)); + seq_printf(m, " N%d=%lu", nid, node_nr); + } + seq_putc(m, '\n'); return 0; } #endif /* CONFIG_NUMA */ @@ -6178,6 +6166,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) size_t size = memcg_size(); mem_cgroup_remove_from_trees(memcg); + free_css_id(&mem_cgroup_subsys, &memcg->css); for_each_node(node) free_mem_cgroup_per_zone_info(memcg, node); @@ -6280,9 +6269,6 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css) struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css)); int error = 0; - if (css->cgroup->id > 
MEM_CGROUP_ID_MAX) - return -ENOSPC; - if (!parent) return 0; @@ -6554,7 +6540,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma, } /* There is a swap entry and a page doesn't exist or isn't charged */ if (ent.val && !ret && - mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) { + css_id(&mc.from->css) == lookup_swap_cgroup_id(ent)) { ret = MC_TARGET_SWAP; if (target) target->ent = ent; @@ -6605,10 +6591,10 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma) == 1) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; - spin_unlock(ptl); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -6797,9 +6783,9 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, * to be unlocked in __split_huge_page_splitting(), where the main * part of thp split is not executed yet. */ - if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma) == 1) { if (mc.precharge < HPAGE_PMD_NR) { - spin_unlock(ptl); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } target_type = get_mctgt_type_thp(vma, addr, *pmd, &target); @@ -6816,7 +6802,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, } put_page(page); } - spin_unlock(ptl); + spin_unlock(&vma->vm_mm->page_table_lock); return 0; } @@ -6974,6 +6960,7 @@ struct cgroup_subsys mem_cgroup_subsys = { .bind = mem_cgroup_bind, .base_cftypes = mem_cgroup_files, .early_init = 0, + .use_id = 1, }; #ifdef CONFIG_MEMCG_SWAP diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b7c1716..bf3351b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1269,7 +1269,7 @@ void memory_failure_queue(unsigned long pfn, int trapno, int flags) mf_cpu = &get_cpu_var(memory_failure_cpu); spin_lock_irqsave(&mf_cpu->lock, proc_flags); - if (kfifo_put(&mf_cpu->fifo, entry)) + if (kfifo_put(&mf_cpu->fifo, &entry)) schedule_work_on(smp_processor_id(), &mf_cpu->work); else pr_err("Memory failure: buffer overflow when queuing memory failure at %#lx\n", @@ -1423,6 +1423,19 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) return 1; /* + * The lock_memory_hotplug prevents a race with memory hotplug. + * This is a big hammer, a better would be nicer. + */ + lock_memory_hotplug(); + + /* + * Isolate the page, so that it doesn't get reallocated if it + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. + */ + if (get_pageblock_migratetype(p) != MIGRATE_ISOLATE) + set_migratetype_isolate(p, true); + /* * When the target page is a free hugepage, just remove it * from free hugepage list. */ @@ -1442,6 +1455,7 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) /* Not a free page */ ret = 1; } + unlock_memory_hotplug(); return ret; } @@ -1640,28 +1654,15 @@ int soft_offline_page(struct page *page, int flags) } } - /* - * The lock_memory_hotplug prevents a race with memory hotplug. - * This is a big hammer, a better would be nicer. - */ - lock_memory_hotplug(); - - /* - * Isolate the page, so that it doesn't get reallocated if it - * was free. This flag should be kept set until the source page - * is freed and PG_hwpoison on it is set. 
- */ - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) - set_migratetype_isolate(page, true); - ret = get_any_page(page, pfn, flags); - unlock_memory_hotplug(); - if (ret > 0) { /* for in-use pages */ + if (ret < 0) + goto unset; + if (ret) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); else ret = __soft_offline_page(page, flags); - } else if (ret == 0) { /* for free pages */ + } else { /* for free pages */ if (PageHuge(page)) { set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); @@ -1672,6 +1673,7 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } +unset: unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } diff --git a/mm/memory.c b/mm/memory.c index 5d9025f..d176154 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -69,8 +69,8 @@ #include "internal.h" -#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS -#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_cpupid. +#ifdef LAST_NID_NOT_IN_PAGE_FLAGS +#warning Unfortunate NUMA and NUMA Balancing config, growing page-frame for last_nid. #endif #ifndef CONFIG_NEED_MULTIPLE_NODES @@ -382,7 +382,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd, pgtable_t token = pmd_pgtable(*pmd); pmd_clear(pmd); pte_free_tlb(tlb, token, addr); - atomic_long_dec(&tlb->mm->nr_ptes); + tlb->mm->nr_ptes--; } static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, @@ -453,6 +453,8 @@ static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, /* * This function frees user-level page tables of a process. + * + * Must be called with pagetable lock held. */ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, @@ -550,7 +552,6 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, pmd_t *pmd, unsigned long address) { - spinlock_t *ptl; pgtable_t new = pte_alloc_one(mm, address); int wait_split_huge_page; if (!new) @@ -571,15 +572,15 @@ int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, */ smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); wait_split_huge_page = 0; if (likely(pmd_none(*pmd))) { /* Has another populated it ? 
*/ - atomic_long_inc(&mm->nr_ptes); + mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; } else if (unlikely(pmd_trans_splitting(*pmd))) wait_split_huge_page = 1; - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); if (wait_split_huge_page) @@ -680,7 +681,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, if (vma->vm_ops) printk(KERN_ALERT "vma->vm_ops->fault: %pSR\n", vma->vm_ops->fault); - if (vma->vm_file) + if (vma->vm_file && vma->vm_file->f_op) printk(KERN_ALERT "vma->vm_file->f_op->mmap: %pSR\n", vma->vm_file->f_op->mmap); dump_stack(); @@ -1517,20 +1518,20 @@ struct page *follow_page_mask(struct vm_area_struct *vma, split_huge_page_pmd(vma, address, pmd); goto split_fallthrough; } - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); wait_split_huge_page(vma->anon_vma, pmd); } else { page = follow_trans_huge_pmd(vma, address, pmd, flags); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); *page_mask = HPAGE_PMD_NR - 1; goto out; } } else - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); /* fall through */ } split_fallthrough: @@ -2720,14 +2721,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); reuse: - /* - * Clear the pages cpupid information as the existing - * information potentially belongs to a now completely - * unrelated process. - */ - if (old_page) - page_cpupid_xchg_last(old_page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -3528,16 +3521,13 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, } int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, - unsigned long addr, int page_nid, - int *flags) + unsigned long addr, int page_nid) { get_page(page); count_vm_numa_event(NUMA_HINT_FAULTS); - if (page_nid == numa_node_id()) { + if (page_nid == numa_node_id()) count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL); - *flags |= TNF_FAULT_LOCAL; - } return mpol_misplaced(page, vma, addr); } @@ -3548,10 +3538,8 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; spinlock_t *ptl; int page_nid = -1; - int last_cpupid; int target_nid; bool migrated = false; - int flags = 0; /* * The "pte" at this point cannot be used safely without @@ -3578,26 +3566,9 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(ptep, ptl); return 0; } - BUG_ON(is_zero_pfn(page_to_pfn(page))); - - /* - * Avoid grouping on DSO/COW pages in specific and RO pages - * in general, RO pages shouldn't hurt as much anyway since - * they can be in shared cache state. - */ - if (!pte_write(pte)) - flags |= TNF_NO_GROUP; - - /* - * Flag if the page is shared between multiple address spaces. 
This - * is later used when determining whether to group tasks together - */ - if (page_mapcount(page) > 1 && (vma->vm_flags & VM_SHARED)) - flags |= TNF_SHARED; - last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags); + target_nid = numa_migrate_prep(page, vma, addr, page_nid); pte_unmap_unlock(ptep, ptl); if (target_nid == -1) { put_page(page); @@ -3605,18 +3576,103 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, } /* Migrate to the requested node */ - migrated = migrate_misplaced_page(page, vma, target_nid); - if (migrated) { + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) page_nid = target_nid; - flags |= TNF_MIGRATED; - } out: if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, 1, flags); + task_numa_fault(page_nid, 1, migrated); return 0; } +/* NUMA hinting page fault entry point for regular pmds */ +#ifdef CONFIG_NUMA_BALANCING +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd; + pte_t *pte, *orig_pte; + unsigned long _addr = addr & PMD_MASK; + unsigned long offset; + spinlock_t *ptl; + bool numa = false; + + spin_lock(&mm->page_table_lock); + pmd = *pmdp; + if (pmd_numa(pmd)) { + set_pmd_at(mm, _addr, pmdp, pmd_mknonnuma(pmd)); + numa = true; + } + spin_unlock(&mm->page_table_lock); + + if (!numa) + return 0; + + /* we're in a page fault so some vma must be in the range */ + BUG_ON(!vma); + BUG_ON(vma->vm_start >= _addr + PMD_SIZE); + offset = max(_addr, vma->vm_start) & ~PMD_MASK; + VM_BUG_ON(offset >= PMD_SIZE); + orig_pte = pte = pte_offset_map_lock(mm, pmdp, _addr, &ptl); + pte += offset >> PAGE_SHIFT; + for (addr = _addr + offset; addr < _addr + PMD_SIZE; pte++, addr += PAGE_SIZE) { + pte_t pteval = *pte; + struct page *page; + int page_nid = -1; + int target_nid; + bool migrated = false; + + if (!pte_present(pteval)) + continue; + if (!pte_numa(pteval)) + continue; + if (addr >= vma->vm_end) { + vma = find_vma(mm, addr); + /* there's a pte present so there must be a vma */ + BUG_ON(!vma); + BUG_ON(addr < vma->vm_start); + } + if (pte_numa(pteval)) { + pteval = pte_mknonnuma(pteval); + set_pte_at(mm, addr, pte, pteval); + } + page = vm_normal_page(vma, addr, pteval); + if (unlikely(!page)) + continue; + /* only check non-shared pages */ + if (unlikely(page_mapcount(page) != 1)) + continue; + + page_nid = page_to_nid(page); + target_nid = numa_migrate_prep(page, vma, addr, page_nid); + pte_unmap_unlock(pte, ptl); + if (target_nid != -1) { + migrated = migrate_misplaced_page(page, target_nid); + if (migrated) + page_nid = target_nid; + } else { + put_page(page); + } + + if (page_nid != -1) + task_numa_fault(page_nid, 1, migrated); + + pte = pte_offset_map_lock(mm, pmdp, addr, &ptl); + } + pte_unmap_unlock(orig_pte, ptl); + + return 0; +} +#else +static int do_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + BUG(); + return 0; +} +#endif /* CONFIG_NUMA_BALANCING */ + /* * These routines also need to handle stuff like marking pages dirty * and/or accessed for architectures that don't do it in hardware (most @@ -3755,8 +3811,8 @@ retry: } } - /* THP should already have been handled */ - BUG_ON(pmd_numa(*pmd)); + if (pmd_numa(*pmd)) + return do_pmd_numa_page(mm, vma, address, pmd); /* * Use __pte_alloc instead of pte_alloc_map, because we can't @@ -4270,21 +4326,3 @@ void copy_user_huge_page(struct page *dst, struct 
page *src, } } #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ - -#if USE_SPLIT_PTE_PTLOCKS && BLOATED_SPINLOCKS -bool ptlock_alloc(struct page *page) -{ - spinlock_t *ptl; - - ptl = kmalloc(sizeof(spinlock_t), GFP_KERNEL); - if (!ptl) - return false; - page->ptl = ptl; - return true; -} - -void ptlock_free(struct page *page) -{ - kfree(page->ptl); -} -#endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 489f235..ed85fe3 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -31,7 +31,6 @@ #include <linux/firmware-map.h> #include <linux/stop_machine.h> #include <linux/hugetlb.h> -#include <linux/memblock.h> #include <asm/tlbflush.h> @@ -366,7 +365,8 @@ out_fail: static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long old_pgdat_end_pfn = pgdat_end_pfn(pgdat); + unsigned long old_pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; @@ -402,12 +402,13 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn) static int __meminit __add_section(int nid, struct zone *zone, unsigned long phys_start_pfn) { + int nr_pages = PAGES_PER_SECTION; int ret; if (pfn_valid(phys_start_pfn)) return -EEXIST; - ret = sparse_add_one_section(zone, phys_start_pfn); + ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages); if (ret < 0) return ret; @@ -578,9 +579,9 @@ static void shrink_zone_span(struct zone *zone, unsigned long start_pfn, static void shrink_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { - unsigned long pgdat_start_pfn = pgdat->node_start_pfn; - unsigned long p = pgdat_end_pfn(pgdat); /* pgdat_end_pfn namespace clash */ - unsigned long pgdat_end_pfn = p; + unsigned long pgdat_start_pfn = pgdat->node_start_pfn; + unsigned long pgdat_end_pfn = + pgdat->node_start_pfn + pgdat->node_spanned_pages; unsigned long pfn; struct mem_section *ms; int nid = pgdat->node_id; @@ -934,7 +935,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); - nid = pfn_to_nid(pfn); + nid = page_to_nid(pfn_to_page(pfn)); ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -1043,23 +1044,17 @@ static void rollback_node_hotadd(int nid, pg_data_t *pgdat) } -/** - * try_online_node - online a node if offlined - * +/* * called by cpu_up() to online a node without onlined memory. */ -int try_online_node(int nid) +int mem_online_node(int nid) { pg_data_t *pgdat; int ret; - if (node_online(nid)) - return 0; - lock_memory_hotplug(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { - pr_err("Cannot online node %d due to NULL pgdat\n", nid); ret = -ENOMEM; goto out; } @@ -1067,12 +1062,6 @@ int try_online_node(int nid) ret = register_one_node(nid); BUG_ON(ret); - if (pgdat->node_zonelists->_zonerefs->zone == NULL) { - mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL, NULL); - mutex_unlock(&zonelists_mutex); - } - out: unlock_memory_hotplug(); return ret; @@ -1423,36 +1412,6 @@ static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) } #endif /* CONFIG_MOVABLE_NODE */ -static int __init cmdline_parse_movable_node(char *p) -{ -#ifdef CONFIG_MOVABLE_NODE - /* - * Memory used by the kernel cannot be hot-removed because Linux - * cannot migrate the kernel pages. 
When memory hotplug is - * enabled, we should prevent memblock from allocating memory - * for the kernel. - * - * ACPI SRAT records all hotpluggable memory ranges. But before - * SRAT is parsed, we don't know about it. - * - * The kernel image is loaded into memory at very early time. We - * cannot prevent this anyway. So on NUMA system, we set any - * node the kernel resides in as un-hotpluggable. - * - * Since on modern servers, one node could have double-digit - * gigabytes memory, we can assume the memory around the kernel - * image is also un-hotpluggable. So before SRAT is parsed, just - * allocate memory near the kernel image to try the best to keep - * the kernel away from hotpluggable memory. - */ - memblock_set_bottom_up(true); -#else - pr_warn("movable_node option not supported\n"); -#endif - return 0; -} -early_param("movable_node", cmdline_parse_movable_node); - /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -1743,7 +1702,7 @@ int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, } #ifdef CONFIG_MEMORY_HOTREMOVE -static int check_memblock_offlined_cb(struct memory_block *mem, void *arg) +static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); @@ -1895,7 +1854,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) * if this is not the case. */ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, - check_memblock_offlined_cb); + is_memblock_offlined_cb); if (ret) { unlock_memory_hotplug(); BUG(); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index eca4a31..0472964 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -525,9 +525,8 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, #ifdef CONFIG_HUGETLB_PAGE int nid; struct page *page; - spinlock_t *ptl; - ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd); + spin_lock(&vma->vm_mm->page_table_lock); page = pte_page(huge_ptep_get((pte_t *)pmd)); nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) @@ -537,7 +536,7 @@ static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma, (flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) isolate_huge_page(page, private); unlock: - spin_unlock(ptl); + spin_unlock(&vma->vm_mm->page_table_lock); #else BUG(); #endif @@ -1126,7 +1125,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, tmp = *from; while (!nodes_empty(tmp)) { int s,d; - int source = NUMA_NO_NODE; + int source = -1; int dest = 0; for_each_node_mask(s, tmp) { @@ -1161,7 +1160,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from, if (!node_isset(dest, tmp)) break; } - if (source == NUMA_NO_NODE) + if (source == -1) break; node_clear(source, tmp); @@ -1680,30 +1679,6 @@ struct mempolicy *get_vma_policy(struct task_struct *task, return pol; } -bool vma_policy_mof(struct task_struct *task, struct vm_area_struct *vma) -{ - struct mempolicy *pol = get_task_policy(task); - if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) { - bool ret = false; - - pol = vma->vm_ops->get_policy(vma, vma->vm_start); - if (pol && (pol->flags & MPOL_F_MOF)) - ret = true; - mpol_cond_put(pol); - - return ret; - } else if (vma->vm_policy) { - pol = vma->vm_policy; - } - } - - if (!pol) - return default_policy.flags & MPOL_F_MOF; - - return pol->flags & MPOL_F_MOF; -} - static int apply_policy_zone(struct 
mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; @@ -1836,7 +1811,7 @@ static unsigned offset_il_node(struct mempolicy *pol, unsigned nnodes = nodes_weight(pol->v.nodes); unsigned target; int c; - int nid = NUMA_NO_NODE; + int nid = -1; if (!nnodes) return numa_node_id(); @@ -1873,11 +1848,11 @@ static inline unsigned interleave_nid(struct mempolicy *pol, /* * Return the bit number of a random bit set in the nodemask. - * (returns NUMA_NO_NODE if nodemask is empty) + * (returns -1 if nodemask is empty) */ int node_random(const nodemask_t *maskp) { - int w, bit = NUMA_NO_NODE; + int w, bit = -1; w = nodes_weight(*maskp); if (w) @@ -2302,35 +2277,6 @@ static void sp_free(struct sp_node *n) kmem_cache_free(sn_cache, n); } -#ifdef CONFIG_NUMA_BALANCING -static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - /* Never defer a private fault */ - if (cpupid_match_pid(p, last_cpupid)) - return false; - - if (p->numa_migrate_deferred) { - p->numa_migrate_deferred--; - return true; - } - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ - p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred; -} -#else -static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid) -{ - return false; -} - -static inline void defer_numa_migrate(struct task_struct *p) -{ -} -#endif /* CONFIG_NUMA_BALANCING */ - /** * mpol_misplaced - check whether current page node is valid in policy * @@ -2354,8 +2300,6 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long struct zone *zone; int curnid = page_to_nid(page); unsigned long pgoff; - int thiscpu = raw_smp_processor_id(); - int thisnid = cpu_to_node(thiscpu); int polnid = -1; int ret = -1; @@ -2404,11 +2348,9 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long /* Migrate the page towards the node whose CPU is referencing it */ if (pol->flags & MPOL_F_MORON) { - int last_cpupid; - int this_cpupid; + int last_nid; - polnid = thisnid; - this_cpupid = cpu_pid_to_cpupid(thiscpu, current->pid); + polnid = numa_node_id(); /* * Multi-stage node selection is used in conjunction @@ -2431,25 +2373,8 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long * it less likely we act on an unlikely task<->page * relation. */ - last_cpupid = page_cpupid_xchg_last(page, this_cpupid); - if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) { - - /* See sysctl_numa_balancing_migrate_deferred comment */ - if (!cpupid_match_pid(current, last_cpupid)) - defer_numa_migrate(current); - - goto out; - } - - /* - * The quadratic filter above reduces extraneous migration - * of shared pages somewhat. This code reduces it even more, - * reducing the overhead of page migrations of shared pages. - * This makes workloads with shared pages rely more on - * "move task near its memory", and less on "move memory - * towards its task", which is exactly what we want. - */ - if (numa_migrate_deferred(current, last_cpupid)) + last_nid = page_nid_xchg_last(page, polnid); + if (last_nid != polnid) goto out; } @@ -2915,45 +2840,62 @@ out: * @maxlen: length of @buffer * @pol: pointer to mempolicy to be formatted * - * Convert @pol into a string. If @buffer is too short, truncate the string. - * Recommend a @maxlen of at least 32 for the longest mode, "interleave", the - * longest flag, "relative", and to display at least a few node ids. + * Convert a mempolicy into a string. 
+ * Returns the number of characters in buffer (if positive) + * or an error (negative) */ -void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) { char *p = buffer; - nodemask_t nodes = NODE_MASK_NONE; - unsigned short mode = MPOL_DEFAULT; - unsigned short flags = 0; + int l; + nodemask_t nodes; + unsigned short mode; + unsigned short flags = pol ? pol->flags : 0; - if (pol && pol != &default_policy) { + /* + * Sanity check: room for longest mode, flag and some nodes + */ + VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16); + + if (!pol || pol == &default_policy) + mode = MPOL_DEFAULT; + else mode = pol->mode; - flags = pol->flags; - } switch (mode) { case MPOL_DEFAULT: + nodes_clear(nodes); break; + case MPOL_PREFERRED: + nodes_clear(nodes); if (flags & MPOL_F_LOCAL) mode = MPOL_LOCAL; else node_set(pol->v.preferred_node, nodes); break; + case MPOL_BIND: + /* Fall through */ case MPOL_INTERLEAVE: nodes = pol->v.nodes; break; + default: - WARN_ON_ONCE(1); - snprintf(p, maxlen, "unknown"); - return; + return -EINVAL; } - p += snprintf(p, maxlen, "%s", policy_modes[mode]); + l = strlen(policy_modes[mode]); + if (buffer + maxlen < p + l + 1) + return -ENOSPC; + + strcpy(p, policy_modes[mode]); + p += l; if (flags & MPOL_MODE_FLAGS) { - p += snprintf(p, buffer + maxlen - p, "="); + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; /* * Currently, the only defined flags are mutually exclusive @@ -2965,7 +2907,10 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) } if (!nodes_empty(nodes)) { - p += snprintf(p, buffer + maxlen - p, ":"); + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = ':'; p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); } + return p - buffer; } diff --git a/mm/migrate.c b/mm/migrate.c index bb94004..c046927 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -130,7 +130,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, addr); if (!ptep) goto out; - ptl = huge_pte_lockptr(hstate_vma(vma), mm, ptep); + ptl = &mm->page_table_lock; } else { pmd = mm_find_pmd(mm, addr); if (!pmd) @@ -249,10 +249,9 @@ void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, __migration_entry_wait(mm, ptep, ptl); } -void migration_entry_wait_huge(struct vm_area_struct *vma, - struct mm_struct *mm, pte_t *pte) +void migration_entry_wait_huge(struct mm_struct *mm, pte_t *pte) { - spinlock_t *ptl = huge_pte_lockptr(hstate_vma(vma), mm, pte); + spinlock_t *ptl = &(mm)->page_table_lock; __migration_entry_wait(mm, pte, ptl); } @@ -442,60 +441,10 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, } /* - * Gigantic pages are so large that we do not guarantee that page++ pointer - * arithmetic will work across the entire page. We need something more - * specialized. 
- */ -static void __copy_gigantic_page(struct page *dst, struct page *src, - int nr_pages) -{ - int i; - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < nr_pages; ) { - cond_resched(); - copy_highpage(dst, src); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -static void copy_huge_page(struct page *dst, struct page *src) -{ - int i; - int nr_pages; - - if (PageHuge(src)) { - /* hugetlbfs page */ - struct hstate *h = page_hstate(src); - nr_pages = pages_per_huge_page(h); - - if (unlikely(nr_pages > MAX_ORDER_NR_PAGES)) { - __copy_gigantic_page(dst, src, nr_pages); - return; - } - } else { - /* thp page */ - BUG_ON(!PageTransHuge(src)); - nr_pages = hpage_nr_pages(src); - } - - for (i = 0; i < nr_pages; i++) { - cond_resched(); - copy_highpage(dst + i, src + i); - } -} - -/* * Copy the page to its new location */ void migrate_page_copy(struct page *newpage, struct page *page) { - int cpupid; - if (PageHuge(page) || PageTransHuge(page)) copy_huge_page(newpage, page); else @@ -532,13 +481,6 @@ void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } - /* - * Copy NUMA information to the new page, to prevent over-eager - * future migrations of this same page. - */ - cpupid = page_cpupid_xchg_last(page, -1); - page_cpupid_xchg_last(newpage, cpupid); - mlock_migrate_page(newpage, page); ksm_migrate_page(newpage, page); /* @@ -1558,7 +1500,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page, __GFP_NOWARN) & ~GFP_IOFS, 0); if (newpage) - page_cpupid_xchg_last(newpage, page_cpupid_last(page)); + page_nid_xchg_last(newpage, page_nid_last(page)); return newpage; } @@ -1659,8 +1601,7 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) * node. Caller is expected to have an elevated reference count on * the page that will be dropped by this function before returning. */ -int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, - int node) +int migrate_misplaced_page(struct page *page, int node) { pg_data_t *pgdat = NODE_DATA(node); int isolated; @@ -1668,11 +1609,10 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, LIST_HEAD(migratepages); /* - * Don't migrate file pages that are mapped in multiple processes - * with execute permissions as they are probably shared libraries. + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer */ - if (page_mapcount(page) != 1 && page_is_file_cache(page) && - (vma->vm_flags & VM_EXEC)) + if (page_mapcount(page) != 1) goto out; /* @@ -1715,7 +1655,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, unsigned long address, struct page *page, int node) { - spinlock_t *ptl; unsigned long haddr = address & HPAGE_PMD_MASK; pg_data_t *pgdat = NODE_DATA(node); int isolated = 0; @@ -1724,6 +1663,13 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, int page_lru = page_is_file_cache(page); /* + * Don't migrate pages that are mapped in multiple processes. + * TODO: Handle false sharing detection instead of this hammer + */ + if (page_mapcount(page) != 1) + goto out_dropref; + + /* * Rate-limit the amount of data that is being migrated to a node. * Optimal placement is no good if the memory bus is saturated and * all the time is being spent migrating! 
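The migrate.c hunks above restore the older, simpler gating for NUMA hinting migrations: a page is only considered for migration if it is mapped by exactly one process, and per-node migration traffic is rate-limited before any isolation work is done. The sketch below is an illustration of that decision flow rather than patch content; pgdat_rate_limited() is a hypothetical stand-in for the numamigrate_update_ratelimit() bookkeeping in mm/migrate.c.

	/*
	 * Sketch only: mirrors the checks restored above.
	 * pgdat_rate_limited() is a hypothetical helper standing in for
	 * numamigrate_update_ratelimit().
	 */
	static bool numa_migrate_allowed_sketch(struct page *page, pg_data_t *pgdat,
						unsigned long nr_pages)
	{
		/* Shared mappings are skipped outright (the "hammer" in the TODO). */
		if (page_mapcount(page) != 1)
			return false;

		/* Back off if this node is already taking too much migration traffic. */
		if (pgdat_rate_limited(pgdat, nr_pages))
			return false;

		return true;
	}
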
@@ -1736,7 +1682,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, if (!new_page) goto out_fail; - page_cpupid_xchg_last(new_page, page_cpupid_last(page)); + page_nid_xchg_last(new_page, page_nid_last(page)); isolated = numamigrate_isolate_page(pgdat, page); if (!isolated) { @@ -1755,9 +1701,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, WARN_ON(PageLRU(new_page)); /* Recheck the target PMD */ - ptl = pmd_lock(mm, pmd); + spin_lock(&mm->page_table_lock); if (unlikely(!pmd_same(*pmd, entry))) { - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); /* Reverse changes made by migrate_page_copy() */ if (TestClearPageActive(new_page)) @@ -1802,7 +1748,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm, * before it's fully transferred to the new page. */ mem_cgroup_end_migration(memcg, page, new_page, true); - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); unlock_page(new_page); unlock_page(page); diff --git a/mm/mm_init.c b/mm/mm_init.c index 68562e9..633c088 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -71,26 +71,26 @@ void __init mminit_verify_pageflags_layout(void) unsigned long or_mask, add_mask; shift = 8 * sizeof(unsigned long); - width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT; + width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_NID_SHIFT; mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths", - "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n", + "Section %d Node %d Zone %d Lastnid %d Flags %d\n", SECTIONS_WIDTH, NODES_WIDTH, ZONES_WIDTH, - LAST_CPUPID_WIDTH, + LAST_NID_WIDTH, NR_PAGEFLAGS); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts", - "Section %d Node %d Zone %d Lastcpupid %d\n", + "Section %d Node %d Zone %d Lastnid %d\n", SECTIONS_SHIFT, NODES_SHIFT, ZONES_SHIFT, - LAST_CPUPID_SHIFT); + LAST_NID_SHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts", - "Section %lu Node %lu Zone %lu Lastcpupid %lu\n", + "Section %lu Node %lu Zone %lu Lastnid %lu\n", (unsigned long)SECTIONS_PGSHIFT, (unsigned long)NODES_PGSHIFT, (unsigned long)ZONES_PGSHIFT, - (unsigned long)LAST_CPUPID_PGSHIFT); + (unsigned long)LAST_NID_PGSHIFT); mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid", "Node/Zone ID: %lu -> %lu\n", (unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT), @@ -102,9 +102,9 @@ void __init mminit_verify_pageflags_layout(void) mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", "Node not in page flags"); #endif -#ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS +#ifdef LAST_NID_NOT_IN_PAGE_FLAGS mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodeflags", - "Last cpupid not in page flags"); + "Last nid not in page flags"); #endif if (SECTIONS_WIDTH) { @@ -179,12 +179,14 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) goto error; } - allowed = vm_commit_limit(); + allowed = (totalram_pages - hugetlb_total_pages()) + * sysctl_overcommit_ratio / 100; /* * Reserve some for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed += total_swap_pages; /* * Don't let a single process grow so big a user can't recover @@ -1297,7 +1299,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, vm_flags &= ~VM_MAYEXEC; } - if (!file->f_op->mmap) + if (!file->f_op || !file->f_op->mmap) return -ENODEV; if (vm_flags & (VM_GROWSDOWN|VM_GROWSUP)) return -EINVAL; @@ -1854,7 +1856,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, struct vm_area_struct *vma; struct vm_unmapped_area_info 
info; - if (len > TASK_SIZE - mmap_min_addr) + if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) @@ -1863,14 +1865,14 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) return addr; } info.flags = 0; info.length = len; - info.low_limit = mm->mmap_base; + info.low_limit = TASK_UNMAPPED_BASE; info.high_limit = TASK_SIZE; info.align_mask = 0; return vm_unmapped_area(&info); @@ -1893,7 +1895,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, struct vm_unmapped_area_info info; /* requested length too big for entire address space */ - if (len > TASK_SIZE - mmap_min_addr) + if (len > TASK_SIZE) return -ENOMEM; if (flags & MAP_FIXED) @@ -1903,14 +1905,14 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, if (addr) { addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); - if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && + if (TASK_SIZE - len >= addr && (!vma || addr + len <= vma->vm_start)) return addr; } info.flags = VM_UNMAPPED_AREA_TOPDOWN; info.length = len; - info.low_limit = max(PAGE_SIZE, mmap_min_addr); + info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; info.align_mask = 0; addr = vm_unmapped_area(&info); @@ -1949,7 +1951,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, return -ENOMEM; get_area = current->mm->get_unmapped_area; - if (file && file->f_op->get_unmapped_area) + if (file && file->f_op && file->f_op->get_unmapped_area) get_area = file->f_op->get_unmapped_area; addr = get_area(file, addr, len, pgoff, flags); if (IS_ERR_VALUE(addr)) @@ -2724,8 +2726,7 @@ void exit_mmap(struct mm_struct *mm) } vm_unacct_memory(nr_accounted); - WARN_ON(atomic_long_read(&mm->nr_ptes) > - (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); + WARN_ON(mm->nr_ptes > (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT); } /* Insert vm structure into process list sorted by address diff --git a/mm/mmzone.c b/mm/mmzone.c index bf34fb8..2ac0afb 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -97,20 +97,20 @@ void lruvec_init(struct lruvec *lruvec) INIT_LIST_HEAD(&lruvec->lists[lru]); } -#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) -int page_cpupid_xchg_last(struct page *page, int cpupid) +#if defined(CONFIG_NUMA_BALANCING) && !defined(LAST_NID_NOT_IN_PAGE_FLAGS) +int page_nid_xchg_last(struct page *page, int nid) { unsigned long old_flags, flags; - int last_cpupid; + int last_nid; do { old_flags = flags = page->flags; - last_cpupid = page_cpupid_last(page); + last_nid = page_nid_last(page); - flags &= ~(LAST_CPUPID_MASK << LAST_CPUPID_PGSHIFT); - flags |= (cpupid & LAST_CPUPID_MASK) << LAST_CPUPID_PGSHIFT; + flags &= ~(LAST_NID_MASK << LAST_NID_PGSHIFT); + flags |= (nid & LAST_NID_MASK) << LAST_NID_PGSHIFT; } while (unlikely(cmpxchg(&page->flags, old_flags, flags) != old_flags)); - return last_cpupid; + return last_nid; } #endif diff --git a/mm/mprotect.c b/mm/mprotect.c index 2666797..412ba2b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -37,12 +37,14 @@ static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, pgprot_t newprot, - int dirty_accountable, int prot_numa) + int dirty_accountable, int prot_numa, bool *ret_all_same_node) { struct 
mm_struct *mm = vma->vm_mm; pte_t *pte, oldpte; spinlock_t *ptl; unsigned long pages = 0; + bool all_same_node = true; + int last_nid = -1; pte = pte_offset_map_lock(mm, pmd, addr, &ptl); arch_enter_lazy_mmu_mode(); @@ -61,7 +63,15 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = vm_normal_page(vma, addr, oldpte); if (page) { - if (!pte_numa(oldpte)) { + int this_nid = page_to_nid(page); + if (last_nid == -1) + last_nid = this_nid; + if (last_nid != this_nid) + all_same_node = false; + + /* only check non-shared pages */ + if (!pte_numa(oldpte) && + page_mapcount(page) == 1) { ptent = pte_mknuma(ptent); updated = true; } @@ -94,17 +104,33 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (pte_swp_soft_dirty(oldpte)) newpte = pte_swp_mksoft_dirty(newpte); set_pte_at(mm, addr, pte, newpte); - - pages++; } + pages++; } } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); pte_unmap_unlock(pte - 1, ptl); + *ret_all_same_node = all_same_node; return pages; } +#ifdef CONFIG_NUMA_BALANCING +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + spin_lock(&mm->page_table_lock); + set_pmd_at(mm, addr & PMD_MASK, pmd, pmd_mknuma(*pmd)); + spin_unlock(&mm->page_table_lock); +} +#else +static inline void change_pmd_protnuma(struct mm_struct *mm, unsigned long addr, + pmd_t *pmd) +{ + BUG(); +} +#endif /* CONFIG_NUMA_BALANCING */ + static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable, int prot_numa) @@ -112,39 +138,36 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma, pmd_t *pmd; unsigned long next; unsigned long pages = 0; - unsigned long nr_huge_updates = 0; + bool all_same_node; pmd = pmd_offset(pud, addr); do { - unsigned long this_pages; - next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { if (next - addr != HPAGE_PMD_SIZE) split_huge_page_pmd(vma, addr, pmd); - else { - int nr_ptes = change_huge_pmd(vma, pmd, addr, - newprot, prot_numa); - - if (nr_ptes) { - if (nr_ptes == HPAGE_PMD_NR) { - pages += HPAGE_PMD_NR; - nr_huge_updates++; - } - continue; - } + else if (change_huge_pmd(vma, pmd, addr, newprot, + prot_numa)) { + pages++; + continue; } /* fall through */ } if (pmd_none_or_clear_bad(pmd)) continue; - this_pages = change_pte_range(vma, pmd, addr, next, newprot, - dirty_accountable, prot_numa); - pages += this_pages; + pages += change_pte_range(vma, pmd, addr, next, newprot, + dirty_accountable, prot_numa, &all_same_node); + + /* + * If we are changing protections for NUMA hinting faults then + * set pmd_numa if the examined pages were all on the same + * node. 
This allows a regular PMD to be handled as one fault + * and effectively batches the taking of the PTL + */ + if (prot_numa && all_same_node) + change_pmd_protnuma(vma->vm_mm, addr, pmd); } while (pmd++, addr = next, addr != end); - if (nr_huge_updates) - count_vm_numa_events(NUMA_HUGE_PTE_UPDATES, nr_huge_updates); return pages; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 2c254d3..61107cf 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -82,18 +82,27 @@ void __init free_bootmem_late(unsigned long addr, unsigned long size) static void __init __free_pages_memory(unsigned long start, unsigned long end) { - int order; + unsigned long i, start_aligned, end_aligned; + int order = ilog2(BITS_PER_LONG); - while (start < end) { - order = min(MAX_ORDER - 1UL, __ffs(start)); + start_aligned = (start + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); + end_aligned = end & ~(BITS_PER_LONG - 1); - while (start + (1UL << order) > end) - order--; + if (end_aligned <= start_aligned) { + for (i = start; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); - __free_pages_bootmem(pfn_to_page(start), order); - - start += (1UL << order); + return; } + + for (i = start; i < start_aligned; i++) + __free_pages_bootmem(pfn_to_page(i), 0); + + for (i = start_aligned; i < end_aligned; i += BITS_PER_LONG) + __free_pages_bootmem(pfn_to_page(i), order); + + for (i = end_aligned; i < end; i++) + __free_pages_bootmem(pfn_to_page(i), 0); } static unsigned long __init __free_memory_core(phys_addr_t start, @@ -937,7 +937,7 @@ static int validate_mmap_request(struct file *file, struct address_space *mapping; /* files must support mmap */ - if (!file->f_op->mmap) + if (!file->f_op || !file->f_op->mmap) return -ENODEV; /* work out if what we've got could possibly be shared @@ -1948,12 +1948,13 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) goto error; } - allowed = vm_commit_limit(); + allowed = totalram_pages * sysctl_overcommit_ratio / 100; /* * Reserve some 3% for root */ if (!cap_sys_admin) allowed -= sysctl_admin_reserve_kbytes >> (PAGE_SHIFT - 10); + allowed += total_swap_pages; /* * Don't let a single process grow so big a user can't recover diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1e4a600..6738c47 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -161,7 +161,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, * The baseline for the badness score is the proportion of RAM that each * task's rss, pagetable and swap space use. 
*/ - points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) + + points = get_mm_rss(p->mm) + p->mm->nr_ptes + get_mm_counter(p->mm, MM_SWAPENTS); task_unlock(p); @@ -364,10 +364,10 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - atomic_long_read(&task->mm->nr_ptes), + task->mm->nr_ptes, get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 580a5f0..dd886fa 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -234,8 +234,8 @@ int page_group_by_mobility_disabled __read_mostly; void set_pageblock_migratetype(struct page *page, int migratetype) { - if (unlikely(page_group_by_mobility_disabled && - migratetype < MIGRATE_PCPTYPES)) + + if (unlikely(page_group_by_mobility_disabled)) migratetype = MIGRATE_UNMOVABLE; set_pageblock_flags_group(page, (unsigned long)migratetype, @@ -626,7 +626,7 @@ static inline int free_pages_check(struct page *page) bad_page(page); return 1; } - page_cpupid_reset_last(page); + page_nid_reset_last(page); if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; return 0; @@ -1027,10 +1027,6 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, { int current_order = page_order(page); - /* - * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. - */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1095,11 +1091,21 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); + /* + * Borrow the excess buddy pages as well, irrespective + * of whether we stole freepages, or took ownership of + * the pageblock or not. + * + * Exception: When borrowing from MIGRATE_CMA, release + * the excess buddy pages to CMA itself. + */ expand(zone, page, order, current_order, area, - new_type); + is_migrate_cma(migratetype) + ? migratetype : start_migratetype); - trace_mm_page_alloc_extfrag(page, order, current_order, - start_migratetype, migratetype, new_type); + trace_mm_page_alloc_extfrag(page, order, + current_order, start_migratetype, migratetype, + new_type == start_migratetype); return page; } @@ -1705,7 +1711,7 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, * comments in mmzone.h. Reduces cache footprint of zonelist scans * that have to skip over a lot of full or unallowed zones. * - * If the zonelist cache is present in the passed zonelist, then + * If the zonelist cache is present in the passed in zonelist, then * returns a pointer to the allowed node mask (either the current * tasks mems_allowed, or node_states[N_MEMORY].) 
* @@ -2587,7 +2593,7 @@ rebalance: * running out of options and have to consider going OOM */ if (!did_some_progress) { - if (oom_gfp_allowed(gfp_mask)) { + if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { if (oom_killer_disabled) goto nopage; /* Coredumps can quickly deplete all memory reserves */ @@ -3875,6 +3881,8 @@ static inline unsigned long wait_table_bits(unsigned long size) return ffz(~size); } +#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1)) + /* * Check if a pageblock contains reserved pages */ @@ -4007,7 +4015,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, mminit_verify_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); - page_cpupid_reset_last(page); + page_nid_reset_last(page); SetPageReserved(page); /* * Mark the block movable so that blocks are reserved for @@ -4258,7 +4266,7 @@ static __meminit void zone_pcp_init(struct zone *zone) */ zone->pageset = &boot_pageset; - if (populated_zone(zone)) + if (zone->present_pages) printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n", zone->name, zone->present_pages, zone_batchsize(zone)); @@ -5152,7 +5160,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) for (zone_type = 0; zone_type <= ZONE_MOVABLE - 1; zone_type++) { struct zone *zone = &pgdat->node_zones[zone_type]; - if (populated_zone(zone)) { + if (zone->present_pages) { node_set_state(nid, N_HIGH_MEMORY); if (N_NORMAL_MEMORY != N_HIGH_MEMORY && zone_type <= ZONE_NORMAL) diff --git a/mm/percpu.c b/mm/percpu.c index 0d10def..8c8e08f 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1706,9 +1706,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, out_free_areas: for (group = 0; group < ai->nr_groups; group++) - if (areas[group]) - free_fn(areas[group], - ai->groups[group].nr_units * ai->unit_size); + free_fn(areas[group], + ai->groups[group].nr_units * ai->unit_size); out_free: pcpu_free_alloc_info(ai); if (areas) diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index cbb3854..3929a40 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -151,14 +151,14 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable) { - assert_spin_locked(pmd_lockptr(mm, pmdp)); + assert_spin_locked(&mm->page_table_lock); /* FIFO */ - if (!pmd_huge_pte(mm, pmdp)) + if (!mm->pmd_huge_pte) INIT_LIST_HEAD(&pgtable->lru); else - list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru); - pmd_huge_pte(mm, pmdp) = pgtable; + list_add(&pgtable->lru, &mm->pmd_huge_pte->lru); + mm->pmd_huge_pte = pgtable; } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -170,14 +170,14 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; - assert_spin_locked(pmd_lockptr(mm, pmdp)); + assert_spin_locked(&mm->page_table_lock); /* FIFO */ - pgtable = pmd_huge_pte(mm, pmdp); + pgtable = mm->pmd_huge_pte; if (list_empty(&pgtable->lru)) - pmd_huge_pte(mm, pmdp) = NULL; + mm->pmd_huge_pte = NULL; else { - pmd_huge_pte(mm, pmdp) = list_entry(pgtable->lru.next, + mm->pmd_huge_pte = list_entry(pgtable->lru.next, struct page, lru); list_del(&pgtable->lru); } diff --git a/mm/readahead.c b/mm/readahead.c index 7cdbb44..e4ed041 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -401,7 +401,6 @@ ondemand_readahead(struct address_space *mapping, unsigned long req_size) { unsigned long max = max_sane_readahead(ra->ra_pages); - pgoff_t 
prev_offset; /* * start of file @@ -453,11 +452,8 @@ ondemand_readahead(struct address_space *mapping, /* * sequential cache miss - * trivial case: (offset - prev_offset) == 1 - * unaligned reads: (offset - prev_offset) == 0 */ - prev_offset = (unsigned long long)ra->prev_pos >> PAGE_CACHE_SHIFT; - if (offset - prev_offset <= 1UL) + if (offset - (ra->prev_pos >> PAGE_CACHE_SHIFT) <= 1UL) goto initial_readahead; /* @@ -573,7 +569,7 @@ static ssize_t do_readahead(struct address_space *mapping, struct file *filp, pgoff_t index, unsigned long nr) { - if (!mapping || !mapping->a_ops) + if (!mapping || !mapping->a_ops || !mapping->a_ops->readpage) return -EINVAL; force_page_cache_readahead(mapping, filp, index, nr); @@ -601,7 +601,7 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, if (unlikely(PageHuge(page))) { pte = huge_pte_offset(mm, address); - ptl = huge_pte_lockptr(page_hstate(page), mm, pte); + ptl = &mm->page_table_lock; goto check; } @@ -665,23 +665,25 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - spinlock_t *ptl; int referenced = 0; if (unlikely(PageTransHuge(page))) { pmd_t *pmd; + spin_lock(&mm->page_table_lock); /* * rmap might return false positives; we must filter * these out using page_check_address_pmd(). */ pmd = page_check_address_pmd(page, mm, address, - PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl); - if (!pmd) + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (!pmd) { + spin_unlock(&mm->page_table_lock); goto out; + } if (vma->vm_flags & VM_LOCKED) { - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; goto out; @@ -690,9 +692,10 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, /* go ahead even if the pmd is pmd_trans_splitting() */ if (pmdp_clear_flush_young_notify(vma, address, pmd)) referenced++; - spin_unlock(ptl); + spin_unlock(&mm->page_table_lock); } else { pte_t *pte; + spinlock_t *ptl; /* * rmap might return false positives; we must filter @@ -164,6 +164,72 @@ static bool pfmemalloc_active __read_mostly; /* + * kmem_bufctl_t: + * + * Bufctl's are used for linking objs within a slab + * linked offsets. + * + * This implementation relies on "struct page" for locating the cache & + * slab an object belongs to. + * This allows the bufctl structure to be small (one int), but limits + * the number of objects a slab (not a cache) can contain when off-slab + * bufctls are used. The limit is the size of the largest general cache + * that does not use off-slab slabs. + * For 32bit archs with 4 kB pages, is this 56. + * This is not serious, as it is only for large objects, when it is unwise + * to have too many per slab. + * Note: This limit can be raised by introducing a general cache whose size + * is less than 512 (PAGE_SIZE<<3), but greater than 256. + */ + +typedef unsigned int kmem_bufctl_t; +#define BUFCTL_END (((kmem_bufctl_t)(~0U))-0) +#define BUFCTL_FREE (((kmem_bufctl_t)(~0U))-1) +#define BUFCTL_ACTIVE (((kmem_bufctl_t)(~0U))-2) +#define SLAB_LIMIT (((kmem_bufctl_t)(~0U))-3) + +/* + * struct slab_rcu + * + * slab_destroy on a SLAB_DESTROY_BY_RCU cache uses this structure to + * arrange for kmem_freepages to be called via RCU. This is useful if + * we need to approach a kernel structure obliquely, from its address + * obtained without the usual locking. 
We can lock the structure to + * stabilize it and check it's still at the given address, only if we + * can be sure that the memory has not been meanwhile reused for some + * other kind of object (which our subsystem's lock might corrupt). + * + * rcu_read_lock before reading the address, then rcu_read_unlock after + * taking the spinlock within the structure expected at that address. + */ +struct slab_rcu { + struct rcu_head head; + struct kmem_cache *cachep; + void *addr; +}; + +/* + * struct slab + * + * Manages the objs in a slab. Placed either at the beginning of mem allocated + * for a slab, or allocated from an general cache. + * Slabs are chained into three list: fully used, partial, fully free slabs. + */ +struct slab { + union { + struct { + struct list_head list; + unsigned long colouroff; + void *s_mem; /* including colour offset */ + unsigned int inuse; /* num of objs active in slab */ + kmem_bufctl_t free; + unsigned short nodeid; + }; + struct slab_rcu __slab_cover_slab_rcu; + }; +}; + +/* * struct array_cache * * Purpose: @@ -390,10 +456,18 @@ static inline struct kmem_cache *virt_to_cache(const void *obj) return page->slab_cache; } -static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, +static inline struct slab *virt_to_slab(const void *obj) +{ + struct page *page = virt_to_head_page(obj); + + VM_BUG_ON(!PageSlab(page)); + return page->slab_page; +} + +static inline void *index_to_obj(struct kmem_cache *cache, struct slab *slab, unsigned int idx) { - return page->s_mem + cache->size * idx; + return slab->s_mem + cache->size * idx; } /* @@ -403,9 +477,9 @@ static inline void *index_to_obj(struct kmem_cache *cache, struct page *page, * reciprocal_divide(offset, cache->reciprocal_buffer_size) */ static inline unsigned int obj_to_index(const struct kmem_cache *cache, - const struct page *page, void *obj) + const struct slab *slab, void *obj) { - u32 offset = (obj - page->s_mem); + u32 offset = (obj - slab->s_mem); return reciprocal_divide(offset, cache->reciprocal_buffer_size); } @@ -567,7 +641,7 @@ static inline struct array_cache *cpu_cache_get(struct kmem_cache *cachep) static size_t slab_mgmt_size(size_t nr_objs, size_t align) { - return ALIGN(nr_objs * sizeof(unsigned int), align); + return ALIGN(sizeof(struct slab)+nr_objs*sizeof(kmem_bufctl_t), align); } /* @@ -586,7 +660,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * on it. For the latter case, the memory allocated for a * slab is used for: * - * - One unsigned int for each object + * - The struct slab + * - One kmem_bufctl_t for each object * - Padding to respect alignment of @align * - @buffer_size bytes for each object * @@ -599,6 +674,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, mgmt_size = 0; nr_objs = slab_size / buffer_size; + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; } else { /* * Ignore padding for the initial guess. The padding @@ -608,7 +685,8 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, * into the memory allocation when taking the padding * into account. 
*/ - nr_objs = (slab_size) / (buffer_size + sizeof(unsigned int)); + nr_objs = (slab_size - sizeof(struct slab)) / + (buffer_size + sizeof(kmem_bufctl_t)); /* * This calculated number will be either the right @@ -618,6 +696,9 @@ static void cache_estimate(unsigned long gfporder, size_t buffer_size, > slab_size) nr_objs--; + if (nr_objs > SLAB_LIMIT) + nr_objs = SLAB_LIMIT; + mgmt_size = slab_mgmt_size(nr_objs, align); } *num = nr_objs; @@ -748,8 +829,10 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } -static inline bool is_slab_pfmemalloc(struct page *page) +static inline bool is_slab_pfmemalloc(struct slab *slabp) { + struct page *page = virt_to_page(slabp->s_mem); + return PageSlabPfmemalloc(page); } @@ -758,23 +841,23 @@ static void recheck_pfmemalloc_active(struct kmem_cache *cachep, struct array_cache *ac) { struct kmem_cache_node *n = cachep->node[numa_mem_id()]; - struct page *page; + struct slab *slabp; unsigned long flags; if (!pfmemalloc_active) return; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) - if (is_slab_pfmemalloc(page)) + list_for_each_entry(slabp, &n->slabs_full, list) + if (is_slab_pfmemalloc(slabp)) goto out; - list_for_each_entry(page, &n->slabs_partial, lru) - if (is_slab_pfmemalloc(page)) + list_for_each_entry(slabp, &n->slabs_partial, list) + if (is_slab_pfmemalloc(slabp)) goto out; - list_for_each_entry(page, &n->slabs_free, lru) - if (is_slab_pfmemalloc(page)) + list_for_each_entry(slabp, &n->slabs_free, list) + if (is_slab_pfmemalloc(slabp)) goto out; pfmemalloc_active = false; @@ -814,8 +897,8 @@ static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, */ n = cachep->node[numa_mem_id()]; if (!list_empty(&n->slabs_free) && force_refill) { - struct page *page = virt_to_head_page(objp); - ClearPageSlabPfmemalloc(page); + struct slab *slabp = virt_to_slab(objp); + ClearPageSlabPfmemalloc(virt_to_head_page(slabp->s_mem)); clear_obj_pfmemalloc(&objp); recheck_pfmemalloc_active(cachep, ac); return objp; @@ -1016,7 +1099,8 @@ static void drain_alien_cache(struct kmem_cache *cachep, static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) { - int nodeid = page_to_nid(virt_to_page(objp)); + struct slab *slabp = virt_to_slab(objp); + int nodeid = slabp->nodeid; struct kmem_cache_node *n; struct array_cache *alien = NULL; int node; @@ -1027,7 +1111,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) * Make sure we are not freeing a object from another node to the array * cache on this cpu. 
*/ - if (likely(nodeid == node)) + if (likely(slabp->nodeid == node)) return 0; n = cachep->node[node]; @@ -1428,8 +1512,6 @@ void __init kmem_cache_init(void) { int i; - BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < - sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; setup_node_pointer(kmem_cache); @@ -1605,7 +1687,7 @@ static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { struct kmem_cache_node *n; - struct page *page; + struct slab *slabp; unsigned long flags; int node; @@ -1624,15 +1706,15 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) continue; spin_lock_irqsave(&n->list_lock, flags); - list_for_each_entry(page, &n->slabs_full, lru) { + list_for_each_entry(slabp, &n->slabs_full, list) { active_objs += cachep->num; active_slabs++; } - list_for_each_entry(page, &n->slabs_partial, lru) { - active_objs += page->active; + list_for_each_entry(slabp, &n->slabs_partial, list) { + active_objs += slabp->inuse; active_slabs++; } - list_for_each_entry(page, &n->slabs_free, lru) + list_for_each_entry(slabp, &n->slabs_free, list) num_slabs++; free_objects += n->free_objects; @@ -1654,11 +1736,19 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) * did not request dmaable memory, we might get it, but that * would be relatively rare and ignorable. */ -static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, - int nodeid) +static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct page *page; int nr_pages; + int i; + +#ifndef CONFIG_MMU + /* + * Nommu uses slab's for process anonymous memory allocations, and thus + * requires __GFP_COMP to properly refcount higher order allocations + */ + flags |= __GFP_COMP; +#endif flags |= cachep->allocflags; if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -1682,9 +1772,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); - __SetPageSlab(page); - if (page->pfmemalloc) - SetPageSlabPfmemalloc(page); + for (i = 0; i < nr_pages; i++) { + __SetPageSlab(page + i); + + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page + i); + } memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { @@ -1696,15 +1789,17 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, kmemcheck_mark_unallocated_pages(page, nr_pages); } - return page; + return page_address(page); } /* * Interface to system's page release. 
*/ -static void kmem_freepages(struct kmem_cache *cachep, struct page *page) +static void kmem_freepages(struct kmem_cache *cachep, void *addr) { - const unsigned long nr_freed = (1 << cachep->gfporder); + unsigned long i = (1 << cachep->gfporder); + struct page *page = virt_to_page(addr); + const unsigned long nr_freed = i; kmemcheck_free_shadow(page, cachep->gfporder); @@ -1714,28 +1809,27 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) else sub_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_freed); - - BUG_ON(!PageSlab(page)); - __ClearPageSlabPfmemalloc(page); - __ClearPageSlab(page); - page_mapcount_reset(page); - page->mapping = NULL; + while (i--) { + BUG_ON(!PageSlab(page)); + __ClearPageSlabPfmemalloc(page); + __ClearPageSlab(page); + page++; + } memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_memcg_kmem_pages(page, cachep->gfporder); + free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) { - struct kmem_cache *cachep; - struct page *page; - - page = container_of(head, struct page, rcu_head); - cachep = page->slab_cache; + struct slab_rcu *slab_rcu = (struct slab_rcu *)head; + struct kmem_cache *cachep = slab_rcu->cachep; - kmem_freepages(cachep, page); + kmem_freepages(cachep, slab_rcu->addr); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slab_rcu); } #if DEBUG @@ -1884,19 +1978,19 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) /* Print some data about the neighboring objects, if they * exist: */ - struct page *page = virt_to_head_page(objp); + struct slab *slabp = virt_to_slab(objp); unsigned int objnr; - objnr = obj_to_index(cachep, page, objp); + objnr = obj_to_index(cachep, slabp, objp); if (objnr) { - objp = index_to_obj(cachep, page, objnr - 1); + objp = index_to_obj(cachep, slabp, objnr - 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Prev obj: start=%p, len=%d\n", realobj, size); print_objinfo(cachep, objp, 2); } if (objnr + 1 < cachep->num) { - objp = index_to_obj(cachep, page, objnr + 1); + objp = index_to_obj(cachep, slabp, objnr + 1); realobj = (char *)objp + obj_offset(cachep); printk(KERN_ERR "Next obj: start=%p, len=%d\n", realobj, size); @@ -1907,12 +2001,11 @@ static void check_poison_obj(struct kmem_cache *cachep, void *objp) #endif #if DEBUG -static void slab_destroy_debugcheck(struct kmem_cache *cachep, - struct page *page) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, page, i); + void *objp = index_to_obj(cachep, slabp, i); if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC @@ -1937,8 +2030,7 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, } } #else -static void slab_destroy_debugcheck(struct kmem_cache *cachep, - struct page *page) +static void slab_destroy_debugcheck(struct kmem_cache *cachep, struct slab *slabp) { } #endif @@ -1952,34 +2044,23 @@ static void slab_destroy_debugcheck(struct kmem_cache *cachep, * Before calling the slab must have been unlinked from the cache. The * cache-lock is not held/needed. 
*/ -static void slab_destroy(struct kmem_cache *cachep, struct page *page) +static void slab_destroy(struct kmem_cache *cachep, struct slab *slabp) { - void *freelist; + void *addr = slabp->s_mem - slabp->colouroff; - freelist = page->freelist; - slab_destroy_debugcheck(cachep, page); + slab_destroy_debugcheck(cachep, slabp); if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU)) { - struct rcu_head *head; - - /* - * RCU free overloads the RCU head over the LRU. - * slab_page has been overloeaded over the LRU, - * however it is not used from now on so that - * we can use it safely. - */ - head = (void *)&page->rcu_head; - call_rcu(head, kmem_rcu_free); + struct slab_rcu *slab_rcu; + slab_rcu = (struct slab_rcu *)slabp; + slab_rcu->cachep = cachep; + slab_rcu->addr = addr; + call_rcu(&slab_rcu->head, kmem_rcu_free); } else { - kmem_freepages(cachep, page); + kmem_freepages(cachep, addr); + if (OFF_SLAB(cachep)) + kmem_cache_free(cachep->slabp_cache, slabp); } - - /* - * From now on, we don't use freelist - * although actual page can be freed in rcu context - */ - if (OFF_SLAB(cachep)) - kmem_cache_free(cachep->freelist_cache, freelist); } /** @@ -2016,8 +2097,8 @@ static size_t calculate_slab_order(struct kmem_cache *cachep, * use off-slab slabs. Needed to avoid a possible * looping condition in cache_grow(). */ - offslab_limit = size; - offslab_limit /= sizeof(unsigned int); + offslab_limit = size - sizeof(struct slab); + offslab_limit /= sizeof(kmem_bufctl_t); if (num > offslab_limit) break; @@ -2139,7 +2220,7 @@ static int __init_refok setup_cpu_cache(struct kmem_cache *cachep, gfp_t gfp) int __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) { - size_t left_over, freelist_size, ralign; + size_t left_over, slab_size, ralign; gfp_t gfp; int err; size_t size = cachep->size; @@ -2258,21 +2339,22 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (!cachep->num) return -E2BIG; - freelist_size = - ALIGN(cachep->num * sizeof(unsigned int), cachep->align); + slab_size = ALIGN(cachep->num * sizeof(kmem_bufctl_t) + + sizeof(struct slab), cachep->align); /* * If the slab has been placed off-slab, and we have enough space then * move it on-slab. This is at the expense of any extra colouring. */ - if (flags & CFLGS_OFF_SLAB && left_over >= freelist_size) { + if (flags & CFLGS_OFF_SLAB && left_over >= slab_size) { flags &= ~CFLGS_OFF_SLAB; - left_over -= freelist_size; + left_over -= slab_size; } if (flags & CFLGS_OFF_SLAB) { /* really off slab. No need for manual alignment */ - freelist_size = cachep->num * sizeof(unsigned int); + slab_size = + cachep->num * sizeof(kmem_bufctl_t) + sizeof(struct slab); #ifdef CONFIG_PAGE_POISONING /* If we're going to use the generic kernel_map_pages() @@ -2289,16 +2371,16 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) if (cachep->colour_off < cachep->align) cachep->colour_off = cachep->align; cachep->colour = left_over / cachep->colour_off; - cachep->freelist_size = freelist_size; + cachep->slab_size = slab_size; cachep->flags = flags; - cachep->allocflags = __GFP_COMP; + cachep->allocflags = 0; if (CONFIG_ZONE_DMA_FLAG && (flags & SLAB_CACHE_DMA)) cachep->allocflags |= GFP_DMA; cachep->size = size; cachep->reciprocal_buffer_size = reciprocal_value(size); if (flags & CFLGS_OFF_SLAB) { - cachep->freelist_cache = kmalloc_slab(freelist_size, 0u); + cachep->slabp_cache = kmalloc_slab(slab_size, 0u); /* * This is a possibility for one of the malloc_sizes caches. 
* But since we go off slab only for object size greater than @@ -2306,7 +2388,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags) * this should not happen at all. * But leave a BUG_ON for some lucky dude. */ - BUG_ON(ZERO_OR_NULL_PTR(cachep->freelist_cache)); + BUG_ON(ZERO_OR_NULL_PTR(cachep->slabp_cache)); } err = setup_cpu_cache(cachep, gfp); @@ -2412,7 +2494,7 @@ static int drain_freelist(struct kmem_cache *cache, { struct list_head *p; int nr_freed; - struct page *page; + struct slab *slabp; nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { @@ -2424,18 +2506,18 @@ static int drain_freelist(struct kmem_cache *cache, goto out; } - page = list_entry(p, struct page, lru); + slabp = list_entry(p, struct slab, list); #if DEBUG - BUG_ON(page->active); + BUG_ON(slabp->inuse); #endif - list_del(&page->lru); + list_del(&slabp->list); /* * Safe to drop the lock. The slab is no longer linked * to the cache. */ n->free_objects -= cache->num; spin_unlock_irq(&n->list_lock); - slab_destroy(cache, page); + slab_destroy(cache, slabp); nr_freed++; } out: @@ -2518,42 +2600,52 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep) * descriptors in kmem_cache_create, we search through the malloc_sizes array. * If we are creating a malloc_sizes cache here it would not be visible to * kmem_find_general_cachep till the initialization is complete. - * Hence we cannot have freelist_cache same as the original cache. + * Hence we cannot have slabp_cache same as the original cache. */ -static void *alloc_slabmgmt(struct kmem_cache *cachep, - struct page *page, int colour_off, - gfp_t local_flags, int nodeid) +static struct slab *alloc_slabmgmt(struct kmem_cache *cachep, void *objp, + int colour_off, gfp_t local_flags, + int nodeid) { - void *freelist; - void *addr = page_address(page); + struct slab *slabp; if (OFF_SLAB(cachep)) { /* Slab management obj is off-slab. */ - freelist = kmem_cache_alloc_node(cachep->freelist_cache, + slabp = kmem_cache_alloc_node(cachep->slabp_cache, local_flags, nodeid); - if (!freelist) + /* + * If the first object in the slab is leaked (it's allocated + * but no one has a reference to it), we want to make sure + * kmemleak does not treat the ->s_mem pointer as a reference + * to the object. Otherwise we will not report the leak. + */ + kmemleak_scan_area(&slabp->list, sizeof(struct list_head), + local_flags); + if (!slabp) return NULL; } else { - freelist = addr + colour_off; - colour_off += cachep->freelist_size; + slabp = objp + colour_off; + colour_off += cachep->slab_size; } - page->active = 0; - page->s_mem = addr + colour_off; - return freelist; + slabp->inuse = 0; + slabp->colouroff = colour_off; + slabp->s_mem = objp + colour_off; + slabp->nodeid = nodeid; + slabp->free = 0; + return slabp; } -static inline unsigned int *slab_freelist(struct page *page) +static inline kmem_bufctl_t *slab_bufctl(struct slab *slabp) { - return (unsigned int *)(page->freelist); + return (kmem_bufctl_t *) (slabp + 1); } static void cache_init_objs(struct kmem_cache *cachep, - struct page *page) + struct slab *slabp) { int i; for (i = 0; i < cachep->num; i++) { - void *objp = index_to_obj(cachep, page, i); + void *objp = index_to_obj(cachep, slabp, i); #if DEBUG /* need to poison the objs? 
*/ if (cachep->flags & SLAB_POISON) @@ -2589,8 +2681,9 @@ static void cache_init_objs(struct kmem_cache *cachep, if (cachep->ctor) cachep->ctor(objp); #endif - slab_freelist(page)[i] = i; + slab_bufctl(slabp)[i] = i + 1; } + slab_bufctl(slabp)[i - 1] = BUFCTL_END; } static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) @@ -2603,41 +2696,41 @@ static void kmem_flagcheck(struct kmem_cache *cachep, gfp_t flags) } } -static void *slab_get_obj(struct kmem_cache *cachep, struct page *page, +static void *slab_get_obj(struct kmem_cache *cachep, struct slab *slabp, int nodeid) { - void *objp; + void *objp = index_to_obj(cachep, slabp, slabp->free); + kmem_bufctl_t next; - objp = index_to_obj(cachep, page, slab_freelist(page)[page->active]); - page->active++; + slabp->inuse++; + next = slab_bufctl(slabp)[slabp->free]; #if DEBUG - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); + slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE; + WARN_ON(slabp->nodeid != nodeid); #endif + slabp->free = next; return objp; } -static void slab_put_obj(struct kmem_cache *cachep, struct page *page, +static void slab_put_obj(struct kmem_cache *cachep, struct slab *slabp, void *objp, int nodeid) { - unsigned int objnr = obj_to_index(cachep, page, objp); -#if DEBUG - unsigned int i; + unsigned int objnr = obj_to_index(cachep, slabp, objp); +#if DEBUG /* Verify that the slab belongs to the intended node */ - WARN_ON(page_to_nid(virt_to_page(objp)) != nodeid); + WARN_ON(slabp->nodeid != nodeid); - /* Verify double free bug */ - for (i = page->active; i < cachep->num; i++) { - if (slab_freelist(page)[i] == objnr) { - printk(KERN_ERR "slab: double free detected in cache " - "'%s', objp %p\n", cachep->name, objp); - BUG(); - } + if (slab_bufctl(slabp)[objnr] + 1 <= SLAB_LIMIT + 1) { + printk(KERN_ERR "slab: double free detected in cache " + "'%s', objp %p\n", cachep->name, objp); + BUG(); } #endif - page->active--; - slab_freelist(page)[page->active] = objnr; + slab_bufctl(slabp)[objnr] = slabp->free; + slabp->free = objnr; + slabp->inuse--; } /* @@ -2645,11 +2738,23 @@ static void slab_put_obj(struct kmem_cache *cachep, struct page *page, * for the slab allocator to be able to lookup the cache and slab of a * virtual address for kfree, ksize, and slab debugging. */ -static void slab_map_pages(struct kmem_cache *cache, struct page *page, - void *freelist) +static void slab_map_pages(struct kmem_cache *cache, struct slab *slab, + void *addr) { - page->slab_cache = cache; - page->freelist = freelist; + int nr_pages; + struct page *page; + + page = virt_to_page(addr); + + nr_pages = 1; + if (likely(!PageCompound(page))) + nr_pages <<= cache->gfporder; + + do { + page->slab_cache = cache; + page->slab_page = slab; + page++; + } while (--nr_pages); } /* @@ -2657,9 +2762,9 @@ static void slab_map_pages(struct kmem_cache *cache, struct page *page, * kmem_cache_alloc() when there are no active objs left in a cache. */ static int cache_grow(struct kmem_cache *cachep, - gfp_t flags, int nodeid, struct page *page) + gfp_t flags, int nodeid, void *objp) { - void *freelist; + struct slab *slabp; size_t offset; gfp_t local_flags; struct kmem_cache_node *n; @@ -2700,20 +2805,20 @@ static int cache_grow(struct kmem_cache *cachep, * Get mem for the objs. Attempt to allocate a physical page from * 'nodeid'. */ - if (!page) - page = kmem_getpages(cachep, local_flags, nodeid); - if (!page) + if (!objp) + objp = kmem_getpages(cachep, local_flags, nodeid); + if (!objp) goto failed; /* Get slab management. 
*/ - freelist = alloc_slabmgmt(cachep, page, offset, + slabp = alloc_slabmgmt(cachep, objp, offset, local_flags & ~GFP_CONSTRAINT_MASK, nodeid); - if (!freelist) + if (!slabp) goto opps1; - slab_map_pages(cachep, page, freelist); + slab_map_pages(cachep, slabp, objp); - cache_init_objs(cachep, page); + cache_init_objs(cachep, slabp); if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2721,13 +2826,13 @@ static int cache_grow(struct kmem_cache *cachep, spin_lock(&n->list_lock); /* Make slab active. */ - list_add_tail(&page->lru, &(n->slabs_free)); + list_add_tail(&slabp->list, &(n->slabs_free)); STATS_INC_GROWN(cachep); n->free_objects += cachep->num; spin_unlock(&n->list_lock); return 1; opps1: - kmem_freepages(cachep, page); + kmem_freepages(cachep, objp); failed: if (local_flags & __GFP_WAIT) local_irq_disable(); @@ -2775,8 +2880,9 @@ static inline void verify_redzone_free(struct kmem_cache *cache, void *obj) static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, unsigned long caller) { - unsigned int objnr; struct page *page; + unsigned int objnr; + struct slab *slabp; BUG_ON(virt_to_cache(objp) != cachep); @@ -2784,6 +2890,8 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, kfree_debugcheck(objp); page = virt_to_head_page(objp); + slabp = page->slab_page; + if (cachep->flags & SLAB_RED_ZONE) { verify_redzone_free(cachep, objp); *dbg_redzone1(cachep, objp) = RED_INACTIVE; @@ -2792,11 +2900,14 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, if (cachep->flags & SLAB_STORE_USER) *dbg_userword(cachep, objp) = (void *)caller; - objnr = obj_to_index(cachep, page, objp); + objnr = obj_to_index(cachep, slabp, objp); BUG_ON(objnr >= cachep->num); - BUG_ON(objp != index_to_obj(cachep, page, objnr)); + BUG_ON(objp != index_to_obj(cachep, slabp, objnr)); +#ifdef CONFIG_DEBUG_SLAB_LEAK + slab_bufctl(slabp)[objnr] = BUFCTL_FREE; +#endif if (cachep->flags & SLAB_POISON) { #ifdef CONFIG_DEBUG_PAGEALLOC if ((cachep->size % PAGE_SIZE)==0 && OFF_SLAB(cachep)) { @@ -2813,9 +2924,33 @@ static void *cache_free_debugcheck(struct kmem_cache *cachep, void *objp, return objp; } +static void check_slabp(struct kmem_cache *cachep, struct slab *slabp) +{ + kmem_bufctl_t i; + int entries = 0; + + /* Check slab's freelist to see if this obj is there. */ + for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) { + entries++; + if (entries > cachep->num || i >= cachep->num) + goto bad; + } + if (entries != cachep->num - slabp->inuse) { +bad: + printk(KERN_ERR "slab: Internal list corruption detected in " + "cache '%s'(%d), slabp %p(%d). Tainted(%s). Hexdump:\n", + cachep->name, cachep->num, slabp, slabp->inuse, + print_tainted()); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, slabp, + sizeof(*slabp) + cachep->num * sizeof(kmem_bufctl_t), + 1); + BUG(); + } +} #else #define kfree_debugcheck(x) do { } while(0) #define cache_free_debugcheck(x,objp,z) (objp) +#define check_slabp(x,y) do { } while(0) #endif static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, @@ -2854,7 +2989,7 @@ retry: while (batchcount > 0) { struct list_head *entry; - struct page *page; + struct slab *slabp; /* Get slab alloc is to come from. 
*/ entry = n->slabs_partial.next; if (entry == &n->slabs_partial) { @@ -2864,7 +2999,8 @@ retry: goto must_grow; } - page = list_entry(entry, struct page, lru); + slabp = list_entry(entry, struct slab, list); + check_slabp(cachep, slabp); check_spinlock_acquired(cachep); /* @@ -2872,23 +3008,24 @@ retry: * there must be at least one object available for * allocation. */ - BUG_ON(page->active >= cachep->num); + BUG_ON(slabp->inuse >= cachep->num); - while (page->active < cachep->num && batchcount--) { + while (slabp->inuse < cachep->num && batchcount--) { STATS_INC_ALLOCED(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac_put_obj(cachep, ac, slab_get_obj(cachep, page, + ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, node)); } + check_slabp(cachep, slabp); /* move slabp to correct slabp list: */ - list_del(&page->lru); - if (page->active == cachep->num) - list_add(&page->list, &n->slabs_full); + list_del(&slabp->list); + if (slabp->free == BUFCTL_END) + list_add(&slabp->list, &n->slabs_full); else - list_add(&page->list, &n->slabs_partial); + list_add(&slabp->list, &n->slabs_partial); } must_grow: @@ -2960,6 +3097,16 @@ static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, *dbg_redzone1(cachep, objp) = RED_ACTIVE; *dbg_redzone2(cachep, objp) = RED_ACTIVE; } +#ifdef CONFIG_DEBUG_SLAB_LEAK + { + struct slab *slabp; + unsigned objnr; + + slabp = virt_to_head_page(objp)->slab_page; + objnr = (unsigned)(objp - slabp->s_mem) / cachep->size; + slab_bufctl(slabp)[objnr] = BUFCTL_ACTIVE; + } +#endif objp += obj_offset(cachep); if (cachep->ctor && cachep->flags & SLAB_POISON) cachep->ctor(objp); @@ -3101,20 +3248,18 @@ retry: * We may trigger various forms of reclaim on the allowed * set and go into memory reserves if necessary. 
*/ - struct page *page; - if (local_flags & __GFP_WAIT) local_irq_enable(); kmem_flagcheck(cache, flags); - page = kmem_getpages(cache, local_flags, numa_mem_id()); + obj = kmem_getpages(cache, local_flags, numa_mem_id()); if (local_flags & __GFP_WAIT) local_irq_disable(); - if (page) { + if (obj) { /* * Insert into the appropriate per node queues */ - nid = page_to_nid(page); - if (cache_grow(cache, flags, nid, page)) { + nid = page_to_nid(virt_to_page(obj)); + if (cache_grow(cache, flags, nid, obj)) { obj = ____cache_alloc_node(cache, flags | GFP_THISNODE, nid); if (!obj) @@ -3143,7 +3288,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { struct list_head *entry; - struct page *page; + struct slab *slabp; struct kmem_cache_node *n; void *obj; int x; @@ -3163,24 +3308,26 @@ retry: goto must_grow; } - page = list_entry(entry, struct page, lru); + slabp = list_entry(entry, struct slab, list); check_spinlock_acquired_node(cachep, nodeid); + check_slabp(cachep, slabp); STATS_INC_NODEALLOCS(cachep); STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - BUG_ON(page->active == cachep->num); + BUG_ON(slabp->inuse == cachep->num); - obj = slab_get_obj(cachep, page, nodeid); + obj = slab_get_obj(cachep, slabp, nodeid); + check_slabp(cachep, slabp); n->free_objects--; /* move slabp to correct slabp list: */ - list_del(&page->lru); + list_del(&slabp->list); - if (page->active == cachep->num) - list_add(&page->lru, &n->slabs_full); + if (slabp->free == BUFCTL_END) + list_add(&slabp->list, &n->slabs_full); else - list_add(&page->lru, &n->slabs_partial); + list_add(&slabp->list, &n->slabs_partial); spin_unlock(&n->list_lock); goto done; @@ -3330,21 +3477,23 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, for (i = 0; i < nr_objects; i++) { void *objp; - struct page *page; + struct slab *slabp; clear_obj_pfmemalloc(&objpp[i]); objp = objpp[i]; - page = virt_to_head_page(objp); + slabp = virt_to_slab(objp); n = cachep->node[node]; - list_del(&page->lru); + list_del(&slabp->list); check_spinlock_acquired_node(cachep, node); - slab_put_obj(cachep, page, objp, node); + check_slabp(cachep, slabp); + slab_put_obj(cachep, slabp, objp, node); STATS_DEC_ACTIVE(cachep); n->free_objects++; + check_slabp(cachep, slabp); /* fixup slab chains */ - if (page->active == 0) { + if (slabp->inuse == 0) { if (n->free_objects > n->free_limit) { n->free_objects -= cachep->num; /* No need to drop any previously held @@ -3353,16 +3502,16 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, * a different cache, refer to comments before * alloc_slabmgmt. */ - slab_destroy(cachep, page); + slab_destroy(cachep, slabp); } else { - list_add(&page->lru, &n->slabs_free); + list_add(&slabp->list, &n->slabs_free); } } else { /* Unconditionally move a slab to the end of the * partial list on free - maximum time for the * other objects to be freed, too. 
*/ - list_add_tail(&page->lru, &n->slabs_partial); + list_add_tail(&slabp->list, &n->slabs_partial); } } } @@ -3402,10 +3551,10 @@ free_done: p = n->slabs_free.next; while (p != &(n->slabs_free)) { - struct page *page; + struct slab *slabp; - page = list_entry(p, struct page, lru); - BUG_ON(page->active); + slabp = list_entry(p, struct slab, list); + BUG_ON(slabp->inuse); i++; p = p->next; @@ -3833,7 +3982,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, VM_BUG_ON(!mutex_is_locked(&slab_mutex)); for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(cachep, i); + c = cache_from_memcg(cachep, i); if (c) /* return value determined by the parent cache only */ __do_tune_cpucache(c, limit, batchcount, shared, gfp); @@ -4009,7 +4158,7 @@ out: #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) { - struct page *page; + struct slab *slabp; unsigned long active_objs; unsigned long num_objs; unsigned long active_slabs = 0; @@ -4029,23 +4178,23 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(page, &n->slabs_full, lru) { - if (page->active != cachep->num && !error) + list_for_each_entry(slabp, &n->slabs_full, list) { + if (slabp->inuse != cachep->num && !error) error = "slabs_full accounting error"; active_objs += cachep->num; active_slabs++; } - list_for_each_entry(page, &n->slabs_partial, lru) { - if (page->active == cachep->num && !error) - error = "slabs_partial accounting error"; - if (!page->active && !error) - error = "slabs_partial accounting error"; - active_objs += page->active; + list_for_each_entry(slabp, &n->slabs_partial, list) { + if (slabp->inuse == cachep->num && !error) + error = "slabs_partial inuse accounting error"; + if (!slabp->inuse && !error) + error = "slabs_partial/inuse accounting error"; + active_objs += slabp->inuse; active_slabs++; } - list_for_each_entry(page, &n->slabs_free, lru) { - if (page->active && !error) - error = "slabs_free accounting error"; + list_for_each_entry(slabp, &n->slabs_free, list) { + if (slabp->inuse && !error) + error = "slabs_free/inuse accounting error"; num_slabs++; } free_objects += n->free_objects; @@ -4197,27 +4346,15 @@ static inline int add_caller(unsigned long *n, unsigned long v) return 1; } -static void handle_slab(unsigned long *n, struct kmem_cache *c, - struct page *page) +static void handle_slab(unsigned long *n, struct kmem_cache *c, struct slab *s) { void *p; - int i, j; - + int i; if (n[0] == n[1]) return; - for (i = 0, p = page->s_mem; i < c->num; i++, p += c->size) { - bool active = true; - - for (j = page->active; j < c->num; j++) { - /* Skip freed item */ - if (slab_freelist(page)[j] == i) { - active = false; - break; - } - } - if (!active) + for (i = 0, p = s->s_mem; i < c->num; i++, p += c->size) { + if (slab_bufctl(s)[i] != BUFCTL_ACTIVE) continue; - if (!add_caller(n, (unsigned long)*dbg_userword(c, p))) return; } @@ -4242,7 +4379,7 @@ static void show_symbol(struct seq_file *m, unsigned long address) static int leaks_show(struct seq_file *m, void *p) { struct kmem_cache *cachep = list_entry(p, struct kmem_cache, list); - struct page *page; + struct slab *slabp; struct kmem_cache_node *n; const char *name; unsigned long *x = m->private; @@ -4266,10 +4403,10 @@ static int leaks_show(struct seq_file *m, void *p) check_irq_on(); spin_lock_irq(&n->list_lock); - list_for_each_entry(page, &n->slabs_full, lru) - handle_slab(x, cachep, page); - list_for_each_entry(page, 
&n->slabs_partial, lru) - handle_slab(x, cachep, page); + list_for_each_entry(slabp, &n->slabs_full, list) + handle_slab(x, cachep, slabp); + list_for_each_entry(slabp, &n->slabs_partial, list) + handle_slab(x, cachep, slabp); spin_unlock_irq(&n->list_lock); } name = cachep->name; @@ -160,8 +160,7 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -static inline struct kmem_cache * -cache_from_memcg_idx(struct kmem_cache *s, int idx) +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) { if (!s->memcg_params) return NULL; @@ -205,8 +204,7 @@ static inline const char *cache_name(struct kmem_cache *s) return s->name; } -static inline struct kmem_cache * -cache_from_memcg_idx(struct kmem_cache *s, int idx) +static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx) { return NULL; } diff --git a/mm/slab_common.c b/mm/slab_common.c index 0b7bb39..e2e98af 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -571,7 +571,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info) return; for_each_memcg_cache_index(i) { - c = cache_from_memcg_idx(s, i); + c = cache_from_memcg(s, i); if (!c) continue; @@ -155,7 +155,7 @@ static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) /* * Maximum number of desirable partial slabs. * The existence of more partial slabs makes kmem_cache_shrink - * sort the partial list by the number of objects in use. + * sort the partial list by the number of objects in the. */ #define MAX_PARTIAL 10 @@ -933,16 +933,6 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. */ -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - kmemleak_alloc(ptr, size, 1, flags); -} - -static inline void kfree_hook(const void *x) -{ - kmemleak_free(x); -} - static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { flags &= gfp_allowed_mask; @@ -965,7 +955,7 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) kmemleak_free_recursive(x, s->flags); /* - * Trouble is that we may no longer disable interrupts in the fast path + * Trouble is that we may no longer disable interupts in the fast path * So in order to make the debug calls that expect irqs to be * disabled we need to disable interrupts temporarily. */ @@ -1227,8 +1217,8 @@ static unsigned long kmem_cache_flags(unsigned long object_size, /* * Enable debugging if selected on the kernel commandline. 
*/ - if (slub_debug && (!slub_debug_slabs || (name && - !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs))))) + if (slub_debug && (!slub_debug_slabs || + !strncmp(slub_debug_slabs, name, strlen(slub_debug_slabs)))) flags |= slub_debug; return flags; @@ -1270,30 +1260,13 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects) {} -static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) -{ - kmemleak_alloc(ptr, size, 1, flags); -} - -static inline void kfree_hook(const void *x) -{ - kmemleak_free(x); -} - static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags) { return 0; } static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, - void *object) -{ - kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, - flags & gfp_allowed_mask); -} + void *object) {} -static inline void slab_free_hook(struct kmem_cache *s, void *x) -{ - kmemleak_free_recursive(x, s->flags); -} +static inline void slab_free_hook(struct kmem_cache *s, void *x) {} #endif /* CONFIG_SLUB_DEBUG */ @@ -2856,8 +2829,8 @@ static struct kmem_cache *kmem_cache_node; * slab on the node for this slabcache. There are no concurrent accesses * possible. * - * Note that this function only works on the kmem_cache_node - * when allocating for the kmem_cache_node. This is used for bootstrapping + * Note that this function only works on the kmalloc_node_cache + * when allocating for the kmalloc_node_cache. This is used for bootstrapping * memory on a fresh node that has no slab structures yet. */ static void early_kmem_cache_node_alloc(int node) @@ -3299,7 +3272,7 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) if (page) ptr = page_address(page); - kmalloc_large_node_hook(ptr, size, flags); + kmemleak_alloc(ptr, size, 1, flags); return ptr; } @@ -3363,7 +3336,7 @@ void kfree(const void *x) page = virt_to_head_page(x); if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); - kfree_hook(x); + kmemleak_free(x); __free_memcg_kmem_pages(page, compound_order(page)); return; } @@ -5010,7 +4983,7 @@ static ssize_t slab_attr_store(struct kobject *kobj, * through the descendants with best-effort propagation. */ for_each_memcg_cache_index(i) { - struct kmem_cache *c = cache_from_memcg_idx(s, i); + struct kmem_cache *c = cache_from_memcg(s, i); if (c) attribute->store(c, buf, len); } diff --git a/mm/sparse.c b/mm/sparse.c index 8cc7be0..4ac1d7e 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -590,32 +590,33 @@ void __init sparse_init(void) #ifdef CONFIG_MEMORY_HOTPLUG #ifdef CONFIG_SPARSEMEM_VMEMMAP -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, + unsigned long nr_pages) { /* This will make the necessary allocations eventually. 
*/ return sparse_mem_map_populate(pnum, nid); } -static void __kfree_section_memmap(struct page *memmap) +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long end = (unsigned long)(memmap + nr_pages); vmemmap_free(start, end); } #ifdef CONFIG_MEMORY_HOTREMOVE -static void free_map_bootmem(struct page *memmap) +static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { unsigned long start = (unsigned long)memmap; - unsigned long end = (unsigned long)(memmap + PAGES_PER_SECTION); + unsigned long end = (unsigned long)(memmap + nr_pages); vmemmap_free(start, end); } #endif /* CONFIG_MEMORY_HOTREMOVE */ #else -static struct page *__kmalloc_section_memmap(void) +static struct page *__kmalloc_section_memmap(unsigned long nr_pages) { struct page *page, *ret; - unsigned long memmap_size = sizeof(struct page) * PAGES_PER_SECTION; + unsigned long memmap_size = sizeof(struct page) * nr_pages; page = alloc_pages(GFP_KERNEL|__GFP_NOWARN, get_order(memmap_size)); if (page) @@ -633,30 +634,28 @@ got_map_ptr: return ret; } -static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid) +static inline struct page *kmalloc_section_memmap(unsigned long pnum, int nid, + unsigned long nr_pages) { - return __kmalloc_section_memmap(); + return __kmalloc_section_memmap(nr_pages); } -static void __kfree_section_memmap(struct page *memmap) +static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) { if (is_vmalloc_addr(memmap)) vfree(memmap); else free_pages((unsigned long)memmap, - get_order(sizeof(struct page) * PAGES_PER_SECTION)); + get_order(sizeof(struct page) * nr_pages)); } #ifdef CONFIG_MEMORY_HOTREMOVE -static void free_map_bootmem(struct page *memmap) +static void free_map_bootmem(struct page *memmap, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - unsigned long magic, nr_pages; + unsigned long magic; struct page *page = virt_to_page(memmap); - nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) - >> PAGE_SHIFT; - for (i = 0; i < nr_pages; i++, page++) { magic = (unsigned long) page->lru.next; @@ -685,7 +684,8 @@ static void free_map_bootmem(struct page *memmap) * set. If this is <=0, then that means that the passed-in * map was not consumed and must be freed. 
*/ -int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) +int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, + int nr_pages) { unsigned long section_nr = pfn_to_section_nr(start_pfn); struct pglist_data *pgdat = zone->zone_pgdat; @@ -702,12 +702,12 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) ret = sparse_index_init(section_nr, pgdat->node_id); if (ret < 0 && ret != -EEXIST) return ret; - memmap = kmalloc_section_memmap(section_nr, pgdat->node_id); + memmap = kmalloc_section_memmap(section_nr, pgdat->node_id, nr_pages); if (!memmap) return -ENOMEM; usemap = __kmalloc_section_usemap(); if (!usemap) { - __kfree_section_memmap(memmap); + __kfree_section_memmap(memmap, nr_pages); return -ENOMEM; } @@ -719,7 +719,7 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn) goto out; } - memset(memmap, 0, sizeof(struct page) * PAGES_PER_SECTION); + memset(memmap, 0, sizeof(struct page) * nr_pages); ms->section_mem_map |= SECTION_MARKED_PRESENT; @@ -729,7 +729,7 @@ out: pgdat_resize_unlock(pgdat, &flags); if (ret <= 0) { kfree(usemap); - __kfree_section_memmap(memmap); + __kfree_section_memmap(memmap, nr_pages); } return ret; } @@ -759,6 +759,7 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) static void free_section_usemap(struct page *memmap, unsigned long *usemap) { struct page *usemap_page; + unsigned long nr_pages; if (!usemap) return; @@ -770,7 +771,7 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) if (PageSlab(usemap_page) || PageCompound(usemap_page)) { kfree(usemap); if (memmap) - __kfree_section_memmap(memmap); + __kfree_section_memmap(memmap, PAGES_PER_SECTION); return; } @@ -779,8 +780,12 @@ static void free_section_usemap(struct page *memmap, unsigned long *usemap) * on the section which has pgdat at boot time. Just keep it as is now. */ - if (memmap) - free_map_bootmem(memmap); + if (memmap) { + nr_pages = PAGE_ALIGN(PAGES_PER_SECTION * sizeof(struct page)) + >> PAGE_SHIFT; + + free_map_bootmem(memmap, nr_pages); + } } void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) @@ -82,6 +82,19 @@ static void __put_compound_page(struct page *page) static void put_compound_page(struct page *page) { + /* + * hugetlbfs pages cannot be split from under us. If this is a + * hugetlbfs page, check refcount on head page and release the page if + * the refcount becomes zero. + */ + if (PageHuge(page)) { + page = compound_head(page); + if (put_page_testzero(page)) + __put_compound_page(page); + + return; + } + if (unlikely(PageTail(page))) { /* __split_huge_page_refcount can run under us */ struct page *page_head = compound_trans_head(page); @@ -98,31 +111,14 @@ static void put_compound_page(struct page *page) * still hot on arches that do not support * this_cpu_cmpxchg_double(). */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); - atomic_dec(&page->_mapcount); + if (PageSlab(page_head)) { + if (PageTail(page)) { if (put_page_testzero(page_head)) VM_BUG_ON(1); - if (put_page_testzero(page_head)) - __put_compound_page(page_head); - return; + + atomic_dec(&page->_mapcount); + goto skip_lock_tail; } else - /* - * __split_huge_page_refcount - * run before us, "page" was a - * THP tail. 
The split - * page_head has been freed - * and reallocated as slab or - * hugetlbfs page of smaller - * order (only possible if - * reallocated as slab on - * x86). - */ goto skip_lock; } /* @@ -136,27 +132,8 @@ static void put_compound_page(struct page *page) /* __split_huge_page_refcount run before us */ compound_unlock_irqrestore(page_head, flags); skip_lock: - if (put_page_testzero(page_head)) { - /* - * The head page may have been - * freed and reallocated as a - * compound page of smaller - * order and then freed again. - * All we know is that it - * cannot have become: a THP - * page, a compound page of - * higher order, a tail page. - * That is because we still - * hold the refcount of the - * split THP tail and - * page_head was the THP head - * before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } + if (put_page_testzero(page_head)) + __put_single_page(page_head); out_put_single: if (put_page_testzero(page)) __put_single_page(page); @@ -178,6 +155,7 @@ out_put_single: VM_BUG_ON(atomic_read(&page->_count) != 0); compound_unlock_irqrestore(page_head, flags); +skip_lock_tail: if (put_page_testzero(page_head)) { if (PageHead(page_head)) __put_compound_page(page_head); @@ -220,52 +198,51 @@ bool __get_page_tail(struct page *page) * proper PT lock that already serializes against * split_huge_page(). */ - unsigned long flags; bool got = false; - struct page *page_head = compound_trans_head(page); + struct page *page_head; + + /* + * If this is a hugetlbfs page it cannot be split under us. Simply + * increment refcount for the head page. + */ + if (PageHuge(page)) { + page_head = compound_head(page); + atomic_inc(&page_head->_count); + got = true; + } else { + unsigned long flags; - if (likely(page != page_head && get_page_unless_zero(page_head))) { - /* Ref to put_compound_page() comment. */ - if (PageSlab(page_head) || PageHeadHuge(page_head)) { + page_head = compound_trans_head(page); + if (likely(page != page_head && + get_page_unless_zero(page_head))) { + + /* Ref to put_compound_page() comment. */ + if (PageSlab(page_head)) { + if (likely(PageTail(page))) { + __get_page_tail_foll(page, false); + return true; + } else { + put_page(page_head); + return false; + } + } + + /* + * page_head wasn't a dangling pointer but it + * may not be a head page anymore by the time + * we obtain the lock. That is ok as long as it + * can't be freed from under us. + */ + flags = compound_lock_irqsave(page_head); + /* here __split_huge_page_refcount won't run anymore */ if (likely(PageTail(page))) { - /* - * This is a hugetlbfs page or a slab - * page. __split_huge_page_refcount - * cannot race here. - */ - VM_BUG_ON(!PageHead(page_head)); __get_page_tail_foll(page, false); - return true; - } else { - /* - * __split_huge_page_refcount run - * before us, "page" was a THP - * tail. The split page_head has been - * freed and reallocated as slab or - * hugetlbfs page of smaller order - * (only possible if reallocated as - * slab on x86). - */ - put_page(page_head); - return false; + got = true; } + compound_unlock_irqrestore(page_head, flags); + if (unlikely(!got)) + put_page(page_head); } - - /* - * page_head wasn't a dangling pointer but it - * may not be a head page anymore by the time - * we obtain the lock. That is ok as long as it - * can't be freed from under us. 
- */ - flags = compound_lock_irqsave(page_head); - /* here __split_huge_page_refcount won't run anymore */ - if (likely(PageTail(page))) { - __get_page_tail_foll(page, false); - got = true; - } - compound_unlock_irqrestore(page_head, flags); - if (unlikely(!got)) - put_page(page_head); } return got; } @@ -957,8 +934,7 @@ void __init swap_setup(void) #ifdef CONFIG_SWAP int i; - if (bdi_init(swapper_spaces[0].backing_dev_info)) - panic("Failed to init swap bdi"); + bdi_init(swapper_spaces[0].backing_dev_info); for (i = 0; i < MAX_SWAPFILES; i++) { spin_lock_init(&swapper_spaces[i].tree_lock); INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear); diff --git a/mm/swapfile.c b/mm/swapfile.c index 612a7c9..de7c904 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -707,7 +707,7 @@ noswap: return (swp_entry_t) {0}; } -/* The only caller of this function is now suspend routine */ +/* The only caller of this function is now susupend routine */ swp_entry_t get_swap_page_of_type(int type) { struct swap_info_struct *si; @@ -845,7 +845,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, } /* - * Caller has made sure that the swap device corresponding to entry + * Caller has made sure that the swapdevice corresponding to entry * is still around or has not been recycled. */ void swap_free(swp_entry_t entry) @@ -947,7 +947,7 @@ int try_to_free_swap(struct page *page) * original page might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * - * Hibernation suspends storage while it is writing the image + * Hibration suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) @@ -1179,7 +1179,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * some architectures (e.g. x86_32 with PAE) we might catch a glimpse * of unmatched parts which look like swp_pte, so unuse_pte must * recheck under pte lock. Scanning without pte lock lets it be - * preemptable whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. + * preemptible whenever CONFIG_PREEMPT but not CONFIG_HIGHPTE. */ pte = pte_offset_map(pmd, addr); do { @@ -1924,17 +1924,17 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->cluster_info = NULL; p->flags = 0; frontswap_map = frontswap_map_get(p); + frontswap_map_set(p, NULL); spin_unlock(&p->lock); spin_unlock(&swap_lock); frontswap_invalidate_area(type); - frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); p->percpu_cluster = NULL; vfree(swap_map); vfree(cluster_info); vfree(frontswap_map); - /* Destroy swap account information */ + /* Destroy swap account informatin */ swap_cgroup_swapoff(type); inode = mapping->host; @@ -2786,8 +2786,8 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask) /* * We are fortunate that although vmalloc_to_page uses pte_offset_map, - * no architecture is using highmem pages for kernel page tables: so it - * will not corrupt the GFP_ATOMIC caller's atomic page table kmaps. + * no architecture is using highmem pages for kernel pagetables: so it + * will not corrupt the GFP_ATOMIC caller's atomic pagetable kmaps. 
*/ head = vmalloc_to_page(si->swap_map + offset); offset &= ~PAGE_MASK; @@ -7,9 +7,6 @@ #include <linux/security.h> #include <linux/swap.h> #include <linux/swapops.h> -#include <linux/mman.h> -#include <linux/hugetlb.h> - #include <asm/uaccess.h> #include "internal.h" @@ -401,16 +398,6 @@ struct address_space *page_mapping(struct page *page) return mapping; } -/* - * Committed memory limit enforced when OVERCOMMIT_NEVER policy is used - */ -unsigned long vm_commit_limit(void) -{ - return ((totalram_pages - hugetlb_total_pages()) - * sysctl_overcommit_ratio / 100) + total_swap_pages; -} - - /* Tracepoints definitions. */ EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 0fdf968..1074543 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -359,12 +359,6 @@ static struct vmap_area *alloc_vmap_area(unsigned long size, if (unlikely(!va)) return ERR_PTR(-ENOMEM); - /* - * Only scan the relevant parts containing pointers to other objects - * to avoid false negatives. - */ - kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask & GFP_RECLAIM_MASK); - retry: spin_lock(&vmap_area_lock); /* @@ -1552,7 +1546,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask, pgprot_t prot, int node, const void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot, int node) + pgprot_t prot, int node, const void *caller) { const int order = 0; struct page **pages; @@ -1566,12 +1560,13 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM, - PAGE_KERNEL, node, area->caller); + PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { pages = kmalloc_node(array_size, nested_gfp, node); } area->pages = pages; + area->caller = caller; if (!area->pages) { remove_vm_area(area->addr); kfree(area); @@ -1582,7 +1577,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, struct page *page; gfp_t tmp_mask = gfp_mask | __GFP_NOWARN; - if (node == NUMA_NO_NODE) + if (node < 0) page = alloc_page(tmp_mask); else page = alloc_pages_node(node, tmp_mask, order); @@ -1639,9 +1634,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!area) goto fail; - addr = __vmalloc_area_node(area, gfp_mask, prot, node); + addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); if (!addr) - return NULL; + goto fail; /* * In this function, newly allocated vm_struct has VM_UNINITIALIZED @@ -1651,11 +1646,11 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, clear_vm_uninitialized_flag(area); /* - * A ref_count = 2 is needed because vm_struct allocated in - * __get_vm_area_node() contains a reference to the virtual address of - * the vmalloc'ed block. + * A ref_count = 3 is needed because the vm_struct and vmap_area + * structures allocated in the __get_vm_area_node() function contain + * references to the virtual address of the vmalloc'ed block. 
 */
- kmemleak_alloc(addr, real_size, 2, gfp_mask);
+ kmemleak_alloc(addr, real_size, 3, gfp_mask);
 return addr;
@@ -2568,11 +2563,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
 if (!counters)
 return;
- /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
- smp_rmb();
- if (v->flags & VM_UNINITIALIZED)
- return;
-
 memset(counters, 0, nr_node_ids * sizeof(unsigned int));
 for (nr = 0; nr < v->nr_pages; nr++)
@@ -2589,15 +2579,23 @@ static int s_show(struct seq_file *m, void *p)
 struct vmap_area *va = p;
 struct vm_struct *v;
- /*
- * s_show can encounter race with remove_vm_area, !VM_VM_AREA on
- * behalf of vmap area is being tear down or vm_map_ram allocation.
- */
- if (!(va->flags & VM_VM_AREA))
+ if (va->flags & (VM_LAZY_FREE | VM_LAZY_FREEING))
 return 0;
+ if (!(va->flags & VM_VM_AREA)) {
+ seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
+ (void *)va->va_start, (void *)va->va_end,
+ va->va_end - va->va_start);
+ return 0;
+ }
+
 v = va->vm;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
+ if (v->flags & VM_UNINITIALIZED)
+ return 0;
+
 seq_printf(m, "0x%pK-0x%pK %7ld",
 v->addr, v->addr + v->size, v->size);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 7249614..9bb3145 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -812,7 +812,6 @@ const char * const vmstat_text[] = {
 #ifdef CONFIG_NUMA_BALANCING
 "numa_pte_updates",
- "numa_huge_pte_updates",
 "numa_hint_faults",
 "numa_hint_faults_local",
 "numa_pages_migrated",
@@ -1230,20 +1229,6 @@ static void start_cpu_timer(int cpu)
 schedule_delayed_work_on(cpu, work, __round_jiffies_relative(HZ, cpu));
 }
-static void vmstat_cpu_dead(int node)
-{
- int cpu;
-
- get_online_cpus();
- for_each_online_cpu(cpu)
- if (cpu_to_node(cpu) == node)
- goto end;
-
- node_clear_state(node, N_CPU);
-end:
- put_online_cpus();
-}
-
 /*
 * Use the cpu notifier to insure that the thresholds are recalculated
 * when necessary.
@@ -1273,7 +1258,6 @@ static int vmstat_cpuup_callback(struct notifier_block *nfb,
 case CPU_DEAD:
 case CPU_DEAD_FROZEN:
 refresh_zone_stat_thresholds();
- vmstat_cpu_dead(cpu_to_node(cpu));
 break;
 default:
 break;
@@ -1292,12 +1276,8 @@ static int __init setup_vmstat(void)
 register_cpu_notifier(&vmstat_notifier);
- get_online_cpus();
- for_each_online_cpu(cpu) {
+ for_each_online_cpu(cpu)
 start_cpu_timer(cpu);
- node_set_state(cpu_to_node(cpu), N_CPU);
- }
- put_online_cpus();
 #endif
 #ifdef CONFIG_PROC_FS
 proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
@@ -217,7 +217,6 @@ static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
 if (!entry)
 return NULL;
 entry->refcount = 1;
- RB_CLEAR_NODE(&entry->rbnode);
 return entry;
 }
@@ -226,6 +225,19 @@ static void zswap_entry_cache_free(struct zswap_entry *entry)
 kmem_cache_free(zswap_entry_cache, entry);
 }
+/* caller must hold the tree lock */
+static void zswap_entry_get(struct zswap_entry *entry)
+{
+ entry->refcount++;
+}
+
+/* caller must hold the tree lock */
+static int zswap_entry_put(struct zswap_entry *entry)
+{
+ entry->refcount--;
+ return entry->refcount;
+}
+
 /*********************************
 * rbtree functions
 **********************************/
@@ -273,61 +285,6 @@ static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
 return 0;
 }
-static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
-{
- if (!RB_EMPTY_NODE(&entry->rbnode)) {
- rb_erase(&entry->rbnode, root);
- RB_CLEAR_NODE(&entry->rbnode);
- }
-}
-
-/*
- * Carries out the common pattern of freeing and entry's zsmalloc allocation,
- * freeing the entry itself, and decrementing the number of stored pages.
- */
-static void zswap_free_entry(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- zbud_free(tree->pool, entry->handle);
- zswap_entry_cache_free(entry);
- atomic_dec(&zswap_stored_pages);
- zswap_pool_pages = zbud_get_pool_size(tree->pool);
-}
-
-/* caller must hold the tree lock */
-static void zswap_entry_get(struct zswap_entry *entry)
-{
- entry->refcount++;
-}
-
-/* caller must hold the tree lock
-* remove from the tree and free it, if nobody reference the entry
-*/
-static void zswap_entry_put(struct zswap_tree *tree,
- struct zswap_entry *entry)
-{
- int refcount = --entry->refcount;
-
- BUG_ON(refcount < 0);
- if (refcount == 0) {
- zswap_rb_erase(&tree->rbroot, entry);
- zswap_free_entry(tree, entry);
- }
-}
-
-/* caller must hold the tree lock */
-static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
- pgoff_t offset)
-{
- struct zswap_entry *entry = NULL;
-
- entry = zswap_rb_search(root, offset);
- if (entry)
- zswap_entry_get(entry);
-
- return entry;
-}
-
 /*********************************
 * per-cpu code
 **********************************/
@@ -411,6 +368,18 @@ static bool zswap_is_full(void)
 zswap_pool_pages);
 }
+/*
+ * Carries out the common pattern of freeing and entry's zsmalloc allocation,
+ * freeing the entry itself, and decrementing the number of stored pages.
+ */
+static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry)
+{
+ zbud_free(tree->pool, entry->handle);
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ zswap_pool_pages = zbud_get_pool_size(tree->pool);
+}
+
 /*********************************
 * writeback code
 **********************************/
@@ -418,7 +387,7 @@ static bool zswap_is_full(void)
 enum zswap_get_swap_ret {
 ZSWAP_SWAPCACHE_NEW,
 ZSWAP_SWAPCACHE_EXIST,
- ZSWAP_SWAPCACHE_FAIL,
+ ZSWAP_SWAPCACHE_NOMEM
 };
 /*
@@ -432,10 +401,9 @@ enum zswap_get_swap_ret {
 * added to the swap cache, and returned in retpage.
 *
 * If success, the swap cache page is returned in retpage
- * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
- * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
- * the new page is added to swapcache and locked
- * Returns ZSWAP_SWAPCACHE_FAIL on error
+ * Returns 0 if page was already in the swap cache, page is not locked
+ * Returns 1 if the new page needs to be populated, page is locked
+ * Returns <0 on error
 */
 static int zswap_get_swap_cache_page(swp_entry_t entry,
 struct page **retpage)
@@ -507,7 +475,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
 if (new_page)
 page_cache_release(new_page);
 if (!found_page)
- return ZSWAP_SWAPCACHE_FAIL;
+ return ZSWAP_SWAPCACHE_NOMEM;
 *retpage = found_page;
 return ZSWAP_SWAPCACHE_EXIST;
 }
@@ -534,7 +502,7 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
 struct page *page;
 u8 *src, *dst;
 unsigned int dlen;
- int ret;
+ int ret, refcount;
 struct writeback_control wbc = {
 .sync_mode = WB_SYNC_NONE,
 };
@@ -549,22 +517,23 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
 /* find and ref zswap entry */
 spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_rb_search(&tree->rbroot, offset);
 if (!entry) {
 /* entry was invalidated */
 spin_unlock(&tree->lock);
 return 0;
 }
+ zswap_entry_get(entry);
 spin_unlock(&tree->lock);
 BUG_ON(offset != entry->offset);
 /* try to allocate swap cache page */
 switch (zswap_get_swap_cache_page(swpentry, &page)) {
- case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
+ case ZSWAP_SWAPCACHE_NOMEM: /* no memory */
 ret = -ENOMEM;
 goto fail;
- case ZSWAP_SWAPCACHE_EXIST:
+ case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */
 /* page is already in the swap cache, ignore for now */
 page_cache_release(page);
 ret = -EEXIST;
@@ -587,44 +556,43 @@ static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle)
 SetPageUptodate(page);
 }
- /* move it to the tail of the inactive list after end_writeback */
- SetPageReclaim(page);
-
 /* start writeback */
 __swap_writepage(page, &wbc, end_swap_bio_write);
 page_cache_release(page);
 zswap_written_back_pages++;
 spin_lock(&tree->lock);
+
 /* drop local reference */
- zswap_entry_put(tree, entry);
+ zswap_entry_put(entry);
+
+ /* drop the initial reference from entry creation */
+ refcount = zswap_entry_put(entry);
 /*
- * There are two possible situations for entry here:
- * (1) refcount is 1(normal case), entry is valid and on the tree
- * (2) refcount is 0, entry is freed and not on the tree
- * because invalidate happened during writeback
- * search the tree and free the entry if find entry
- */
- if (entry == zswap_rb_search(&tree->rbroot, offset))
- zswap_entry_put(tree, entry);
+ * There are three possible values for refcount here:
+ * (1) refcount is 1, load is in progress, unlink from rbtree,
+ * load will free
+ * (2) refcount is 0, (normal case) entry is valid,
+ * remove from rbtree and free entry
+ * (3) refcount is -1, invalidate happened during writeback,
+ * free entry
+ */
+ if (refcount >= 0) {
+ /* no invalidate yet, remove from rbtree */
+ rb_erase(&entry->rbnode, &tree->rbroot);
+ }
 spin_unlock(&tree->lock);
+ if (refcount <= 0) {
+ /* free the entry */
+ zswap_free_entry(tree, entry);
+ return 0;
+ }
+ return -EAGAIN;
- goto end;
-
- /*
- * if we get here due to ZSWAP_SWAPCACHE_EXIST
- * a load may happening concurrently
- * it is safe and okay to not free the entry
- * if we free the entry in the following put
- * it it either okay to return !0
- */
 fail:
 spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
+ zswap_entry_put(entry);
 spin_unlock(&tree->lock);
-
-end:
 return ret;
 }
@@ -708,8 +676,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
 if (ret == -EEXIST) {
 zswap_duplicate_entry++;
 /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, dupentry);
- zswap_entry_put(tree, dupentry);
+ rb_erase(&dupentry->rbnode, &tree->rbroot);
+ if (!zswap_entry_put(dupentry)) {
+ /* free */
+ zswap_free_entry(tree, dupentry);
+ }
 }
 } while (ret == -EEXIST);
 spin_unlock(&tree->lock);
@@ -738,16 +709,17 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 struct zswap_entry *entry;
 u8 *src, *dst;
 unsigned int dlen;
- int ret;
+ int refcount, ret;
 /* find */
 spin_lock(&tree->lock);
- entry = zswap_entry_find_get(&tree->rbroot, offset);
+ entry = zswap_rb_search(&tree->rbroot, offset);
 if (!entry) {
 /* entry was written back */
 spin_unlock(&tree->lock);
 return -1;
 }
+ zswap_entry_get(entry);
 spin_unlock(&tree->lock);
 /* decompress */
@@ -762,9 +734,22 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
 BUG_ON(ret);
 spin_lock(&tree->lock);
- zswap_entry_put(tree, entry);
+ refcount = zswap_entry_put(entry);
+ if (likely(refcount)) {
+ spin_unlock(&tree->lock);
+ return 0;
+ }
 spin_unlock(&tree->lock);
+ /*
+ * We don't have to unlink from the rbtree because
+ * zswap_writeback_entry() or zswap_frontswap_invalidate page()
+ * has already done this for us if we are the last reference.
+ */
+ /* free */
+
+ zswap_free_entry(tree, entry);
+
 return 0;
 }
@@ -773,6 +758,7 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
 {
 struct zswap_tree *tree = zswap_trees[type];
 struct zswap_entry *entry;
+ int refcount;
 /* find */
 spin_lock(&tree->lock);
@@ -784,12 +770,20 @@ static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
 }
 /* remove from rbtree */
- zswap_rb_erase(&tree->rbroot, entry);
+ rb_erase(&entry->rbnode, &tree->rbroot);
 /* drop the initial reference from entry creation */
- zswap_entry_put(tree, entry);
+ refcount = zswap_entry_put(entry);
 spin_unlock(&tree->lock);
+
+ if (refcount) {
+ /* writeback in progress, writeback will free */
+ return;
+ }
+
+ /* free */
+ zswap_free_entry(tree, entry);
 }
 /* frees all zswap entries for the given swap type */
@@ -803,8 +797,11 @@ static void zswap_frontswap_invalidate_area(unsigned type)
 /* walk the tree and free everything */
 spin_lock(&tree->lock);
- rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
- zswap_free_entry(tree, entry);
+ rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode) {
+ zbud_free(tree->pool, entry->handle);
+ zswap_entry_cache_free(entry);
+ atomic_dec(&zswap_stored_pages);
+ }
 tree->rbroot = RB_ROOT;
 spin_unlock(&tree->lock);
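
The vmalloc hunks above move the "Pair with smp_wmb() in clear_vm_uninitialized_flag()" check back into s_show(): a vm_struct is filled in first and the VM_UNINITIALIZED flag is cleared only afterwards, so a reader that sees the flag clear is guaranteed to see fully initialized fields. Below is a hedged user-space analogue of that publish/observe pairing using C11 acquire/release atomics instead of the kernel's smp_wmb()/smp_rmb(); the names demo_vm, demo_publish and demo_show are invented for the example and are not kernel API.

/*
 * Illustrative sketch only: publish a structure, then clear an
 * "uninitialized" bit with release ordering; readers acquire-load the
 * bit and only touch the fields once it is clear.
 */
#include <stdatomic.h>
#include <stdio.h>

#define DEMO_UNINITIALIZED 0x1u

struct demo_vm {
	unsigned long nr_pages;		/* filled in before publication */
	_Atomic unsigned int flags;
};

/* writer: initialize the fields, then clear the flag with release order */
static void demo_publish(struct demo_vm *vm)
{
	vm->nr_pages = 42;
	atomic_fetch_and_explicit(&vm->flags, ~DEMO_UNINITIALIZED,
				  memory_order_release);
}

/* reader: acquire-load the flag; skip the entry while it is still set */
static void demo_show(struct demo_vm *vm)
{
	unsigned int flags = atomic_load_explicit(&vm->flags,
						  memory_order_acquire);
	if (flags & DEMO_UNINITIALIZED) {
		printf("still uninitialized, skip\n");
		return;
	}
	printf("nr_pages=%lu\n", vm->nr_pages);
}

int main(void)
{
	struct demo_vm vm = { .flags = DEMO_UNINITIALIZED };

	demo_show(&vm);		/* skipped: flag still set */
	demo_publish(&vm);
	demo_show(&vm);		/* safe: fields visible before the clear */
	return 0;
}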
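The vmstat.c hunks drop vmstat_cpu_dead() and the N_CPU node-state updates: with them in place, a node is marked as having CPUs when its CPUs come up and unmarked only when no online CPU maps to that node any more. The following is a minimal user-space sketch of that last-CPU-offline check; the arrays and names (demo_cpu_node, demo_online, demo_node_has_cpu, demo_cpu_dead) are invented for illustration and merely stand in for cpu_to_node(), the online CPU mask and node_clear_state(..., N_CPU).

/* Illustrative sketch only: clear a node's "has CPUs" state when its
 * last CPU goes offline, mirroring the logic of the removed helper. */
#include <stdbool.h>
#include <stdio.h>

#define DEMO_NR_CPUS 4
#define DEMO_NR_NODES 2

static const int demo_cpu_node[DEMO_NR_CPUS] = { 0, 0, 1, 1 };
static bool demo_online[DEMO_NR_CPUS] = { true, true, true, true };
static bool demo_node_has_cpu[DEMO_NR_NODES] = { true, true };

/* analogue of vmstat_cpu_dead(): keep the state if any CPU remains */
static void demo_cpu_dead(int node)
{
	for (int cpu = 0; cpu < DEMO_NR_CPUS; cpu++)
		if (demo_online[cpu] && demo_cpu_node[cpu] == node)
			return;		/* node still has an online CPU */
	demo_node_has_cpu[node] = false;
}

int main(void)
{
	demo_online[2] = false;
	demo_cpu_dead(demo_cpu_node[2]);
	printf("node 1 has cpu: %d\n", demo_node_has_cpu[1]);	/* still 1 */

	demo_online[3] = false;
	demo_cpu_dead(demo_cpu_node[3]);
	printf("node 1 has cpu: %d\n", demo_node_has_cpu[1]);	/* now 0 */
	return 0;
}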
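The zswap.c hunks revert to the older reference-counting protocol: zswap_entry_get()/zswap_entry_put() only adjust the counter under tree->lock and return the new value, and each caller (store, load, writeback, invalidate) decides for itself when to unlink the entry from the rbtree and call zswap_free_entry(). The program below is a hedged, user-space sketch of that protocol with a pthread mutex standing in for the tree spinlock; demo_entry, demo_entry_get/put and demo_free_entry are illustrative names, not the kernel API.

/* Illustrative sketch only; build with: cc demo.c -pthread */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_entry {
	int refcount;		/* starts at 1 for the "stored" reference */
	unsigned long handle;	/* stands in for the zbud allocation */
};

struct demo_tree {
	pthread_mutex_t lock;	/* stands in for tree->lock */
};

/* caller must hold the tree lock */
static void demo_entry_get(struct demo_entry *e)
{
	e->refcount++;
}

/* caller must hold the tree lock; returns the new count */
static int demo_entry_put(struct demo_entry *e)
{
	return --e->refcount;
}

static void demo_free_entry(struct demo_entry *e)
{
	/* in the kernel this is zbud_free() + cache free + stats update */
	free(e);
}

int main(void)
{
	struct demo_tree tree = { .lock = PTHREAD_MUTEX_INITIALIZER };
	struct demo_entry *e = calloc(1, sizeof(*e));
	int refs;

	e->refcount = 1;		/* initial reference from "store" */

	/* a reader (load or writeback) takes a temporary reference */
	pthread_mutex_lock(&tree.lock);
	demo_entry_get(e);
	pthread_mutex_unlock(&tree.lock);

	/* invalidate drops the initial reference ... */
	pthread_mutex_lock(&tree.lock);
	refs = demo_entry_put(e);
	pthread_mutex_unlock(&tree.lock);
	printf("after invalidate: %d (reader still holds one)\n", refs);

	/* ... and whoever drops the count to zero does the free */
	pthread_mutex_lock(&tree.lock);
	refs = demo_entry_put(e);
	pthread_mutex_unlock(&tree.lock);
	if (refs == 0)
		demo_free_entry(e);
	return 0;
}

Returning the new count from the put helper is what lets each call site in the reverted code distinguish "someone else still holds a reference" from "I was the last user and must unlink and free", which is exactly the three-way refcount comment in zswap_writeback_entry() above.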