Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c     |  39
-rw-r--r-- | mm/fadvise.c        |  11
-rw-r--r-- | mm/filemap.c        |   2
-rw-r--r-- | mm/hugetlb.c        |  46
-rw-r--r-- | mm/internal.h       |   3
-rw-r--r-- | mm/kasan/kasan.c    |  10
-rw-r--r-- | mm/kmemleak.c       |   2
-rw-r--r-- | mm/memcontrol.c     |  12
-rw-r--r-- | mm/memory.c         |  31
-rw-r--r-- | mm/mempool.c        |  12
-rw-r--r-- | mm/migrate.c        |   2
-rw-r--r-- | mm/oom_kill.c       |  14
-rw-r--r-- | mm/page-writeback.c |  21
-rw-r--r-- | mm/page_alloc.c     |  39
-rw-r--r-- | mm/page_owner.c     |  32
-rw-r--r-- | mm/page_poison.c    |   8
-rw-r--r-- | mm/percpu.c         |  73
-rw-r--r-- | mm/shmem.c          |   2
-rw-r--r-- | mm/swap.c           |  31
-rw-r--r-- | mm/swap_state.c     |   5
-rw-r--r-- | mm/vmalloc.c        |   9
-rw-r--r-- | mm/vmstat.c         |   2
-rw-r--r-- | mm/z3fold.c         |  24
23 files changed, 276 insertions, 154 deletions
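
Several of the hunks below (mm/page_alloc.c, mm/page_owner.c, mm/page_poison.c, mm/vmstat.c) add the same defensive check: bail out when lookup_page_ext() returns NULL, since the page_ext array for a section may not be allocated. A minimal sketch of that pattern follows; some_page_ext_user() is a hypothetical stand-in for the real call sites such as set_page_guard() or __set_page_owner(), not a function from this series.

/* Sketch of the recurring lookup_page_ext() NULL check, not verbatim from the diff. */
static void some_page_ext_user(struct page *page)
{
	struct page_ext *page_ext = lookup_page_ext(page);

	/* page_ext may not be available for this page; silently skip */
	if (unlikely(!page_ext))
		return;

	/* operate on page_ext->flags as the real call sites do */
	__set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags);
}
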
diff --git a/mm/compaction.c b/mm/compaction.c index 1427366..79bfe0e 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -441,25 +441,23 @@ static unsigned long isolate_freepages_block(struct compact_control *cc, /* Found a free page, break it into order-0 pages */ isolated = split_free_page(page); + if (!isolated) + break; + total_isolated += isolated; + cc->nr_freepages += isolated; for (i = 0; i < isolated; i++) { list_add(&page->lru, freelist); page++; } - - /* If a page was split, advance to the end of it */ - if (isolated) { - cc->nr_freepages += isolated; - if (!strict && - cc->nr_migratepages <= cc->nr_freepages) { - blockpfn += isolated; - break; - } - - blockpfn += isolated - 1; - cursor += isolated - 1; - continue; + if (!strict && cc->nr_migratepages <= cc->nr_freepages) { + blockpfn += isolated; + break; } + /* Advance to the end of split page */ + blockpfn += isolated - 1; + cursor += isolated - 1; + continue; isolate_fail: if (strict) @@ -469,6 +467,9 @@ isolate_fail: } + if (locked) + spin_unlock_irqrestore(&cc->zone->lock, flags); + /* * There is a tiny chance that we have read bogus compound_order(), * so be careful to not go outside of the pageblock. @@ -490,9 +491,6 @@ isolate_fail: if (strict && blockpfn < end_pfn) total_isolated = 0; - if (locked) - spin_unlock_irqrestore(&cc->zone->lock, flags); - /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) update_pageblock_skip(cc, valid_page, total_isolated, false); @@ -1011,6 +1009,7 @@ static void isolate_freepages(struct compact_control *cc) block_end_pfn = block_start_pfn, block_start_pfn -= pageblock_nr_pages, isolate_start_pfn = block_start_pfn) { + unsigned long isolated; /* * This can iterate a massively long zone without finding any @@ -1035,8 +1034,12 @@ static void isolate_freepages(struct compact_control *cc) continue; /* Found a block suitable for isolating free pages from. */ - isolate_freepages_block(cc, &isolate_start_pfn, - block_end_pfn, freelist, false); + isolated = isolate_freepages_block(cc, &isolate_start_pfn, + block_end_pfn, freelist, false); + /* If isolation failed early, do not continue needlessly */ + if (!isolated && isolate_start_pfn < block_end_pfn && + cc->nr_migratepages > cc->nr_freepages) + break; /* * If we isolated enough freepages, or aborted due to async diff --git a/mm/fadvise.c b/mm/fadvise.c index b8024fa..6c707bf 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -126,6 +126,17 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) */ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; end_index = (endbyte >> PAGE_SHIFT); + if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK) { + /* First page is tricky as 0 - 1 = -1, but pgoff_t + * is unsigned, so the end_index >= start_index + * check below would be true and we'll discard the whole + * file cache which is not what was asked. 
+ */ + if (end_index == 0) + break; + + end_index--; + } if (end_index >= start_index) { unsigned long count = invalidate_mapping_pages(mapping, diff --git a/mm/filemap.c b/mm/filemap.c index 00ae878..20f3b1f 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2186,7 +2186,7 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; addr = address + (page->index - vmf->pgoff) * PAGE_SIZE; - do_set_pte(vma, addr, page, pte, false, false, true); + do_set_pte(vma, addr, page, pte, false, false); unlock_page(page); goto next; unlock: diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b322c85..df3c176 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -832,8 +832,27 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg) * Only the process that called mmap() has reserves for * private mappings. */ - if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return true; + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { + /* + * Like the shared case above, a hole punch or truncate + * could have been performed on the private mapping. + * Examine the value of chg to determine if reserves + * actually exist or were previously consumed. + * Very Subtle - The value of chg comes from a previous + * call to vma_needs_reserves(). The reserve map for + * private mappings has different (opposite) semantics + * than that of shared mappings. vma_needs_reserves() + * has already taken this difference in semantics into + * account. Therefore, the meaning of chg is the same + * as in the shared case above. Code could easily be + * combined, but keeping it separate draws attention to + * subtle differences. + */ + if (chg) + return false; + else + return true; + } return false; } @@ -1011,6 +1030,7 @@ static void destroy_compound_gigantic_page(struct page *page, int nr_pages = 1 << order; struct page *p = page + 1; + atomic_set(compound_mapcount_ptr(page), 0); for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { clear_compound_head(p); set_page_refcounted(p); @@ -1816,6 +1836,25 @@ static long __vma_reservation_common(struct hstate *h, if (vma->vm_flags & VM_MAYSHARE) return ret; + else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER) && ret >= 0) { + /* + * In most cases, reserves always exist for private mappings. + * However, a file associated with mapping could have been + * hole punched or truncated after reserves were consumed. + * As subsequent fault on such a range will not use reserves. + * Subtle - The reserve map for private mappings has the + * opposite meaning than that of shared mappings. If NO + * entry is in the reserve map, it means a reservation exists. + * If an entry exists in the reserve map, it means the + * reservation has already been consumed. As a result, the + * return value of this routine is the opposite of the + * value returned from reserve map manipulation routines above. + */ + if (ret) + return 0; + else + return 1; + } else return ret < 0 ? 
ret : 0; } @@ -4190,7 +4229,6 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (saddr) { spte = huge_pte_offset(svma->vm_mm, saddr); if (spte) { - mm_inc_nr_pmds(mm); get_page(virt_to_page(spte)); break; } @@ -4205,9 +4243,9 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (pud_none(*pud)) { pud_populate(mm, pud, (pmd_t *)((unsigned long)spte & PAGE_MASK)); + mm_inc_nr_pmds(mm); } else { put_page(virt_to_page(spte)); - mm_inc_nr_pmds(mm); } spin_unlock(ptl); out: diff --git a/mm/internal.h b/mm/internal.h index a37e5b6..2524ec8 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -24,7 +24,8 @@ */ #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ - __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ + __GFP_ATOMIC) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c index 18b6a2b..6845f92 100644 --- a/mm/kasan/kasan.c +++ b/mm/kasan/kasan.c @@ -508,7 +508,7 @@ void kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags) kasan_kmalloc(cache, object, cache->object_size, flags); } -void kasan_poison_slab_free(struct kmem_cache *cache, void *object) +static void kasan_poison_slab_free(struct kmem_cache *cache, void *object) { unsigned long size = cache->object_size; unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE); @@ -626,7 +626,7 @@ void kasan_krealloc(const void *object, size_t size, gfp_t flags) kasan_kmalloc(page->slab_cache, object, size, flags); } -void kasan_kfree(void *ptr) +void kasan_poison_kfree(void *ptr) { struct page *page; @@ -636,7 +636,7 @@ void kasan_kfree(void *ptr) kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page), KASAN_FREE_PAGE); else - kasan_slab_free(page->slab_cache, ptr); + kasan_poison_slab_free(page->slab_cache, ptr); } void kasan_kfree_large(const void *ptr) @@ -763,8 +763,8 @@ static int kasan_mem_notifier(struct notifier_block *nb, static int __init kasan_memhotplug_init(void) { - pr_err("WARNING: KASAN doesn't support memory hot-add\n"); - pr_err("Memory hot-add will be disabled\n"); + pr_info("WARNING: KASAN doesn't support memory hot-add\n"); + pr_info("Memory hot-add will be disabled\n"); hotplug_memory_notifier(kasan_mem_notifier, 0); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index e642992..04320d3 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -307,8 +307,10 @@ static void hex_dump_object(struct seq_file *seq, len = min_t(size_t, object->size, HEX_MAX_LINES * HEX_ROW_SIZE); seq_printf(seq, " hex dump (first %zu bytes):\n", len); + kasan_disable_current(); seq_hex_dump(seq, " ", DUMP_PREFIX_NONE, HEX_ROW_SIZE, HEX_GROUP_SIZE, ptr, len, HEX_ASCII); + kasan_enable_current(); } /* diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 925b431..ac8664db 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1608,7 +1608,7 @@ static void memcg_oom_recover(struct mem_cgroup *memcg) static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) { - if (!current->memcg_may_oom || current->memcg_in_oom) + if (!current->memcg_may_oom) return; /* * We are in the middle of the charge context here, so we @@ -2896,6 +2896,7 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) * ordering is imposed by list_lru_node->lock taken by * memcg_drain_all_list_lrus(). 
*/ + rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */ css_for_each_descendant_pre(css, &memcg->css) { child = mem_cgroup_from_css(css); BUG_ON(child->kmemcg_id != kmemcg_id); @@ -2903,6 +2904,8 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (!memcg->use_hierarchy) break; } + rcu_read_unlock(); + memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id); memcg_free_cache_id(kmemcg_id); @@ -4200,7 +4203,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) return &memcg->css; fail: mem_cgroup_free(memcg); - return NULL; + return ERR_PTR(-ENOMEM); } static int @@ -5541,6 +5544,7 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) struct mem_cgroup *memcg; unsigned int nr_pages; bool compound; + unsigned long flags; VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); @@ -5571,10 +5575,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) commit_charge(newpage, memcg, false); - local_irq_disable(); + local_irq_save(flags); mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages); memcg_check_events(memcg, newpage); - local_irq_enable(); + local_irq_restore(flags); } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); diff --git a/mm/memory.c b/mm/memory.c index 15322b7..cd1f29e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2877,7 +2877,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address, * vm_ops->map_pages. */ void do_set_pte(struct vm_area_struct *vma, unsigned long address, - struct page *page, pte_t *pte, bool write, bool anon, bool old) + struct page *page, pte_t *pte, bool write, bool anon) { pte_t entry; @@ -2885,8 +2885,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, entry = mk_pte(page, vma->vm_page_prot); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (old) - entry = pte_mkold(entry); if (anon) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address, false); @@ -2900,16 +2898,8 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } -/* - * If architecture emulates "accessed" or "young" bit without HW support, - * there is no much gain with fault_around. - */ static unsigned long fault_around_bytes __read_mostly = -#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS - PAGE_SIZE; -#else rounddown_pow_of_two(65536); -#endif #ifdef CONFIG_DEBUG_FS static int fault_around_bytes_get(void *data, u64 *val) @@ -3032,20 +3022,9 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); - if (!pte_same(*pte, orig_pte)) - goto unlock_out; do_fault_around(vma, address, pte, pgoff, flags); - /* Check if the fault is handled by faultaround */ - if (!pte_same(*pte, orig_pte)) { - /* - * Faultaround produce old pte, but the pte we've - * handler fault for should be young. 
- */ - pte_t entry = pte_mkyoung(*pte); - if (ptep_set_access_flags(vma, address, pte, entry, 0)) - update_mmu_cache(vma, address, pte); + if (!pte_same(*pte, orig_pte)) goto unlock_out; - } pte_unmap_unlock(pte, ptl); } @@ -3060,7 +3039,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, false, false, false); + do_set_pte(vma, address, fault_page, pte, false, false); unlock_page(fault_page); unlock_out: pte_unmap_unlock(pte, ptl); @@ -3111,7 +3090,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma, } goto uncharge_out; } - do_set_pte(vma, address, new_page, pte, true, true, false); + do_set_pte(vma, address, new_page, pte, true, true); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); pte_unmap_unlock(pte, ptl); @@ -3164,7 +3143,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(fault_page); return ret; } - do_set_pte(vma, address, fault_page, pte, true, false, false); + do_set_pte(vma, address, fault_page, pte, true, false); pte_unmap_unlock(pte, ptl); if (set_page_dirty(fault_page)) diff --git a/mm/mempool.c b/mm/mempool.c index 9e075f8..8f65464 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -104,20 +104,16 @@ static inline void poison_element(mempool_t *pool, void *element) static void kasan_poison_element(mempool_t *pool, void *element) { - if (pool->alloc == mempool_alloc_slab) - kasan_poison_slab_free(pool->pool_data, element); - if (pool->alloc == mempool_kmalloc) - kasan_kfree(element); + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + kasan_poison_kfree(element); if (pool->alloc == mempool_alloc_pages) kasan_free_pages(element, (unsigned long)pool->pool_data); } static void kasan_unpoison_element(mempool_t *pool, void *element, gfp_t flags) { - if (pool->alloc == mempool_alloc_slab) - kasan_slab_alloc(pool->pool_data, element, flags); - if (pool->alloc == mempool_kmalloc) - kasan_krealloc(element, (size_t)pool->pool_data, flags); + if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) + kasan_unpoison_slab(element); if (pool->alloc == mempool_alloc_pages) kasan_alloc_pages(element, (unsigned long)pool->pool_data); } diff --git a/mm/migrate.c b/mm/migrate.c index 9baf41c..bd3fdc2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -431,6 +431,7 @@ int migrate_page_move_mapping(struct address_space *mapping, return MIGRATEPAGE_SUCCESS; } +EXPORT_SYMBOL(migrate_page_move_mapping); /* * The expected number of remaining references is the same as that @@ -586,6 +587,7 @@ void migrate_page_copy(struct page *newpage, struct page *page) mem_cgroup_migrate(page, newpage); } +EXPORT_SYMBOL(migrate_page_copy); /************************************************************ * Migration functions diff --git a/mm/oom_kill.c b/mm/oom_kill.c index dfb1ab6..ddf7448 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -474,13 +474,8 @@ static bool __oom_reap_task(struct task_struct *tsk) p = find_lock_task_mm(tsk); if (!p) goto unlock_oom; - mm = p->mm; - if (!atomic_inc_not_zero(&mm->mm_users)) { - task_unlock(p); - goto unlock_oom; - } - + atomic_inc(&mm->mm_users); task_unlock(p); if (!down_read_trylock(&mm->mmap_sem)) { @@ -625,8 +620,6 @@ void try_oom_reaper(struct task_struct *tsk) if (atomic_read(&mm->mm_users) > 1) { rcu_read_lock(); for_each_process(p) { - bool exiting; - if (!process_shares_mm(p, mm)) continue; if 
(fatal_signal_pending(p)) @@ -636,10 +629,7 @@ void try_oom_reaper(struct task_struct *tsk) * If the task is exiting make sure the whole thread group * is exiting and cannot acces mm anymore. */ - spin_lock_irq(&p->sighand->siglock); - exiting = signal_group_exit(p->signal); - spin_unlock_irq(&p->sighand->siglock); - if (exiting) + if (signal_group_exit(p->signal)) continue; /* Give up */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b9956fd..e248194 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -373,8 +373,9 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) struct dirty_throttle_control *gdtc = mdtc_gdtc(dtc); unsigned long bytes = vm_dirty_bytes; unsigned long bg_bytes = dirty_background_bytes; - unsigned long ratio = vm_dirty_ratio; - unsigned long bg_ratio = dirty_background_ratio; + /* convert ratios to per-PAGE_SIZE for higher precision */ + unsigned long ratio = (vm_dirty_ratio * PAGE_SIZE) / 100; + unsigned long bg_ratio = (dirty_background_ratio * PAGE_SIZE) / 100; unsigned long thresh; unsigned long bg_thresh; struct task_struct *tsk; @@ -386,26 +387,28 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) /* * The byte settings can't be applied directly to memcg * domains. Convert them to ratios by scaling against - * globally available memory. + * globally available memory. As the ratios are in + * per-PAGE_SIZE, they can be obtained by dividing bytes by + * number of pages. */ if (bytes) - ratio = min(DIV_ROUND_UP(bytes, PAGE_SIZE) * 100 / - global_avail, 100UL); + ratio = min(DIV_ROUND_UP(bytes, global_avail), + PAGE_SIZE); if (bg_bytes) - bg_ratio = min(DIV_ROUND_UP(bg_bytes, PAGE_SIZE) * 100 / - global_avail, 100UL); + bg_ratio = min(DIV_ROUND_UP(bg_bytes, global_avail), + PAGE_SIZE); bytes = bg_bytes = 0; } if (bytes) thresh = DIV_ROUND_UP(bytes, PAGE_SIZE); else - thresh = (ratio * available_memory) / 100; + thresh = (ratio * available_memory) / PAGE_SIZE; if (bg_bytes) bg_thresh = DIV_ROUND_UP(bg_bytes, PAGE_SIZE); else - bg_thresh = (bg_ratio * available_memory) / 100; + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; if (bg_thresh >= thresh) bg_thresh = thresh / 2; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f8f3bfc..6903b69 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -656,6 +656,9 @@ static inline void set_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); INIT_LIST_HEAD(&page->lru); @@ -673,6 +676,9 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, return; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __clear_bit(PAGE_EXT_DEBUG_GUARD, &page_ext->flags); set_page_private(page, 0); @@ -2609,11 +2615,12 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, page = list_last_entry(list, struct page, lru); else page = list_first_entry(list, struct page, lru); - } while (page && check_new_pcp(page)); - __dec_zone_state(zone, NR_ALLOC_BATCH); - list_del(&page->lru); - pcp->count--; + __dec_zone_state(zone, NR_ALLOC_BATCH); + list_del(&page->lru); + pcp->count--; + + } while (check_new_pcp(page)); } else { /* * We most definitely don't want callers attempting to @@ -3023,6 +3030,7 @@ reset_fair: apply_fair = false; fair_skipped = false; reset_alloc_batches(ac->preferred_zoneref->zone); + z = ac->preferred_zoneref; goto zonelist_scan; } @@ -3596,6 +3604,17 @@ retry: */ alloc_flags = 
gfp_to_alloc_flags(gfp_mask); + /* + * Reset the zonelist iterators if memory policies can be ignored. + * These allocations are high priority and system rather than user + * orientated. + */ + if ((alloc_flags & ALLOC_NO_WATERMARKS) || !(alloc_flags & ALLOC_CPUSET)) { + ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); + ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, + ac->high_zoneidx, ac->nodemask); + } + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, order, alloc_flags & ~ALLOC_NO_WATERMARKS, ac); @@ -3604,12 +3623,6 @@ retry: /* Allocate without watermarks if the context allows */ if (alloc_flags & ALLOC_NO_WATERMARKS) { - /* - * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds - * the allocation is high priority and these type of - * allocations are system rather than user orientated - */ - ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); page = get_page_from_freelist(gfp_mask, order, ALLOC_NO_WATERMARKS, ac); if (page) @@ -3808,7 +3821,11 @@ retry_cpuset: /* Dirty zone balancing only done in the fast path */ ac.spread_dirty_pages = (gfp_mask & __GFP_WRITE); - /* The preferred zone is used for statistics later */ + /* + * The preferred zone is used for statistics but crucially it is + * also used as the starting point for the zonelist iterator. It + * may get reset for allocations that ignore memory policies. + */ ac.preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx, ac.nodemask); if (!ac.preferred_zoneref) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 792b56d..fedeba8 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -55,6 +55,8 @@ void __reset_page_owner(struct page *page, unsigned int order) for (i = 0; i < (1 << order); i++) { page_ext = lookup_page_ext(page + i); + if (unlikely(!page_ext)) + continue; __clear_bit(PAGE_EXT_OWNER, &page_ext->flags); } } @@ -62,6 +64,7 @@ void __reset_page_owner(struct page *page, unsigned int order) void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) { struct page_ext *page_ext = lookup_page_ext(page); + struct stack_trace trace = { .nr_entries = 0, .max_entries = ARRAY_SIZE(page_ext->trace_entries), @@ -69,6 +72,9 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) .skip = 3, }; + if (unlikely(!page_ext)) + return; + save_stack_trace(&trace); page_ext->order = order; @@ -82,6 +88,8 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask) void __set_page_owner_migrate_reason(struct page *page, int reason) { struct page_ext *page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; page_ext->last_migrate_reason = reason; } @@ -89,6 +97,12 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) gfp_t __get_page_owner_gfp(struct page *page) { struct page_ext *page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + /* + * The caller just returns 0 if no valid gfp + * So return 0 here too. 
+ */ + return 0; return page_ext->gfp_mask; } @@ -99,6 +113,9 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage) struct page_ext *new_ext = lookup_page_ext(newpage); int i; + if (unlikely(!old_ext || !new_ext)) + return; + new_ext->order = old_ext->order; new_ext->gfp_mask = old_ext->gfp_mask; new_ext->nr_entries = old_ext->nr_entries; @@ -190,8 +207,15 @@ void __dump_page_owner(struct page *page) .nr_entries = page_ext->nr_entries, .entries = &page_ext->trace_entries[0], }; - gfp_t gfp_mask = page_ext->gfp_mask; - int mt = gfpflags_to_migratetype(gfp_mask); + gfp_t gfp_mask; + int mt; + + if (unlikely(!page_ext)) { + pr_alert("There is not page extension available.\n"); + return; + } + gfp_mask = page_ext->gfp_mask; + mt = gfpflags_to_migratetype(gfp_mask); if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not active (free page?)\n"); @@ -251,6 +275,8 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) } page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; /* * Some pages could be missed by concurrent allocation or free, @@ -317,6 +343,8 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; /* Maybe overraping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) diff --git a/mm/page_poison.c b/mm/page_poison.c index 1eae5fa..2e647c6 100644 --- a/mm/page_poison.c +++ b/mm/page_poison.c @@ -54,6 +54,9 @@ static inline void set_page_poison(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __set_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -62,6 +65,9 @@ static inline void clear_page_poison(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + return; + __clear_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); } @@ -70,7 +76,7 @@ bool page_is_poisoned(struct page *page) struct page_ext *page_ext; page_ext = lookup_page_ext(page); - if (!page_ext) + if (unlikely(!page_ext)) return false; return test_bit(PAGE_EXT_DEBUG_POISON, &page_ext->flags); diff --git a/mm/percpu.c b/mm/percpu.c index 0c59684..9903830 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -112,7 +112,7 @@ struct pcpu_chunk { int map_used; /* # of map entries used before the sentry */ int map_alloc; /* # of map entries allocated */ int *map; /* allocation map */ - struct work_struct map_extend_work;/* async ->map[] extension */ + struct list_head map_extend_list;/* on pcpu_map_extend_chunks */ void *data; /* chunk data */ int first_free; /* no free below this */ @@ -162,10 +162,13 @@ static struct pcpu_chunk *pcpu_reserved_chunk; static int pcpu_reserved_chunk_limit; static DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */ -static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop */ +static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ +/* chunks which need their map areas extended, protected by pcpu_lock */ +static LIST_HEAD(pcpu_map_extend_chunks); + /* * The number of empty populated pages, protected by pcpu_lock. The * reserved chunk doesn't contribute to the count. 
@@ -395,13 +398,19 @@ static int pcpu_need_to_extend(struct pcpu_chunk *chunk, bool is_atomic) { int margin, new_alloc; + lockdep_assert_held(&pcpu_lock); + if (is_atomic) { margin = 3; if (chunk->map_alloc < - chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW && - pcpu_async_enabled) - schedule_work(&chunk->map_extend_work); + chunk->map_used + PCPU_ATOMIC_MAP_MARGIN_LOW) { + if (list_empty(&chunk->map_extend_list)) { + list_add_tail(&chunk->map_extend_list, + &pcpu_map_extend_chunks); + pcpu_schedule_balance_work(); + } + } } else { margin = PCPU_ATOMIC_MAP_MARGIN_HIGH; } @@ -435,6 +444,8 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc) size_t old_size = 0, new_size = new_alloc * sizeof(new[0]); unsigned long flags; + lockdep_assert_held(&pcpu_alloc_mutex); + new = pcpu_mem_zalloc(new_size); if (!new) return -ENOMEM; @@ -467,20 +478,6 @@ out_unlock: return 0; } -static void pcpu_map_extend_workfn(struct work_struct *work) -{ - struct pcpu_chunk *chunk = container_of(work, struct pcpu_chunk, - map_extend_work); - int new_alloc; - - spin_lock_irq(&pcpu_lock); - new_alloc = pcpu_need_to_extend(chunk, false); - spin_unlock_irq(&pcpu_lock); - - if (new_alloc) - pcpu_extend_area_map(chunk, new_alloc); -} - /** * pcpu_fit_in_area - try to fit the requested allocation in a candidate area * @chunk: chunk the candidate area belongs to @@ -740,7 +737,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map_used = 1; INIT_LIST_HEAD(&chunk->list); - INIT_WORK(&chunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&chunk->map_extend_list); chunk->free_size = pcpu_unit_size; chunk->contig_hint = pcpu_unit_size; @@ -895,6 +892,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved, return NULL; } + if (!is_atomic) + mutex_lock(&pcpu_alloc_mutex); + spin_lock_irqsave(&pcpu_lock, flags); /* serve reserved allocations from the reserved chunk if available */ @@ -967,12 +967,9 @@ restart: if (is_atomic) goto fail; - mutex_lock(&pcpu_alloc_mutex); - if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) { chunk = pcpu_create_chunk(); if (!chunk) { - mutex_unlock(&pcpu_alloc_mutex); err = "failed to allocate new chunk"; goto fail; } @@ -983,7 +980,6 @@ restart: spin_lock_irqsave(&pcpu_lock, flags); } - mutex_unlock(&pcpu_alloc_mutex); goto restart; area_found: @@ -993,8 +989,6 @@ area_found: if (!is_atomic) { int page_start, page_end, rs, re; - mutex_lock(&pcpu_alloc_mutex); - page_start = PFN_DOWN(off); page_end = PFN_UP(off + size); @@ -1005,7 +999,6 @@ area_found: spin_lock_irqsave(&pcpu_lock, flags); if (ret) { - mutex_unlock(&pcpu_alloc_mutex); pcpu_free_area(chunk, off, &occ_pages); err = "failed to populate"; goto fail_unlock; @@ -1045,6 +1038,8 @@ fail: /* see the flag handling in pcpu_blance_workfn() */ pcpu_atomic_alloc_failed = true; pcpu_schedule_balance_work(); + } else { + mutex_unlock(&pcpu_alloc_mutex); } return NULL; } @@ -1129,6 +1124,7 @@ static void pcpu_balance_workfn(struct work_struct *work) if (chunk == list_first_entry(free_head, struct pcpu_chunk, list)) continue; + list_del_init(&chunk->map_extend_list); list_move(&chunk->list, &to_free); } @@ -1146,6 +1142,25 @@ static void pcpu_balance_workfn(struct work_struct *work) pcpu_destroy_chunk(chunk); } + /* service chunks which requested async area map extension */ + do { + int new_alloc = 0; + + spin_lock_irq(&pcpu_lock); + + chunk = list_first_entry_or_null(&pcpu_map_extend_chunks, + struct pcpu_chunk, map_extend_list); + if (chunk) { + list_del_init(&chunk->map_extend_list); + 
new_alloc = pcpu_need_to_extend(chunk, false); + } + + spin_unlock_irq(&pcpu_lock); + + if (new_alloc) + pcpu_extend_area_map(chunk, new_alloc); + } while (chunk); + /* * Ensure there are certain number of free populated pages for * atomic allocs. Fill up from the most packed so that atomic @@ -1644,7 +1659,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, */ schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&schunk->list); - INIT_WORK(&schunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&schunk->map_extend_list); schunk->base_addr = base_addr; schunk->map = smap; schunk->map_alloc = ARRAY_SIZE(smap); @@ -1673,7 +1688,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (dyn_size) { dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0); INIT_LIST_HEAD(&dchunk->list); - INIT_WORK(&dchunk->map_extend_work, pcpu_map_extend_workfn); + INIT_LIST_HEAD(&dchunk->map_extend_list); dchunk->base_addr = base_addr; dchunk->map = dmap; dchunk->map_alloc = ARRAY_SIZE(dmap); @@ -2227,7 +2227,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, /* Remove the !PageUptodate pages we added */ shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, - (loff_t)index << PAGE_SHIFT, true); + ((loff_t)index << PAGE_SHIFT) - 1, true); goto undone; } @@ -242,7 +242,7 @@ void rotate_reclaimable_page(struct page *page) get_page(page); local_irq_save(flags); pvec = this_cpu_ptr(&lru_rotate_pvecs); - if (!pagevec_add(pvec, page)) + if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_move_tail(pvec); local_irq_restore(flags); } @@ -296,7 +296,7 @@ void activate_page(struct page *page) struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); get_page(page); - if (!pagevec_add(pvec, page)) + if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, __activate_page, NULL); put_cpu_var(activate_page_pvecs); } @@ -391,9 +391,8 @@ static void __lru_cache_add(struct page *page) struct pagevec *pvec = &get_cpu_var(lru_add_pvec); get_page(page); - if (!pagevec_space(pvec)) + if (!pagevec_add(pvec, page) || PageCompound(page)) __pagevec_lru_add(pvec); - pagevec_add(pvec, page); put_cpu_var(lru_add_pvec); } @@ -628,7 +627,7 @@ void deactivate_file_page(struct page *page) if (likely(get_page_unless_zero(page))) { struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs); - if (!pagevec_add(pvec, page)) + if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL); put_cpu_var(lru_deactivate_file_pvecs); } @@ -648,7 +647,7 @@ void deactivate_page(struct page *page) struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs); get_page(page); - if (!pagevec_add(pvec, page)) + if (!pagevec_add(pvec, page) || PageCompound(page)) pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL); put_cpu_var(lru_deactivate_pvecs); } @@ -667,6 +666,24 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); +/* + * lru_add_drain_wq is used to do lru_add_drain_all() from a WQ_MEM_RECLAIM + * workqueue, aiding in getting memory freed. 
+ */ +static struct workqueue_struct *lru_add_drain_wq; + +static int __init lru_init(void) +{ + lru_add_drain_wq = alloc_workqueue("lru-add-drain", WQ_MEM_RECLAIM, 0); + + if (WARN(!lru_add_drain_wq, + "Failed to create workqueue lru_add_drain_wq")) + return -ENOMEM; + + return 0; +} +early_initcall(lru_init); + void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); @@ -686,7 +703,7 @@ void lru_add_drain_all(void) pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) || need_activate_page_drain(cpu)) { INIT_WORK(work, lru_add_drain_per_cpu); - schedule_work_on(cpu, work); + queue_work_on(cpu, lru_add_drain_wq, work); cpumask_set_cpu(cpu, &has_work); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 0d457e7..c99463a 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -252,7 +252,10 @@ static inline void free_swap_cache(struct page *page) void free_page_and_swap_cache(struct page *page) { free_swap_cache(page); - put_page(page); + if (is_huge_zero_page(page)) + put_huge_zero_page(); + else + put_page(page); } /* diff --git a/mm/vmalloc.c b/mm/vmalloc.c index cf7ad1a..e11475c 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1105,7 +1105,7 @@ EXPORT_SYMBOL_GPL(vm_unmap_aliases); */ void vm_unmap_ram(const void *mem, unsigned int count) { - unsigned long size = count << PAGE_SHIFT; + unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr = (unsigned long)mem; BUG_ON(!addr); @@ -1140,7 +1140,7 @@ EXPORT_SYMBOL(vm_unmap_ram); */ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) { - unsigned long size = count << PAGE_SHIFT; + unsigned long size = (unsigned long)count << PAGE_SHIFT; unsigned long addr; void *mem; @@ -1574,14 +1574,15 @@ void *vmap(struct page **pages, unsigned int count, unsigned long flags, pgprot_t prot) { struct vm_struct *area; + unsigned long size; /* In bytes */ might_sleep(); if (count > totalram_pages) return NULL; - area = get_vm_area_caller((count << PAGE_SHIFT), flags, - __builtin_return_address(0)); + size = (unsigned long)count << PAGE_SHIFT; + area = get_vm_area_caller(size, flags, __builtin_return_address(0)); if (!area) return NULL; diff --git a/mm/vmstat.c b/mm/vmstat.c index 77e42ef..cb2a67b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1061,6 +1061,8 @@ static void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; page_ext = lookup_page_ext(page); + if (unlikely(!page_ext)) + continue; if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) continue; diff --git a/mm/z3fold.c b/mm/z3fold.c index 34917d5..8f9e89c 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -412,7 +412,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle) /* HEADLESS page stored */ bud = HEADLESS; } else { - bud = (handle - zhdr->first_num) & BUDDY_MASK; + bud = handle_to_buddy(handle); switch (bud) { case FIRST: @@ -572,15 +572,19 @@ next: pool->pages_nr--; spin_unlock(&pool->lock); return 0; - } else if (zhdr->first_chunks != 0 && - zhdr->last_chunks != 0 && zhdr->middle_chunks != 0) { - /* Full, add to buddied list */ - list_add(&zhdr->buddy, &pool->buddied); - } else if (!test_bit(PAGE_HEADLESS, &page->private)) { - z3fold_compact_page(zhdr); - /* add to unbuddied list */ - freechunks = num_free_chunks(zhdr); - list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else if (!test_bit(PAGE_HEADLESS, &page->private)) { + if (zhdr->first_chunks != 0 && + zhdr->last_chunks != 0 && + zhdr->middle_chunks != 0) { + /* Full, add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } else { + 
+			z3fold_compact_page(zhdr);
+			/* add to unbuddied list */
+			freechunks = num_free_chunks(zhdr);
+			list_add(&zhdr->buddy,
+				 &pool->unbuddied[freechunks]);
+		}
 	}

 	/* add to beginning of LRU */
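
For the mm/vmalloc.c hunks above, the point of the (unsigned long) casts is that count is an unsigned int, so count << PAGE_SHIFT is evaluated in 32 bits and wraps for requests of a million or more pages. A small standalone illustration (not part of the patch; PAGE_SHIFT hard-coded to 12, i.e. assuming a 4 KiB page size):

#include <stdio.h>

#define PAGE_SHIFT 12	/* assumes 4 KiB pages */

int main(void)
{
	unsigned int count = 1u << 20;	/* 1M pages == 4 GiB */

	/* 32-bit shift: 0x100000 << 12 wraps to 0 before the widening assignment */
	unsigned long bad = count << PAGE_SHIFT;

	/* widen first, as vm_map_ram()/vm_unmap_ram()/vmap() now do */
	unsigned long good = (unsigned long)count << PAGE_SHIFT;

	printf("bad=%lu good=%lu\n", bad, good);	/* bad=0, good=4294967296 on LP64 */
	return 0;
}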