summaryrefslogtreecommitdiff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c122
1 files changed, 64 insertions, 58 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index fd3a07b..1ea21e2 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -138,9 +138,6 @@ static struct khugepaged_scan khugepaged_scan = {
.mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head),
};
-static DEFINE_SPINLOCK(split_queue_lock);
-static LIST_HEAD(split_queue);
-static unsigned long split_queue_len;
static struct shrinker deferred_split_shrinker;
static void set_recommended_min_free_kbytes(void)
@@ -861,7 +858,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
return false;
entry = mk_pmd(zero_page, vma->vm_page_prot);
entry = pmd_mkhuge(entry);
- pgtable_trans_huge_deposit(mm, pmd, pgtable);
+ if (pgtable)
+ pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, haddr, pmd, entry);
atomic_long_inc(&mm->nr_ptes);
return true;
@@ -1039,13 +1037,15 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
spinlock_t *dst_ptl, *src_ptl;
struct page *src_page;
pmd_t pmd;
- pgtable_t pgtable;
+ pgtable_t pgtable = NULL;
int ret;
- ret = -ENOMEM;
- pgtable = pte_alloc_one(dst_mm, addr);
- if (unlikely(!pgtable))
- goto out;
+ if (!vma_is_dax(vma)) {
+ ret = -ENOMEM;
+ pgtable = pte_alloc_one(dst_mm, addr);
+ if (unlikely(!pgtable))
+ goto out;
+ }
dst_ptl = pmd_lock(dst_mm, dst_pmd);
src_ptl = pmd_lockptr(src_mm, src_pmd);
@@ -1076,7 +1076,7 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
goto out_unlock;
}
- if (pmd_trans_huge(pmd)) {
+ if (!vma_is_dax(vma)) {
/* thp accounting separate from pmd_devmap accounting */
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
@@ -1700,7 +1700,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
VM_BUG_ON(!pmd_none(*new_pmd));
- if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+ if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
+ vma_is_anonymous(vma)) {
pgtable_t pgtable;
pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
pgtable_trans_huge_deposit(mm, new_pmd, pgtable);
@@ -2835,6 +2836,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pgtable_t pgtable;
pmd_t _pmd;
bool young, write, dirty;
+ unsigned long addr;
int i;
VM_BUG_ON(haddr & ~HPAGE_PMD_MASK);
@@ -2860,10 +2862,11 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
young = pmd_young(*pmd);
dirty = pmd_dirty(*pmd);
+ pmdp_huge_split_prepare(vma, haddr, pmd);
pgtable = pgtable_trans_huge_withdraw(mm, pmd);
pmd_populate(mm, &_pmd, pgtable);
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ for (i = 0, addr = haddr; i < HPAGE_PMD_NR; i++, addr += PAGE_SIZE) {
pte_t entry, *pte;
/*
* Note that NUMA hinting access restrictions are not
@@ -2884,9 +2887,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
}
if (dirty)
SetPageDirty(page + i);
- pte = pte_offset_map(&_pmd, haddr);
+ pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
- set_pte_at(mm, haddr, pte, entry);
+ set_pte_at(mm, addr, pte, entry);
atomic_inc(&page[i]._mapcount);
pte_unmap(pte);
}
@@ -2936,7 +2939,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pmd_populate(mm, pmd, pgtable);
if (freeze) {
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
+ for (i = 0; i < HPAGE_PMD_NR; i++) {
page_remove_rmap(page + i, false);
put_page(page + i);
}
@@ -3217,28 +3220,26 @@ static void unfreeze_page(struct anon_vma *anon_vma, struct page *page)
}
}
-static int __split_huge_page_tail(struct page *head, int tail,
+static void __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
- int mapcount;
struct page *page_tail = head + tail;
- mapcount = atomic_read(&page_tail->_mapcount) + 1;
+ VM_BUG_ON_PAGE(atomic_read(&page_tail->_mapcount) != -1, page_tail);
VM_BUG_ON_PAGE(atomic_read(&page_tail->_count) != 0, page_tail);
/*
* tail_page->_count is zero and not changing from under us. But
* get_page_unless_zero() may be running from under us on the
- * tail_page. If we used atomic_set() below instead of atomic_add(), we
+ * tail_page. If we used atomic_set() below instead of atomic_inc(), we
* would then run atomic_set() concurrently with
* get_page_unless_zero(), and atomic_set() is implemented in C not
* using locked ops. spin_unlock on x86 sometime uses locked ops
* because of PPro errata 66, 92, so unless somebody can guarantee
* atomic_set() here would be safe on all archs (and not only on x86),
- * it's safer to use atomic_add().
+ * it's safer to use atomic_inc().
*/
- atomic_add(mapcount + 1, &page_tail->_count);
-
+ atomic_inc(&page_tail->_count);
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -3272,8 +3273,6 @@ static int __split_huge_page_tail(struct page *head, int tail,
page_tail->index = head->index + tail;
page_cpupid_xchg_last(page_tail, page_cpupid_last(head));
lru_add_page_tail(head, page_tail, lruvec, list);
-
- return mapcount;
}
static void __split_huge_page(struct page *page, struct list_head *list)
@@ -3281,7 +3280,7 @@ static void __split_huge_page(struct page *page, struct list_head *list)
struct page *head = compound_head(page);
struct zone *zone = page_zone(head);
struct lruvec *lruvec;
- int i, tail_mapcount;
+ int i;
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irq(&zone->lru_lock);
@@ -3290,10 +3289,8 @@ static void __split_huge_page(struct page *page, struct list_head *list)
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
- tail_mapcount = 0;
for (i = HPAGE_PMD_NR - 1; i >= 1; i--)
- tail_mapcount += __split_huge_page_tail(head, i, lruvec, list);
- atomic_sub(tail_mapcount, &head->_count);
+ __split_huge_page_tail(head, i, lruvec, list);
ClearPageCompound(head);
spin_unlock_irq(&zone->lru_lock);
@@ -3358,6 +3355,7 @@ int total_mapcount(struct page *page)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
struct anon_vma *anon_vma;
int count, mapcount, ret;
bool mlocked;
@@ -3401,19 +3399,19 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
lru_add_drain();
/* Prevent deferred_split_scan() touching ->_count */
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
count = page_count(head);
mapcount = total_mapcount(head);
if (!mapcount && count == 1) {
if (!list_empty(page_deferred_list(head))) {
- split_queue_len--;
+ pgdata->split_queue_len--;
list_del(page_deferred_list(head));
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
__split_huge_page(page, list);
ret = 0;
} else if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
pr_alert("total_mapcount: %u, page_count(): %u\n",
mapcount, count);
if (PageTail(page))
@@ -3421,7 +3419,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
dump_page(page, "total_mapcount(head) > 0");
BUG();
} else {
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
unfreeze_page(anon_vma, head);
ret = -EBUSY;
}
@@ -3436,64 +3434,65 @@ out:
void free_transhuge_page(struct page *page)
{
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
unsigned long flags;
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (!list_empty(page_deferred_list(page))) {
- split_queue_len--;
+ pgdata->split_queue_len--;
list_del(page_deferred_list(page));
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
free_compound_page(page);
}
void deferred_split_huge_page(struct page *page)
{
+ struct pglist_data *pgdata = NODE_DATA(page_to_nid(page));
unsigned long flags;
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- spin_lock_irqsave(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
if (list_empty(page_deferred_list(page))) {
- list_add_tail(page_deferred_list(page), &split_queue);
- split_queue_len++;
+ list_add_tail(page_deferred_list(page), &pgdata->split_queue);
+ pgdata->split_queue_len++;
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
}
static unsigned long deferred_split_count(struct shrinker *shrink,
struct shrink_control *sc)
{
- /*
- * Split a page from split_queue will free up at least one page,
- * at most HPAGE_PMD_NR - 1. We don't track exact number.
- * Let's use HPAGE_PMD_NR / 2 as ballpark.
- */
- return ACCESS_ONCE(split_queue_len) * HPAGE_PMD_NR / 2;
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
+ return ACCESS_ONCE(pgdata->split_queue_len);
}
static unsigned long deferred_split_scan(struct shrinker *shrink,
struct shrink_control *sc)
{
+ struct pglist_data *pgdata = NODE_DATA(sc->nid);
unsigned long flags;
LIST_HEAD(list), *pos, *next;
struct page *page;
int split = 0;
- spin_lock_irqsave(&split_queue_lock, flags);
- list_splice_init(&split_queue, &list);
-
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
/* Take pin on all head pages to avoid freeing them under us */
- list_for_each_safe(pos, next, &list) {
+ list_for_each_safe(pos, next, &pgdata->split_queue) {
page = list_entry((void *)pos, struct page, mapping);
page = compound_head(page);
- /* race with put_compound_page() */
- if (!get_page_unless_zero(page)) {
+ if (get_page_unless_zero(page)) {
+ list_move(page_deferred_list(page), &list);
+ } else {
+ /* We lost race with put_compound_page() */
list_del_init(page_deferred_list(page));
- split_queue_len--;
+ pgdata->split_queue_len--;
}
+ if (!--sc->nr_to_scan)
+ break;
}
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
list_for_each_safe(pos, next, &list) {
page = list_entry((void *)pos, struct page, mapping);
@@ -3505,17 +3504,24 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
put_page(page);
}
- spin_lock_irqsave(&split_queue_lock, flags);
- list_splice_tail(&list, &split_queue);
- spin_unlock_irqrestore(&split_queue_lock, flags);
+ spin_lock_irqsave(&pgdata->split_queue_lock, flags);
+ list_splice_tail(&list, &pgdata->split_queue);
+ spin_unlock_irqrestore(&pgdata->split_queue_lock, flags);
- return split * HPAGE_PMD_NR / 2;
+ /*
+ * Stop shrinker if we didn't split any page, but the queue is empty.
+ * This can happen if pages were freed under us.
+ */
+ if (!split && list_empty(&pgdata->split_queue))
+ return SHRINK_STOP;
+ return split;
}
static struct shrinker deferred_split_shrinker = {
.count_objects = deferred_split_count,
.scan_objects = deferred_split_scan,
.seeks = DEFAULT_SEEKS,
+ .flags = SHRINKER_NUMA_AWARE,
};
#ifdef CONFIG_DEBUG_FS