From b8aeec34175fc8fe8b0d40efea4846dfc1ba663e Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Thu, 7 Oct 2010 15:31:31 +0900 Subject: HWPOISON/signalfd: add support for addr_lsb Similar change as to signal delivery: copy out the si_addr_lsb field to user space in signalfd Signed-off-by: Hidetoshi Seto Signed-off-by: Andi Kleen diff --git a/fs/signalfd.c b/fs/signalfd.c index 1c5a6ad..bdd4496 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -99,6 +99,16 @@ static int signalfd_copyinfo(struct signalfd_siginfo __user *uinfo, #ifdef __ARCH_SI_TRAPNO err |= __put_user(kinfo->si_trapno, &uinfo->ssi_trapno); #endif +#ifdef BUS_MCEERR_AO + /* + * Other callers might not initialize the si_lsb field, + * so check explicitly for the right codes here. + */ + if (kinfo->si_code == BUS_MCEERR_AR || + kinfo->si_code == BUS_MCEERR_AO) + err |= __put_user((short) kinfo->si_addr_lsb, + &uinfo->ssi_addr_lsb); +#endif break; case __SI_CHLD: err |= __put_user(kinfo->si_pid, &uinfo->ssi_pid); diff --git a/include/linux/signalfd.h b/include/linux/signalfd.h index b363b91..3ff4961 100644 --- a/include/linux/signalfd.h +++ b/include/linux/signalfd.h @@ -33,6 +33,7 @@ struct signalfd_siginfo { __u64 ssi_utime; __u64 ssi_stime; __u64 ssi_addr; + __u16 ssi_addr_lsb; /* * Pad strcture to 128 bytes. Remember to update the @@ -43,7 +44,7 @@ struct signalfd_siginfo { * comes out of a read(2) and we really don't want to have * a compat on read(2). */ - __u8 __pad[48]; + __u8 __pad[46]; }; -- cgit v0.10.2 From 998b4382c1d75a6fd3b0e334dae3ab33bd074d99 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:32 +0900 Subject: hugetlb: fix metadata corruption in hugetlb_fault() Since the PageHWPoison() check is for avoiding hwpoisoned page remained in pagecache mapping to the process, it should be done in "found in pagecache" branch, not in the common path. Otherwise, metadata corruption occurs if memory failure happens between alloc_huge_page() and lock_page() because page fault fails with metadata changes remained (such as refcount, mapcount, etc.) This patch moves the check to "found in pagecache" branch and fix the problem. ChangeLog since v2: - remove retry check in "new allocation" path. - make description more detailed - change patch name from "HWPOISON, hugetlb: move PG_HWPoison bit check" Signed-off-by: Naoya Horiguchi Signed-off-by: Jun'ichi Nomura Acked-by: Mel Gorman Reviewed-by: Wu Fengguang Reviewed-by: Christoph Lameter Signed-off-by: Andi Kleen diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c032738..bd031a4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2515,22 +2515,19 @@ retry: hugepage_add_new_anon_rmap(page, vma, address); } } else { + /* + * If memory error occurs between mmap() and fault, some process + * don't have hwpoisoned swap entry for errored virtual address. + * So we need to block hugepage fault by PG_hwpoison bit check. + */ + if (unlikely(PageHWPoison(page))) { + ret = VM_FAULT_HWPOISON; + goto backout_unlocked; + } page_dup_rmap(page); } /* - * Since memory error handler replaces pte into hwpoison swap entry - * at the time of error handling, a process which reserved but not have - * the mapping to the error hugepage does not have hwpoison swap entry. - * So we need to block accesses from such a process by checking - * PG_hwpoison bit here. - */ - if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON; - goto backout_unlocked; - } - - /* * If we are going to COW a private mapping later, we examine the * pending reservations for this page now. This will ensure that * any allocations necessary to record that reservation occur outside -- cgit v0.10.2 From bf50bab2b34483316162443587b8467952e07730 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:33 +0900 Subject: hugetlb: add allocate function for hugepage migration We can't use existing hugepage allocation functions to allocate hugepage for page migration, because page migration can happen asynchronously with the running processes and page migration users should call the allocation function with physical addresses (not virtual addresses) as arguments. ChangeLog since v3: - unify alloc_buddy_huge_page() and alloc_buddy_huge_page_node() ChangeLog since v2: - remove unnecessary get/put_mems_allowed() (thanks to David Rientjes) ChangeLog since v1: - add comment on top of alloc_huge_page_no_vma() Signed-off-by: Naoya Horiguchi Acked-by: Mel Gorman Signed-off-by: Jun'ichi Nomura Reviewed-by: Christoph Lameter Signed-off-by: Andi Kleen diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f479700..0b73c53 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -228,6 +228,8 @@ struct huge_bootmem_page { struct hstate *hstate; }; +struct page *alloc_huge_page_node(struct hstate *h, int nid); + /* arch callback */ int __init alloc_bootmem_huge_page(struct hstate *h); @@ -303,6 +305,7 @@ static inline struct hstate *page_hstate(struct page *page) #else struct hstate {}; +#define alloc_huge_page_node(h, nid) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_vma(v) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bd031a4..83fa0c3 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -466,11 +466,23 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } +static struct page *dequeue_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + if (list_empty(&h->hugepage_freelists[nid])) + return NULL; + page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + return page; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) { - int nid; struct page *page = NULL; struct mempolicy *mpol; nodemask_t *nodemask; @@ -496,19 +508,13 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, for_each_zone_zonelist_nodemask(zone, z, zonelist, MAX_NR_ZONES - 1, nodemask) { - nid = zone_to_nid(zone); - if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask) && - !list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - - if (!avoid_reserve) - decrement_hugepage_resv_vma(h, vma); - - break; + if (cpuset_zone_allowed_softwall(zone, htlb_alloc_mask)) { + page = dequeue_huge_page_node(h, zone_to_nid(zone)); + if (page) { + if (!avoid_reserve) + decrement_hugepage_resv_vma(h, vma); + break; + } } } err: @@ -770,11 +776,10 @@ static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, return ret; } -static struct page *alloc_buddy_huge_page(struct hstate *h, - struct vm_area_struct *vma, unsigned long address) +static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) { struct page *page; - unsigned int nid; + unsigned int r_nid; if (h->order >= MAX_ORDER) return NULL; @@ -812,9 +817,14 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } spin_unlock(&hugetlb_lock); - page = alloc_pages(htlb_alloc_mask|__GFP_COMP| - __GFP_REPEAT|__GFP_NOWARN, - huge_page_order(h)); + if (nid == NUMA_NO_NODE) + page = alloc_pages(htlb_alloc_mask|__GFP_COMP| + __GFP_REPEAT|__GFP_NOWARN, + huge_page_order(h)); + else + page = alloc_pages_exact_node(nid, + htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| + __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page && arch_prepare_hugepage(page)) { __free_pages(page, huge_page_order(h)); @@ -829,13 +839,13 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, */ put_page_testzero(page); VM_BUG_ON(page_count(page)); - nid = page_to_nid(page); + r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* * We incremented the global counters already */ - h->nr_huge_pages_node[nid]++; - h->surplus_huge_pages_node[nid]++; + h->nr_huge_pages_node[r_nid]++; + h->surplus_huge_pages_node[r_nid]++; __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; @@ -848,6 +858,25 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, } /* + * This allocation function is useful in the context where vma is irrelevant. + * E.g. soft-offlining uses this function because it only cares physical + * address of error page. + */ +struct page *alloc_huge_page_node(struct hstate *h, int nid) +{ + struct page *page; + + spin_lock(&hugetlb_lock); + page = dequeue_huge_page_node(h, nid); + spin_unlock(&hugetlb_lock); + + if (!page) + page = alloc_buddy_huge_page(h, nid); + + return page; +} + +/* * Increase the hugetlb pool such that it can accomodate a reservation * of size 'delta'. */ @@ -871,7 +900,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = alloc_buddy_huge_page(h, NULL, 0); + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { /* * We were not able to allocate enough pages to @@ -1052,7 +1081,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, spin_unlock(&hugetlb_lock); if (!page) { - page = alloc_buddy_huge_page(h, vma, addr); + page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_put_quota(inode->i_mapping, chg); return ERR_PTR(-VM_FAULT_SIGBUS); -- cgit v0.10.2 From 0ebabb416f585ace711769057422af4bbc9d1110 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:34 +0900 Subject: hugetlb: redefine hugepage copy functions This patch modifies hugepage copy functions to have only destination and source hugepages as arguments for later use. The old ones are renamed from copy_{gigantic,huge}_page() to copy_user_{gigantic,huge}_page(). This naming convention is consistent with that between copy_highpage() and copy_user_highpage(). ChangeLog since v4: - add blank line between local declaration and code - remove unnecessary might_sleep() ChangeLog since v2: - change copy_huge_page() from macro to inline dummy function to avoid compile warning when !CONFIG_HUGETLB_PAGE. Signed-off-by: Naoya Horiguchi Acked-by: Mel Gorman Reviewed-by: Christoph Lameter Signed-off-by: Andi Kleen diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0b73c53..9e51f77 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -44,6 +44,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, int acctflags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); void __isolate_hwpoisoned_huge_page(struct page *page); +void copy_huge_page(struct page *dst, struct page *src); extern unsigned long hugepages_treat_as_movable; extern const unsigned long hugetlb_zero, hugetlb_infinity; @@ -102,6 +103,9 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 #define __isolate_hwpoisoned_huge_page(page) 0 +static inline void copy_huge_page(struct page *dst, struct page *src) +{ +} #define hugetlb_change_protection(vma, address, end, newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 83fa0c3..a73dbdc 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -423,14 +423,14 @@ static void clear_huge_page(struct page *page, } } -static void copy_gigantic_page(struct page *dst, struct page *src, +static void copy_user_gigantic_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); struct page *dst_base = dst; struct page *src_base = src; - might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); @@ -440,14 +440,15 @@ static void copy_gigantic_page(struct page *dst, struct page *src, src = mem_map_next(src, src_base, i); } } -static void copy_huge_page(struct page *dst, struct page *src, + +static void copy_user_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_gigantic_page(dst, src, addr, vma); + copy_user_gigantic_page(dst, src, addr, vma); return; } @@ -458,6 +459,40 @@ static void copy_huge_page(struct page *dst, struct page *src, } } +static void copy_gigantic_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_highpage(dst, src); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_huge_page(struct page *dst, struct page *src) +{ + int i; + struct hstate *h = page_hstate(src); + + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); i++) { + cond_resched(); + copy_highpage(dst + i, src + i); + } +} + static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); @@ -2412,7 +2447,7 @@ retry_avoidcopy: if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - copy_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); /* -- cgit v0.10.2 From 290408d4a25002f099efeee7b6a5778d431154d6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:35 +0900 Subject: hugetlb: hugepage migration core This patch extends page migration code to support hugepage migration. One of the potential users of this feature is soft offlining which is triggered by memory corrected errors (added by the next patch.) Todo: - there are other users of page migration such as memory policy, memory hotplug and memocy compaction. They are not ready for hugepage support for now. ChangeLog since v4: - define migrate_huge_pages() - remove changes on isolation/putback_lru_page() ChangeLog since v2: - refactor isolate/putback_lru_page() to handle hugepage - add comment about race on unmap_and_move_huge_page() ChangeLog since v1: - divide migration code path for hugepage - define routine checking migration swap entry for hugetlb - replace "goto" with "if/else" in remove_migration_pte() Signed-off-by: Naoya Horiguchi Signed-off-by: Jun'ichi Nomura Acked-by: Mel Gorman Signed-off-by: Andi Kleen diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6e5bd42..1f7ca50 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -573,6 +574,19 @@ static int hugetlbfs_set_page_dirty(struct page *page) return 0; } +static int hugetlbfs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int rc; + + rc = migrate_huge_page_move_mapping(mapping, newpage, page); + if (rc) + return rc; + migrate_page_copy(newpage, page); + + return 0; +} + static int hugetlbfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(dentry->d_sb); @@ -659,6 +673,7 @@ static const struct address_space_operations hugetlbfs_aops = { .write_begin = hugetlbfs_write_begin, .write_end = hugetlbfs_write_end, .set_page_dirty = hugetlbfs_set_page_dirty, + .migratepage = hugetlbfs_migrate_page, }; diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 7238231..3c1941e 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -14,6 +14,8 @@ extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, int offlining); +extern int migrate_huge_pages(struct list_head *l, new_page_t x, + unsigned long private, int offlining); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -23,12 +25,17 @@ extern int migrate_prep_local(void); extern int migrate_vmas(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to, unsigned long flags); +extern void migrate_page_copy(struct page *newpage, struct page *page); +extern int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page); #else #define PAGE_MIGRATION 0 static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, int offlining) { return -ENOSYS; } +static inline int migrate_huge_pages(struct list_head *l, new_page_t x, + unsigned long private, int offlining) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } @@ -40,6 +47,15 @@ static inline int migrate_vmas(struct mm_struct *mm, return -ENOSYS; } +static inline void migrate_page_copy(struct page *newpage, + struct page *page) {} + +extern int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + return -ENOSYS; +} + /* Possible settings for the migrate_page() method in address_operations */ #define migrate_page NULL #define fail_migrate_page NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a73dbdc..0fa9de8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2217,6 +2217,19 @@ nomem: return -ENOMEM; } +static int is_hugetlb_entry_migration(pte_t pte) +{ + swp_entry_t swp; + + if (huge_pte_none(pte) || pte_present(pte)) + return 0; + swp = pte_to_swp_entry(pte); + if (non_swap_entry(swp) && is_migration_entry(swp)) { + return 1; + } else + return 0; +} + static int is_hugetlb_entry_hwpoisoned(pte_t pte) { swp_entry_t swp; @@ -2648,7 +2661,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, address); if (ptep) { entry = huge_ptep_get(ptep); - if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) + if (unlikely(is_hugetlb_entry_migration(entry))) { + migration_entry_wait(mm, (pmd_t *)ptep, address); + return 0; + } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON; } diff --git a/mm/migrate.c b/mm/migrate.c index 38e7cad..55dbc45 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include "internal.h" @@ -95,26 +96,34 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte_t *ptep, pte; spinlock_t *ptl; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) - goto out; + if (unlikely(PageHuge(new))) { + ptep = huge_pte_offset(mm, addr); + if (!ptep) + goto out; + ptl = &mm->page_table_lock; + } else { + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; - pmd = pmd_offset(pud, addr); - if (!pmd_present(*pmd)) - goto out; + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; - ptep = pte_offset_map(pmd, addr); + ptep = pte_offset_map(pmd, addr); - if (!is_swap_pte(*ptep)) { - pte_unmap(ptep); - goto out; - } + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + goto out; + } + + ptl = pte_lockptr(mm, pmd); + } - ptl = pte_lockptr(mm, pmd); spin_lock(ptl); pte = *ptep; if (!is_swap_pte(pte)) @@ -130,10 +139,17 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); + if (PageHuge(new)) + pte = pte_mkhuge(pte); flush_cache_page(vma, addr, pte_pfn(pte)); set_pte_at(mm, addr, ptep, pte); - if (PageAnon(new)) + if (PageHuge(new)) { + if (PageAnon(new)) + hugepage_add_anon_rmap(new, vma, addr); + else + page_dup_rmap(new); + } else if (PageAnon(new)) page_add_anon_rmap(new, vma, addr); else page_add_file_rmap(new); @@ -276,11 +292,59 @@ static int migrate_page_move_mapping(struct address_space *mapping, } /* + * The expected number of remaining references is the same as that + * of migrate_page_move_mapping(). + */ +int migrate_huge_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + int expected_count; + void **pslot; + + if (!mapping) { + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } + + spin_lock_irq(&mapping->tree_lock); + + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(page)); + + expected_count = 2 + page_has_private(page); + if (page_count(page) != expected_count || + (struct page *)radix_tree_deref_slot(pslot) != page) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } + + get_page(newpage); + + radix_tree_replace_slot(pslot, newpage); + + page_unfreeze_refs(page, expected_count); + + __put_page(page); + + spin_unlock_irq(&mapping->tree_lock); + return 0; +} + +/* * Copy the page to its new location */ -static void migrate_page_copy(struct page *newpage, struct page *page) +void migrate_page_copy(struct page *newpage, struct page *page) { - copy_highpage(newpage, page); + if (PageHuge(page)) + copy_huge_page(newpage, page); + else + copy_highpage(newpage, page); if (PageError(page)) SetPageError(newpage); @@ -724,6 +788,92 @@ move_newpage: } /* + * Counterpart of unmap_and_move_page() for hugepage migration. + * + * This function doesn't wait the completion of hugepage I/O + * because there is no race between I/O and migration for hugepage. + * Note that currently hugepage I/O occurs only in direct I/O + * where no lock is held and PG_writeback is irrelevant, + * and writeback status of all subpages are counted in the reference + * count of the head page (i.e. if all subpages of a 2MB hugepage are + * under direct I/O, the reference of the head page is 512 and a bit more.) + * This means that when we try to migrate hugepage whose subpages are + * doing direct I/O, some references remain after try_to_unmap() and + * hugepage migration fails without data corruption. + * + * There is also no race when direct I/O is issued on the page under migration, + * because then pte is replaced with migration swap entry and direct I/O code + * will wait in the page fault for migration to complete. + */ +static int unmap_and_move_huge_page(new_page_t get_new_page, + unsigned long private, struct page *hpage, + int force, int offlining) +{ + int rc = 0; + int *result = NULL; + struct page *new_hpage = get_new_page(hpage, private, &result); + int rcu_locked = 0; + struct anon_vma *anon_vma = NULL; + + if (!new_hpage) + return -ENOMEM; + + rc = -EAGAIN; + + if (!trylock_page(hpage)) { + if (!force) + goto out; + lock_page(hpage); + } + + if (PageAnon(hpage)) { + rcu_read_lock(); + rcu_locked = 1; + + if (page_mapped(hpage)) { + anon_vma = page_anon_vma(hpage); + atomic_inc(&anon_vma->external_refcount); + } + } + + try_to_unmap(hpage, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + + if (!page_mapped(hpage)) + rc = move_to_new_page(new_hpage, hpage, 1); + + if (rc) + remove_migration_ptes(hpage, hpage); + + if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, + &anon_vma->lock)) { + int empty = list_empty(&anon_vma->head); + spin_unlock(&anon_vma->lock); + if (empty) + anon_vma_free(anon_vma); + } + + if (rcu_locked) + rcu_read_unlock(); +out: + unlock_page(hpage); + + if (rc != -EAGAIN) { + list_del(&hpage->lru); + put_page(hpage); + } + + put_page(new_hpage); + + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(new_hpage); + } + return rc; +} + +/* * migrate_pages * * The function takes one list of pages to migrate and a function @@ -788,6 +938,52 @@ out: return nr_failed + retry; } +int migrate_huge_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private, int offlining) +{ + int retry = 1; + int nr_failed = 0; + int pass = 0; + struct page *page; + struct page *page2; + int rc; + + for (pass = 0; pass < 10 && retry; pass++) { + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + cond_resched(); + + rc = unmap_and_move_huge_page(get_new_page, + private, page, pass > 2, offlining); + + switch(rc) { + case -ENOMEM: + goto out; + case -EAGAIN: + retry++; + break; + case 0: + break; + default: + /* Permanent failure */ + nr_failed++; + break; + } + } + } + rc = 0; +out: + + list_for_each_entry_safe(page, page2, from, lru) + put_page(page); + + if (rc) + return rc; + + return nr_failed + retry; +} + #ifdef CONFIG_NUMA /* * Move a list of individual pages -- cgit v0.10.2 From 6de2b1aab94355482bd2accdc115666509667458 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:36 +0900 Subject: HWPOISON, hugetlb: add free check to dequeue_hwpoison_huge_page() This check is necessary to avoid race between dequeue and allocation, which can cause a free hugepage to be dequeued twice and get kernel unstable. Signed-off-by: Naoya Horiguchi Signed-off-by: Wu Fengguang Acked-by: Mel Gorman Reviewed-by: Christoph Lameter Signed-off-by: Andi Kleen diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 9e51f77..796f30e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -43,7 +43,7 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to, struct vm_area_struct *vma, int acctflags); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); -void __isolate_hwpoisoned_huge_page(struct page *page); +int dequeue_hwpoisoned_huge_page(struct page *page); void copy_huge_page(struct page *dst, struct page *src); extern unsigned long hugepages_treat_as_movable; @@ -102,7 +102,7 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 -#define __isolate_hwpoisoned_huge_page(page) 0 +#define dequeue_hwpoisoned_huge_page(page) 0 static inline void copy_huge_page(struct page *dst, struct page *src) { } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0fa9de8..deb7beb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2955,18 +2955,39 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_acct_memory(h, -(chg - freed)); } +/* Should be called in hugetlb_lock */ +static int is_hugepage_on_freelist(struct page *hpage) +{ + struct page *page; + struct page *tmp; + struct hstate *h = page_hstate(hpage); + int nid = page_to_nid(hpage); + + list_for_each_entry_safe(page, tmp, &h->hugepage_freelists[nid], lru) + if (page == hpage) + return 1; + return 0; +} + +#ifdef CONFIG_MEMORY_FAILURE /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. */ -void __isolate_hwpoisoned_huge_page(struct page *hpage) +int dequeue_hwpoisoned_huge_page(struct page *hpage) { struct hstate *h = page_hstate(hpage); int nid = page_to_nid(hpage); + int ret = -EBUSY; spin_lock(&hugetlb_lock); - list_del(&hpage->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; + if (is_hugepage_on_freelist(hpage)) { + list_del(&hpage->lru); + h->free_huge_pages--; + h->free_huge_pages_node[nid]--; + ret = 0; + } spin_unlock(&hugetlb_lock); + return ret; } +#endif diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 757f6b0..5c7158a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -698,6 +698,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) */ static int me_huge_page(struct page *p, unsigned long pfn) { + int res = 0; struct page *hpage = compound_head(p); /* * We can safely recover from error on free or reserved (i.e. @@ -710,8 +711,9 @@ static int me_huge_page(struct page *p, unsigned long pfn) * so there is no race between isolation and mapping/unmapping. */ if (!(page_mapping(hpage) || PageAnon(hpage))) { - __isolate_hwpoisoned_huge_page(hpage); - return RECOVERED; + res = dequeue_hwpoisoned_huge_page(hpage); + if (!res) + return RECOVERED; } return DELAYED; } -- cgit v0.10.2 From a9869b837c098732bad84939015c0eb391b23e41 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:37 +0900 Subject: hugetlb: move refcounting in hugepage allocation inside hugetlb_lock Currently alloc_huge_page() raises page refcount outside hugetlb_lock. but it causes race when dequeue_hwpoison_huge_page() runs concurrently with alloc_huge_page(). To avoid it, this patch moves set_page_refcounted() in hugetlb_lock. Signed-off-by: Naoya Horiguchi Signed-off-by: Wu Fengguang Acked-by: Mel Gorman Reviewed-by: Christoph Lameter Signed-off-by: Andi Kleen diff --git a/mm/hugetlb.c b/mm/hugetlb.c index deb7beb..636be5d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -509,6 +509,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) return NULL; page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); list_del(&page->lru); + set_page_refcounted(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; return page; @@ -868,12 +869,6 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_lock(&hugetlb_lock); if (page) { - /* - * This page is now managed by the hugetlb allocator and has - * no users -- drop the buddy allocator's reference. - */ - put_page_testzero(page); - VM_BUG_ON(page_count(page)); r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* @@ -936,16 +931,13 @@ retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_buddy_huge_page(h, NUMA_NO_NODE); - if (!page) { + if (!page) /* * We were not able to allocate enough pages to * satisfy the entire reservation so we free what * we've allocated so far. */ - spin_lock(&hugetlb_lock); - needed = 0; goto free; - } list_add(&page->lru, &surplus_list); } @@ -972,31 +964,31 @@ retry: needed += allocated; h->resv_huge_pages += delta; ret = 0; -free: + + spin_unlock(&hugetlb_lock); /* Free the needed pages to the hugetlb pool */ list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; list_del(&page->lru); + /* + * This page is now managed by the hugetlb allocator and has + * no users -- drop the buddy allocator's reference. + */ + put_page_testzero(page); + VM_BUG_ON(page_count(page)); enqueue_huge_page(h, page); } /* Free unnecessary surplus pages to the buddy allocator */ +free: if (!list_empty(&surplus_list)) { - spin_unlock(&hugetlb_lock); list_for_each_entry_safe(page, tmp, &surplus_list, lru) { list_del(&page->lru); - /* - * The page has a reference count of zero already, so - * call free_huge_page directly instead of using - * put_page. This must be done with hugetlb_lock - * unlocked which is safe because free_huge_page takes - * hugetlb_lock before deciding how to free the page. - */ - free_huge_page(page); + put_page(page); } - spin_lock(&hugetlb_lock); } + spin_lock(&hugetlb_lock); return ret; } @@ -1123,7 +1115,6 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, } } - set_page_refcounted(page); set_page_private(page, (unsigned long) mapping); vma_commit_reservation(h, vma, addr); -- cgit v0.10.2 From 8c6c2ecb44667f7204e9d2b89c4c1f42edc5a196 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:38 +0900 Subject: HWPOSION, hugetlb: recover from free hugepage error when !MF_COUNT_INCREASED Currently error recovery for free hugepage works only for MF_COUNT_INCREASED. This patch enables !MF_COUNT_INCREASED case. Free hugepages can be handled directly by alloc_huge_page() and dequeue_hwpoisoned_huge_page(), and both of them are protected by hugetlb_lock, so there is no race between them. Note that this patch defines the refcount of HWPoisoned hugepage dequeued from freelist is 1, deviated from present 0, thereby we can avoid race between unpoison and memory failure on free hugepage. This is reasonable because unlikely to free buddy pages, free hugepage is governed by hugetlbfs even after error handling finishes. And it also makes unpoison code added in the later patch cleaner. Signed-off-by: Naoya Horiguchi Signed-off-by: Jun'ichi Nomura Acked-by: Mel Gorman Signed-off-by: Andi Kleen diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 636be5d..7123270 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2974,6 +2974,7 @@ int dequeue_hwpoisoned_huge_page(struct page *hpage) spin_lock(&hugetlb_lock); if (is_hugepage_on_freelist(hpage)) { list_del(&hpage->lru); + set_page_refcounted(hpage); h->free_huge_pages--; h->free_huge_pages_node[nid]--; ret = 0; diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5c7158a..333f87d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -983,7 +983,10 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * We need/can do nothing about count=0 pages. * 1) it's a free page, and therefore in safe hand: * prep_new_page() will be the gate keeper. - * 2) it's part of a non-compound high order page. + * 2) it's a free hugepage, which is also safe: + * an affected hugepage will be dequeued from hugepage freelist, + * so there's no concern about reusing it ever after. + * 3) it's part of a non-compound high order page. * Implies some kernel user: cannot stop them from * R/W the page; let's pray that the page has been * used and will be freed some time later. @@ -995,6 +998,24 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) if (is_free_buddy_page(p)) { action_result(pfn, "free buddy", DELAYED); return 0; + } else if (PageHuge(hpage)) { + /* + * Check "just unpoisoned", "filter hit", and + * "race with other subpage." + */ + lock_page_nosync(hpage); + if (!PageHWPoison(hpage) + || (hwpoison_filter(p) && TestClearPageHWPoison(p)) + || (p != hpage && TestSetPageHWPoison(hpage))) { + atomic_long_sub(nr_pages, &mce_bad_pages); + return 0; + } + set_page_hwpoison_huge_page(hpage); + res = dequeue_hwpoisoned_huge_page(hpage); + action_result(pfn, "free huge", + res ? IGNORED : DELAYED); + unlock_page(hpage); + return res; } else { action_result(pfn, "high order kernel", IGNORED); return -EBUSY; @@ -1156,6 +1177,16 @@ int unpoison_memory(unsigned long pfn) nr_pages = 1 << compound_order(page); if (!get_page_unless_zero(page)) { + /* + * Since HWPoisoned hugepage should have non-zero refcount, + * race between memory failure and unpoison seems to happen. + * In such case unpoison fails and memory failure runs + * to the end. + */ + if (PageHuge(page)) { + pr_debug("MCE: Memory failure is now running on free hugepage %#lx\n", pfn); + return 0; + } if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &mce_bad_pages); pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); -- cgit v0.10.2 From d950b95882f3dc47e86f1496cd3f7fef540d6d6b Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:39 +0900 Subject: HWPOISON, hugetlb: soft offlining for hugepage This patch extends soft offlining framework to support hugepage. When memory corrected errors occur repeatedly on a hugepage, we can choose to stop using it by migrating data onto another hugepage and disabling the original (maybe half-broken) one. ChangeLog since v4: - branch soft_offline_page() for hugepage ChangeLog since v3: - remove comment about "ToDo: hugepage soft-offline" ChangeLog since v2: - move refcount handling into isolate_lru_page() ChangeLog since v1: - add double check in isolating hwpoisoned hugepage - define free/non-free checker for hugepage - postpone calling put_page() for hugepage in soft_offline_page() Signed-off-by: Naoya Horiguchi Signed-off-by: Jun'ichi Nomura Acked-by: Mel Gorman Signed-off-by: Andi Kleen diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 333f87d..74eb425 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -693,8 +693,6 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn) * Issues: * - Error on hugepage is contained in hugepage unit (not in raw page unit.) * To narrow down kill region to one page, we need to break up pmd. - * - To support soft-offlining for hugepage, we need to support hugepage - * migration. */ static int me_huge_page(struct page *p, unsigned long pfn) { @@ -1220,7 +1218,11 @@ EXPORT_SYMBOL(unpoison_memory); static struct page *new_page(struct page *p, unsigned long private, int **x) { int nid = page_to_nid(p); - return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); + if (PageHuge(p)) + return alloc_huge_page_node(page_hstate(compound_head(p)), + nid); + else + return alloc_pages_exact_node(nid, GFP_HIGHUSER_MOVABLE, 0); } /* @@ -1248,8 +1250,15 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) * was free. */ set_migratetype_isolate(p); + /* + * When the target page is a free hugepage, just remove it + * from free hugepage list. + */ if (!get_page_unless_zero(compound_head(p))) { - if (is_free_buddy_page(p)) { + if (PageHuge(p)) { + pr_debug("get_any_page: %#lx free huge page\n", pfn); + ret = dequeue_hwpoisoned_huge_page(compound_head(p)); + } else if (is_free_buddy_page(p)) { pr_debug("get_any_page: %#lx free buddy page\n", pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); @@ -1268,6 +1277,45 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) return ret; } +static int soft_offline_huge_page(struct page *page, int flags) +{ + int ret; + unsigned long pfn = page_to_pfn(page); + struct page *hpage = compound_head(page); + LIST_HEAD(pagelist); + + ret = get_any_page(page, pfn, flags); + if (ret < 0) + return ret; + if (ret == 0) + goto done; + + if (PageHWPoison(hpage)) { + put_page(hpage); + pr_debug("soft offline: %#lx hugepage already poisoned\n", pfn); + return -EBUSY; + } + + /* Keep page count to indicate a given hugepage is isolated. */ + + list_add(&hpage->lru, &pagelist); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + if (ret) { + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pfn, ret, page->flags); + if (ret > 0) + ret = -EIO; + return ret; + } +done: + if (!PageHWPoison(hpage)) + atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + set_page_hwpoison_huge_page(hpage); + dequeue_hwpoisoned_huge_page(hpage); + /* keep elevated page count for bad page */ + return ret; +} + /** * soft_offline_page - Soft offline a page. * @page: page to offline @@ -1295,6 +1343,9 @@ int soft_offline_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); + if (PageHuge(page)) + return soft_offline_huge_page(page, flags); + ret = get_any_page(page, pfn, flags); if (ret < 0) return ret; -- cgit v0.10.2 From 6a90181c7b0558b86179c1f5bcf3ab11f9d1bd30 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 8 Sep 2010 10:19:40 +0900 Subject: HWPOISON, hugetlb: fix unpoison for hugepage Currently unpoisoning hugepages doesn't work correctly because clearing PG_HWPoison is done outside if (TestClearPageHWPoison). This patch fixes it. Signed-off-by: Naoya Horiguchi Signed-off-by: Jun'ichi Nomura Acked-by: Mel Gorman Signed-off-by: Andi Kleen diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 74eb425..14589a2 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1202,9 +1202,9 @@ int unpoison_memory(unsigned long pfn) pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; + if (PageHuge(page)) + clear_page_hwpoison_huge_page(page); } - if (PageHuge(p)) - clear_page_hwpoison_huge_page(page); unlock_page(page); put_page(page); -- cgit v0.10.2 From 4e1c19750a8991c66e998a1915f2ad5c391bbd04 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 22 Sep 2010 12:43:56 +0200 Subject: Clean up __page_set_anon_rmap Linus asked for a cleanup of __page_set_anon_rmap to make it look more like the cleaner huge pages version. Factor out the duplicated PageAnon check into a single check at the beginning of the function. Remove obsolete comments and rewrite them into standard English. No functional changes. Signed-off-by: Andi Kleen diff --git a/mm/rmap.c b/mm/rmap.c index 92e6757..8adc6e3 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -780,10 +780,10 @@ void page_move_anon_rmap(struct page *page, } /** - * __page_set_anon_rmap - setup new anonymous rmap - * @page: the page to add the mapping to - * @vma: the vm area in which the mapping is added - * @address: the user virtual address mapped + * __page_set_anon_rmap - set up new anonymous rmap + * @page: Page to add to rmap + * @vma: VM area to add page to. + * @address: User virtual address of the mapping * @exclusive: the page is exclusively owned by the current process */ static void __page_set_anon_rmap(struct page *page, @@ -793,25 +793,16 @@ static void __page_set_anon_rmap(struct page *page, BUG_ON(!anon_vma); + if (PageAnon(page)) + return; + /* * If the page isn't exclusively mapped into this vma, * we must use the _oldest_ possible anon_vma for the * page mapping! */ - if (!exclusive) { - if (PageAnon(page)) - return; + if (!exclusive) anon_vma = anon_vma->root; - } else { - /* - * In this case, swapped-out-but-not-discarded swap-cache - * is remapped. So, no need to update page->mapping here. - * We convice anon_vma poitned by page->mapping is not obsolete - * because vma->anon_vma is necessary to be a family of it. - */ - if (PageAnon(page)) - return; - } anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; page->mapping = (struct address_space *) anon_vma; -- cgit v0.10.2 From d5bd910696a27e505078c19ba6b4143537570681 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 09:00:12 +0200 Subject: hugepage: move is_hugepage_on_freelist inside ifdef to avoid warning Fixes warning reported by Stephen Rothwell mm/hugetlb.c:2950: warning: 'is_hugepage_on_freelist' defined but not used for the !CONFIG_MEMORY_FAILURE case. Signed-off-by: Andi Kleen diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7123270..67cd032 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2946,6 +2946,8 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) hugetlb_acct_memory(h, -(chg - freed)); } +#ifdef CONFIG_MEMORY_FAILURE + /* Should be called in hugetlb_lock */ static int is_hugepage_on_freelist(struct page *hpage) { @@ -2960,7 +2962,6 @@ static int is_hugepage_on_freelist(struct page *hpage) return 0; } -#ifdef CONFIG_MEMORY_FAILURE /* * This function is called from memory failure code. * Assume the caller holds page lock of the head page. -- cgit v0.10.2 From 6f39ce056ab2ab2d29b2fae4aed61ed0b485972f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Thu, 30 Sep 2010 11:54:51 +0900 Subject: Fix build error with !CONFIG_MIGRATION migrate_huge_page_move_mapping() is declared as "extern int ..." in include/linux/migrate.h for !CONFIG_MIGRATION, which causes the build error like below: mm/mprotect.o: In function `migrate_huge_page_move_mapping': mprotect.c:(.text+0x0): multiple definition of `migrate_huge_page_move_mapping' mm/shmem.o:shmem.c:(.text+0x0): first defined here mm/rmap.o: In function `migrate_huge_page_move_mapping': rmap.c:(.text+0x0): multiple definition of `migrate_huge_page_move_mapping' mm/shmem.o:shmem.c:(.text+0x0): first defined here Reported-by: Stephen Rothwell Signed-off-by: Naoya Horiguchi Signed-off-by: Andi Kleen diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 3c1941e..085527f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -50,7 +50,7 @@ static inline int migrate_vmas(struct mm_struct *mm, static inline void migrate_page_copy(struct page *newpage, struct page *page) {} -extern int migrate_huge_page_move_mapping(struct address_space *mapping, +static inline int migrate_huge_page_move_mapping(struct address_space *mapping, struct page *newpage, struct page *page) { return -ENOSYS; -- cgit v0.10.2 From aa50d3a7aa8147b9e14dc9d5972a5d2359db4ef8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 6 Oct 2010 21:45:00 +0200 Subject: Encode huge page size for VM_FAULT_HWPOISON errors This fixes a problem introduced with the hugetlb hwpoison handling The user space SIGBUS signalling wants to know the size of the hugepage that caused a HWPOISON fault. Unfortunately the architecture page fault handlers do not have easy access to the struct page. Pass the information out in the fault error code instead. I added a separate VM_FAULT_HWPOISON_LARGE bit for this case and encode the hpage index in some free upper bits of the fault code. The small page hwpoison keeps stays with the VM_FAULT_HWPOISON name to minimize changes. Also add code to hugetlb.h to convert that index into a page shift. Will be used in a further patch. Cc: Naoya Horiguchi Cc: fengguang.wu@intel.com Signed-off-by: Andi Kleen diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 796f30e..943c76b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -307,6 +307,11 @@ static inline struct hstate *page_hstate(struct page *page) return size_to_hstate(PAGE_SIZE << compound_order(page)); } +static inline unsigned hstate_index_to_shift(unsigned index) +{ + return hstates[index].order + PAGE_SHIFT; +} + #else struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -324,6 +329,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1; } +#define hstate_index_to_shift(index) 0 #endif #endif /* _LINUX_HUGETLB_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 74949fb..f7e9efc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -718,12 +718,20 @@ static inline int page_mapped(struct page *page) #define VM_FAULT_SIGBUS 0x0002 #define VM_FAULT_MAJOR 0x0004 #define VM_FAULT_WRITE 0x0008 /* Special case for get_user_pages */ -#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned page */ +#define VM_FAULT_HWPOISON 0x0010 /* Hit poisoned small page */ +#define VM_FAULT_HWPOISON_LARGE 0x0020 /* Hit poisoned large page. Index encoded in upper bits */ #define VM_FAULT_NOPAGE 0x0100 /* ->fault installed the pte, not return page */ #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ -#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON) +#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ + +#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_HWPOISON | \ + VM_FAULT_HWPOISON_LARGE) + +/* Encode hstate index for a hwpoisoned large page */ +#define VM_FAULT_SET_HINDEX(x) ((x) << 12) +#define VM_FAULT_GET_HINDEX(x) (((x) >> 12) & 0xf) /* * Can be called by the pagefault handler when it gets a VM_FAULT_OOM. diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67cd032..96991de 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2589,7 +2589,8 @@ retry: * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON; + ret = VM_FAULT_HWPOISON | + VM_FAULT_SET_HINDEX(h - hstates); goto backout_unlocked; } page_dup_rmap(page); @@ -2656,7 +2657,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, (pmd_t *)ptep, address); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON; + return VM_FAULT_HWPOISON_LARGE | + VM_FAULT_SET_HINDEX(h - hstates); } ptep = huge_pte_alloc(mm, address, huge_page_size(h)); diff --git a/mm/memory.c b/mm/memory.c index 0e18b4d..c2d6dd3 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1450,7 +1450,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; if (ret & - (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) + (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE| + VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } -- cgit v0.10.2 From f672b49b07a4a152fc4251f2aec6b4d05164c4cd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 22:05:55 +0200 Subject: x86: HWPOISON: Report correct address granuality for huge hwpoison faults An earlier patch fixed the hwpoison fault handling to encode the huge page size in the fault code of the page fault handler. This is needed to report this information in SIGBUS to user space. This is a straight forward patch to pass this information through to the signal handling in the x86 specific fault.c Cc: x86@kernel.org Cc: Naoya Horiguchi Cc: fengguang.wu@intel.com Signed-off-by: Andi Kleen diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 4c4508e..1d15a27 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -11,6 +11,7 @@ #include /* __kprobes, ... */ #include /* kmmio_handler, ... */ #include /* perf_sw_event */ +#include /* hstate_index_to_shift */ #include /* dotraplinkage, ... */ #include /* pgd_*(), ... */ @@ -160,15 +161,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) static void force_sig_info_fault(int si_signo, int si_code, unsigned long address, - struct task_struct *tsk) + struct task_struct *tsk, int fault) { + unsigned lsb = 0; siginfo_t info; info.si_signo = si_signo; info.si_errno = 0; info.si_code = si_code; info.si_addr = (void __user *)address; - info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; + if (fault & VM_FAULT_HWPOISON_LARGE) + lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); + if (fault & VM_FAULT_HWPOISON) + lsb = PAGE_SHIFT; + info.si_addr_lsb = lsb; force_sig_info(si_signo, &info, tsk); } @@ -731,7 +737,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, tsk->thread.error_code = error_code | (address >= TASK_SIZE); tsk->thread.trap_no = 14; - force_sig_info_fault(SIGSEGV, si_code, address, tsk); + force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0); return; } @@ -816,14 +822,14 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, tsk->thread.trap_no = 14; #ifdef CONFIG_MEMORY_FAILURE - if (fault & VM_FAULT_HWPOISON) { + if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) { printk(KERN_ERR "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", tsk->comm, tsk->pid, address); code = BUS_MCEERR_AR; } #endif - force_sig_info_fault(SIGBUS, code, address, tsk); + force_sig_info_fault(SIGBUS, code, address, tsk, fault); } static noinline void @@ -833,7 +839,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (fault & VM_FAULT_OOM) { out_of_memory(regs, error_code, address); } else { - if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) + if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON| + VM_FAULT_HWPOISON_LARGE)) do_sigbus(regs, error_code, address, fault); else BUG(); -- cgit v0.10.2 From 1c80b990a3411733890eff10817e388d5e25e2dd Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 23:09:51 +0200 Subject: HWPOISON: Improve comments in memory-failure.c Clean up and improve the overview comment in memory-failure.c Tidy some grammar issues in other comments. Signed-off-by: Andi Kleen diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 757f6b0..eebb9d8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -7,21 +7,26 @@ * Free Software Foundation. * * High level machine check handler. Handles pages reported by the - * hardware as being corrupted usually due to a 2bit ECC memory or cache + * hardware as being corrupted usually due to a multi-bit ECC memory or cache * failure. + * + * In addition there is a "soft offline" entry point that allows stop using + * not-yet-corrupted-by-suspicious pages without killing anything. * * Handles page cache pages in various states. The tricky part - * here is that we can access any page asynchronous to other VM - * users, because memory failures could happen anytime and anywhere, - * possibly violating some of their assumptions. This is why this code - * has to be extremely careful. Generally it tries to use normal locking - * rules, as in get the standard locks, even if that means the - * error handling takes potentially a long time. - * - * The operation to map back from RMAP chains to processes has to walk - * the complete process list and has non linear complexity with the number - * mappings. In short it can be quite slow. But since memory corruptions - * are rare we hope to get away with this. + * here is that we can access any page asynchronously in respect to + * other VM users, because memory failures could happen anytime and + * anywhere. This could violate some of their assumptions. This is why + * this code has to be extremely careful. Generally it tries to use + * normal locking rules, as in get the standard locks, even if that means + * the error handling takes potentially a long time. + * + * There are several operations here with exponential complexity because + * of unsuitable VM data structures. For example the operation to map back + * from RMAP chains to processes has to walk the complete process list and + * has non linear complexity with the number. But since memory corruptions + * are rare we hope to get away with this. This avoids impacting the core + * VM. */ /* @@ -78,7 +83,7 @@ static int hwpoison_filter_dev(struct page *p) return 0; /* - * page_mapping() does not accept slab page + * page_mapping() does not accept slab pages. */ if (PageSlab(p)) return -EINVAL; -- cgit v0.10.2 From fb46e73520940bfc426152cfe5e4a9f1ae3f00b6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 23:31:30 +0200 Subject: HWPOISON: Convert pr_debugs to pr_info Convert a lot of pr_debugs in memory-failure.c that are generally useful to pr_info. It's reasonable to print at least one message why offlining succeeded or failed by default. Signed-off-by: Andi Kleen diff --git a/mm/memory-failure.c b/mm/memory-failure.c index eebb9d8..0a2ed9a 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -314,7 +314,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p, * a SIGKILL because the error is not contained anymore. */ if (tk->addr == -EFAULT) { - pr_debug("MCE: Unable to find user space address %lx in %s\n", + pr_info("MCE: Unable to find user space address %lx in %s\n", page_to_pfn(p), tsk->comm); tk->addr_valid = 0; } @@ -582,7 +582,7 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) pfn, err); } else if (page_has_private(p) && !try_to_release_page(p, GFP_NOIO)) { - pr_debug("MCE %#lx: failed to release buffers\n", pfn); + pr_info("MCE %#lx: failed to release buffers\n", pfn); } else { ret = RECOVERED; } @@ -1152,7 +1152,7 @@ int unpoison_memory(unsigned long pfn) page = compound_head(p); if (!PageHWPoison(p)) { - pr_debug("MCE: Page was already unpoisoned %#lx\n", pfn); + pr_info("MCE: Page was already unpoisoned %#lx\n", pfn); return 0; } @@ -1161,7 +1161,7 @@ int unpoison_memory(unsigned long pfn) if (!get_page_unless_zero(page)) { if (TestClearPageHWPoison(p)) atomic_long_sub(nr_pages, &mce_bad_pages); - pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn); + pr_info("MCE: Software-unpoisoned free page %#lx\n", pfn); return 0; } @@ -1173,7 +1173,7 @@ int unpoison_memory(unsigned long pfn) * the free buddy page pool. */ if (TestClearPageHWPoison(page)) { - pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn); + pr_info("MCE: Software-unpoisoned page %#lx\n", pfn); atomic_long_sub(nr_pages, &mce_bad_pages); freeit = 1; } @@ -1222,12 +1222,12 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) set_migratetype_isolate(p); if (!get_page_unless_zero(compound_head(p))) { if (is_free_buddy_page(p)) { - pr_debug("get_any_page: %#lx free buddy page\n", pfn); + pr_info("get_any_page: %#lx free buddy page\n", pfn); /* Set hwpoison bit while page is still isolated */ SetPageHWPoison(p); ret = 0; } else { - pr_debug("get_any_page: %#lx: unknown zero refcount page type %lx\n", + pr_info("get_any_page: %#lx: unknown zero refcount page type %lx\n", pfn, p->flags); ret = -EIO; } @@ -1293,7 +1293,7 @@ int soft_offline_page(struct page *page, int flags) goto done; } if (!PageLRU(page)) { - pr_debug("soft_offline: %#lx: unknown non LRU page type %lx\n", + pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n", pfn, page->flags); return -EIO; } @@ -1307,7 +1307,7 @@ int soft_offline_page(struct page *page, int flags) if (PageHWPoison(page)) { unlock_page(page); put_page(page); - pr_debug("soft offline: %#lx page already poisoned\n", pfn); + pr_info("soft offline: %#lx page already poisoned\n", pfn); return -EBUSY; } @@ -1328,7 +1328,7 @@ int soft_offline_page(struct page *page, int flags) put_page(page); if (ret == 1) { ret = 0; - pr_debug("soft_offline: %#lx: invalidated\n", pfn); + pr_info("soft_offline: %#lx: invalidated\n", pfn); goto done; } @@ -1344,13 +1344,13 @@ int soft_offline_page(struct page *page, int flags) list_add(&page->lru, &pagelist); ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); if (ret) { - pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", + pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) ret = -EIO; } } else { - pr_debug("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", + pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n", pfn, ret, page_count(page), page->flags); } if (ret) -- cgit v0.10.2 From 898e70d1e526d7814bd2f64c907706b83ffca9af Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 23:33:29 +0200 Subject: HWPOISON: Disable DEBUG by default Now that only a few obscure messages are left as pr_debug disable outputting of pr_debug in memory-failure.c by default. Signed-off-by: Andi Kleen diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0a2ed9a..77b3e79 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -35,7 +35,6 @@ * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages * - pass bad pages to kdump next kernel */ -#define DEBUG 1 /* remove me in 2.6.34 */ #include #include #include -- cgit v0.10.2 From 9033ae16407f46ae06f559f9374281f6e9d89efc Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 23:36:05 +0200 Subject: HWPOISON: Turn addr_valid from bitfield into char The addr_valid flag is the only flag in "to_kill" and it's slightly more efficient to have it as char instead of a bitfield. Signed-off-by: Andi Kleen diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 77b3e79..88653c9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -272,7 +272,7 @@ struct to_kill { struct list_head nd; struct task_struct *tsk; unsigned long addr; - unsigned addr_valid:1; + char addr_valid; }; /* -- cgit v0.10.2 From a08c80ebb621a6dc277c91e029acb725f2f20254 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Sep 2010 23:39:30 +0200 Subject: HWPOISON: Remove retry loop for try_to_unmap We don't reply in other temporary failure cases and there were no reports of replies happening. I think the original reason it was added was also just an early bug, not an observation of the race. So remove the loop for now, but keep a warning message. Signed-off-by: Andi Kleen diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 88653c9..2044fe8 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -840,8 +840,6 @@ static int page_action(struct page_state *ps, struct page *p, return (result == RECOVERED || result == DELAYED) ? 0 : -EBUSY; } -#define N_UNMAP_TRIES 5 - /* * Do all that is necessary to remove user space mappings. Unmap * the pages and send SIGBUS to the processes if the data was dirty. @@ -853,7 +851,6 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); int ret; - int i; int kill = 1; struct page *hpage = compound_head(p); @@ -907,17 +904,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, if (kill) collect_procs(hpage, &tokill); - /* - * try_to_unmap can fail temporarily due to races. - * Try a few times (RED-PEN better strategy?) - */ - for (i = 0; i < N_UNMAP_TRIES; i++) { - ret = try_to_unmap(hpage, ttu); - if (ret == SWAP_SUCCESS) - break; - pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); - } - + ret = try_to_unmap(hpage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", pfn, page_mapcount(hpage)); -- cgit v0.10.2 From 3ef8fd7f720fc4f462fcdcae2fcde6f1c0536bfe Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Oct 2010 16:03:21 +0200 Subject: Fix migration.c compilation on s390 31bit s390 doesn't have huge pages and failed with: > mm/migrate.c: In function 'remove_migration_pte': > mm/migrate.c:143:3: error: implicit declaration of function 'pte_mkhuge' > mm/migrate.c:143:7: error: incompatible types when assigning to type 'pte_t' from type 'int' Put that code into a ifdef. Reported by Heiko Carstens Signed-off-by: Andi Kleen diff --git a/mm/migrate.c b/mm/migrate.c index 55dbc45..f8c9bcc 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -139,8 +139,10 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); if (is_write_migration_entry(entry)) pte = pte_mkwrite(pte); +#ifdef CONFIG_HUGETLB_PAGE if (PageHuge(new)) pte = pte_mkhuge(pte); +#endif flush_cache_page(vma, addr, pte_pfn(pte)); set_pte_at(mm, addr, ptep, pte); -- cgit v0.10.2 From f7cb8b5ffd16edad64ae16ee38c561f98cbcaa3b Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 11 Oct 2010 19:51:49 +0200 Subject: Add _addr_lsb field to ia64 siginfo The huge pages code uses the siginfo _addr_lsb field, but ia64 was missing it. Add it to ia64 too (over some padding) to avoid compilation failures. Acked-by: Tony Luck Signed-off-by: Andi Kleen diff --git a/arch/ia64/include/asm/siginfo.h b/arch/ia64/include/asm/siginfo.h index 118d429..c8fcaa2 100644 --- a/arch/ia64/include/asm/siginfo.h +++ b/arch/ia64/include/asm/siginfo.h @@ -62,6 +62,7 @@ typedef struct siginfo { int _imm; /* immediate value for "break" */ unsigned int _flags; /* see below */ unsigned long _isr; /* isr */ + short _addr_lsb; /* lsb of faulting address */ } _sigfault; /* SIGPOLL */ -- cgit v0.10.2