summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/cma.h2
-rw-r--r--mm/cma_debug.c11
-rw-r--r--mm/huge_memory.c7
-rw-r--r--mm/kasan/kasan.c2
-rw-r--r--mm/kasan/report.c2
-rw-r--r--mm/memory-failure.c54
-rw-r--r--mm/memory_hotplug.c13
-rw-r--r--mm/migrate.c8
-rw-r--r--mm/page-writeback.c4
-rw-r--r--mm/page_alloc.c67
-rw-r--r--mm/page_owner.c7
-rw-r--r--mm/shmem.c4
-rw-r--r--mm/slab_common.c3
-rw-r--r--mm/vmscan.c16
14 files changed, 123 insertions, 77 deletions
diff --git a/mm/cma.h b/mm/cma.h
index 1132d73..17c75a4 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -16,7 +16,7 @@ struct cma {
extern struct cma cma_areas[MAX_CMA_AREAS];
extern unsigned cma_area_count;
-static unsigned long cma_bitmap_maxno(struct cma *cma)
+static inline unsigned long cma_bitmap_maxno(struct cma *cma)
{
return cma->count >> cma->order_per_bit;
}
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 7621ee3..f8e4b60 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -39,7 +39,7 @@ static int cma_used_get(void *data, u64 *val)
mutex_lock(&cma->lock);
/* pages counter is smaller than sizeof(int) */
- used = bitmap_weight(cma->bitmap, (int)cma->count);
+ used = bitmap_weight(cma->bitmap, (int)cma_bitmap_maxno(cma));
mutex_unlock(&cma->lock);
*val = (u64)used << cma->order_per_bit;
@@ -52,13 +52,14 @@ static int cma_maxchunk_get(void *data, u64 *val)
struct cma *cma = data;
unsigned long maxchunk = 0;
unsigned long start, end = 0;
+ unsigned long bitmap_maxno = cma_bitmap_maxno(cma);
mutex_lock(&cma->lock);
for (;;) {
- start = find_next_zero_bit(cma->bitmap, cma->count, end);
+ start = find_next_zero_bit(cma->bitmap, bitmap_maxno, end);
if (start >= cma->count)
break;
- end = find_next_bit(cma->bitmap, cma->count, start);
+ end = find_next_bit(cma->bitmap, bitmap_maxno, start);
maxchunk = max(end - start, maxchunk);
}
mutex_unlock(&cma->lock);
@@ -170,10 +171,10 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
tmp = debugfs_create_dir(name, cma_debugfs_root);
- debugfs_create_file("alloc", S_IWUSR, cma_debugfs_root, cma,
+ debugfs_create_file("alloc", S_IWUSR, tmp, cma,
&cma_alloc_fops);
- debugfs_create_file("free", S_IWUSR, cma_debugfs_root, cma,
+ debugfs_create_file("free", S_IWUSR, tmp, cma,
&cma_free_fops);
debugfs_create_file("base_pfn", S_IRUGO, tmp,
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c107094..097c7a4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1676,12 +1676,7 @@ static void __split_huge_page_refcount(struct page *page,
/* after clearing PageTail the gup refcount can be released */
smp_mb__after_atomic();
- /*
- * retain hwpoison flag of the poisoned tail page:
- * fix for the unsuitable process killed on Guest Machine(KVM)
- * by the memory-failure.
- */
- page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON;
+ page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (page->flags &
((1L << PG_referenced) |
(1L << PG_swapbacked) |
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
index 6c513a6..7b28e9c 100644
--- a/mm/kasan/kasan.c
+++ b/mm/kasan/kasan.c
@@ -2,7 +2,7 @@
* This file contains shadow memory manipulation code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some of code borrowed from https://github.com/xairy/linux by
* Andrey Konovalov <adech.fo@gmail.com>
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 680ceed..e07c94f 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -2,7 +2,7 @@
* This file contains error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some of code borrowed from https://github.com/xairy/linux by
* Andrey Konovalov <adech.fo@gmail.com>
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index c53543d..1f4446a9 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -909,6 +909,18 @@ int get_hwpoison_page(struct page *page)
* directly for tail pages.
*/
if (PageTransHuge(head)) {
+ /*
+ * Non anonymous thp exists only in allocation/free time. We
+ * can't handle such a case correctly, so let's give it up.
+ * This should be better than triggering BUG_ON when kernel
+ * tries to touch the "partially handled" page.
+ */
+ if (!PageAnon(head)) {
+ pr_err("MCE: %#lx: non anonymous thp\n",
+ page_to_pfn(page));
+ return 0;
+ }
+
if (get_page_unless_zero(head)) {
if (PageTail(page))
get_page(page);
@@ -1134,17 +1146,11 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
}
if (!PageHuge(p) && PageTransHuge(hpage)) {
- if (!PageAnon(hpage)) {
- pr_err("MCE: %#lx: non anonymous thp\n", pfn);
- if (TestClearPageHWPoison(p))
- atomic_long_sub(nr_pages, &num_poisoned_pages);
- put_page(p);
- if (p != hpage)
- put_page(hpage);
- return -EBUSY;
- }
- if (unlikely(split_huge_page(hpage))) {
- pr_err("MCE: %#lx: thp split failed\n", pfn);
+ if (!PageAnon(hpage) || unlikely(split_huge_page(hpage))) {
+ if (!PageAnon(hpage))
+ pr_err("MCE: %#lx: non anonymous thp\n", pfn);
+ else
+ pr_err("MCE: %#lx: thp split failed\n", pfn);
if (TestClearPageHWPoison(p))
atomic_long_sub(nr_pages, &num_poisoned_pages);
put_page(p);
@@ -1209,9 +1215,9 @@ int memory_failure(unsigned long pfn, int trapno, int flags)
if (!PageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: just unpoisoned\n", pfn);
atomic_long_sub(nr_pages, &num_poisoned_pages);
+ unlock_page(hpage);
put_page(hpage);
- res = 0;
- goto out;
+ return 0;
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
@@ -1535,6 +1541,8 @@ static int get_any_page(struct page *page, unsigned long pfn, int flags)
*/
ret = __get_any_page(page, pfn, 0);
if (!PageLRU(page)) {
+ /* Drop page reference which is from __get_any_page() */
+ put_page(page);
pr_info("soft_offline: %#lx: unknown non LRU page type %lx\n",
pfn, page->flags);
return -EIO;
@@ -1564,13 +1572,12 @@ static int soft_offline_huge_page(struct page *page, int flags)
unlock_page(hpage);
ret = isolate_huge_page(hpage, &pagelist);
- if (ret) {
- /*
- * get_any_page() and isolate_huge_page() takes a refcount each,
- * so need to drop one here.
- */
- put_page(hpage);
- } else {
+ /*
+ * get_any_page() and isolate_huge_page() takes a refcount each,
+ * so need to drop one here.
+ */
+ put_page(hpage);
+ if (!ret) {
pr_info("soft offline: %#lx hugepage failed to isolate\n", pfn);
return -EBUSY;
}
@@ -1656,6 +1663,8 @@ static int __soft_offline_page(struct page *page, int flags)
inc_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
list_add(&page->lru, &pagelist);
+ if (!TestSetPageHWPoison(page))
+ atomic_long_inc(&num_poisoned_pages);
ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL,
MIGRATE_SYNC, MR_MEMORY_FAILURE);
if (ret) {
@@ -1670,9 +1679,8 @@ static int __soft_offline_page(struct page *page, int flags)
pfn, ret, page->flags);
if (ret > 0)
ret = -EIO;
- } else {
- SetPageHWPoison(page);
- atomic_long_inc(&num_poisoned_pages);
+ if (TestClearPageHWPoison(page))
+ atomic_long_dec(&num_poisoned_pages);
}
} else {
pr_info("soft offline: %#lx: isolation failed: %d, page count %d, type %lx\n",
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 26fbba7..6da82bc 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -446,7 +446,7 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
int nr_pages = PAGES_PER_SECTION;
int nid = pgdat->node_id;
int zone_type;
- unsigned long flags;
+ unsigned long flags, pfn;
int ret;
zone_type = zone - pgdat->node_zones;
@@ -461,6 +461,14 @@ static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
pgdat_resize_unlock(zone->zone_pgdat, &flags);
memmap_init_zone(nr_pages, nid, zone_type,
phys_start_pfn, MEMMAP_HOTPLUG);
+
+ /* online_page_range is called later and expects pages reserved */
+ for (pfn = phys_start_pfn; pfn < phys_start_pfn + nr_pages; pfn++) {
+ if (!pfn_valid(pfn))
+ continue;
+
+ SetPageReserved(pfn_to_page(pfn));
+ }
return 0;
}
@@ -1269,6 +1277,7 @@ int __ref add_memory(int nid, u64 start, u64 size)
/* create new memmap entry */
firmware_map_add_hotplug(start, start + size, "System RAM");
+ memblock_add_node(start, size, nid);
goto out;
@@ -2005,6 +2014,8 @@ void __ref remove_memory(int nid, u64 start, u64 size)
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
+ memblock_free(start, size);
+ memblock_remove(start, size);
arch_remove_memory(start, size);
diff --git a/mm/migrate.c b/mm/migrate.c
index ee401e4..eb42671 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -880,7 +880,8 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes or remove ptes */
if (page_mapped(page)) {
try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS|
+ TTU_IGNORE_HWPOISON);
page_was_mapped = 1;
}
@@ -950,7 +951,10 @@ out:
list_del(&page->lru);
dec_zone_page_state(page, NR_ISOLATED_ANON +
page_is_file_cache(page));
- if (reason != MR_MEMORY_FAILURE)
+ /* Soft-offlined page shouldn't go through lru cache list */
+ if (reason == MR_MEMORY_FAILURE)
+ put_page(page);
+ else
putback_lru_page(page);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 22cddd3..5cccc12 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2063,10 +2063,10 @@ static struct notifier_block ratelimit_nb = {
*/
void __init page_writeback_init(void)
{
+ BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
+
writeback_set_ratelimit();
register_cpu_notifier(&ratelimit_nb);
-
- BUG_ON(wb_domain_init(&global_wb_domain, GFP_KERNEL));
}
/**
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 506eac8..df959b7 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -18,7 +18,6 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/interrupt.h>
-#include <linux/rwsem.h>
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
@@ -246,9 +245,7 @@ static inline void reset_deferred_meminit(pg_data_t *pgdat)
/* Returns true if the struct page for the pfn is uninitialised */
static inline bool __meminit early_page_uninitialised(unsigned long pfn)
{
- int nid = early_pfn_to_nid(pfn);
-
- if (pfn >= NODE_DATA(nid)->first_deferred_pfn)
+ if (pfn >= NODE_DATA(early_pfn_to_nid(pfn))->first_deferred_pfn)
return true;
return false;
@@ -983,21 +980,21 @@ static void __init __free_pages_boot_core(struct page *page,
#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
-/* Only safe to use early in boot when initialisation is single-threaded */
+
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
int __meminit early_pfn_to_nid(unsigned long pfn)
{
+ static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
- /* The system will behave unpredictably otherwise */
- BUG_ON(system_state != SYSTEM_BOOTING);
-
+ spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid >= 0)
- return nid;
- /* just returns 0 */
- return 0;
+ if (nid < 0)
+ nid = 0;
+ spin_unlock(&early_pfn_lock);
+
+ return nid;
}
#endif
@@ -1062,7 +1059,15 @@ static void __init deferred_free_range(struct page *page,
__free_pages_boot_core(page, pfn, 0);
}
-static __initdata DECLARE_RWSEM(pgdat_init_rwsem);
+/* Completion tracking for deferred_init_memmap() threads */
+static atomic_t pgdat_init_n_undone __initdata;
+static __initdata DECLARE_COMPLETION(pgdat_init_all_done_comp);
+
+static inline void __init pgdat_init_report_one_done(void)
+{
+ if (atomic_dec_and_test(&pgdat_init_n_undone))
+ complete(&pgdat_init_all_done_comp);
+}
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
@@ -1079,7 +1084,7 @@ static int __init deferred_init_memmap(void *data)
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
if (first_init_pfn == ULONG_MAX) {
- up_read(&pgdat_init_rwsem);
+ pgdat_init_report_one_done();
return 0;
}
@@ -1179,7 +1184,8 @@ free_range:
pr_info("node %d initialised, %lu pages in %ums\n", nid, nr_pages,
jiffies_to_msecs(jiffies - start));
- up_read(&pgdat_init_rwsem);
+
+ pgdat_init_report_one_done();
return 0;
}
@@ -1187,14 +1193,17 @@ void __init page_alloc_init_late(void)
{
int nid;
+ /* There will be num_node_state(N_MEMORY) threads */
+ atomic_set(&pgdat_init_n_undone, num_node_state(N_MEMORY));
for_each_node_state(nid, N_MEMORY) {
- down_read(&pgdat_init_rwsem);
kthread_run(deferred_init_memmap, NODE_DATA(nid), "pgdatinit%d", nid);
}
/* Block until all are initialised */
- down_write(&pgdat_init_rwsem);
- up_write(&pgdat_init_rwsem);
+ wait_for_completion(&pgdat_init_all_done_comp);
+
+ /* Reinit limits that are based on free pages after the kernel is up */
+ files_maxfiles_init();
}
#endif /* CONFIG_DEFERRED_STRUCT_PAGE_INIT */
@@ -1287,6 +1296,10 @@ static inline int check_new_page(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(atomic_read(&page->_count) != 0))
bad_reason = "nonzero _count";
+ if (unlikely(page->flags & __PG_HWPOISON)) {
+ bad_reason = "HWPoisoned (hardware-corrupted)";
+ bad_flags = __PG_HWPOISON;
+ }
if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
@@ -1950,6 +1963,7 @@ void free_hot_cold_page_list(struct list_head *list, bool cold)
void split_page(struct page *page, unsigned int order)
{
int i;
+ gfp_t gfp_mask;
VM_BUG_ON_PAGE(PageCompound(page), page);
VM_BUG_ON_PAGE(!page_count(page), page);
@@ -1963,10 +1977,11 @@ void split_page(struct page *page, unsigned int order)
split_page(virt_to_page(page[0].shadow), order);
#endif
- set_page_owner(page, 0, 0);
+ gfp_mask = get_page_owner_gfp(page);
+ set_page_owner(page, 0, gfp_mask);
for (i = 1; i < (1 << order); i++) {
set_page_refcounted(page + i);
- set_page_owner(page + i, 0, 0);
+ set_page_owner(page + i, 0, gfp_mask);
}
}
EXPORT_SYMBOL_GPL(split_page);
@@ -1996,6 +2011,8 @@ int __isolate_free_page(struct page *page, unsigned int order)
zone->free_area[order].nr_free--;
rmv_page_order(page);
+ set_page_owner(page, order, __GFP_MOVABLE);
+
/* Set the pageblock if the isolated page is at least a pageblock */
if (order >= pageblock_order - 1) {
struct page *endpage = page + (1 << order) - 1;
@@ -2007,7 +2024,7 @@ int __isolate_free_page(struct page *page, unsigned int order)
}
}
- set_page_owner(page, order, 0);
+
return 1UL << order;
}
@@ -5043,6 +5060,10 @@ static unsigned long __meminit zone_spanned_pages_in_node(int nid,
{
unsigned long zone_start_pfn, zone_end_pfn;
+ /* When hotadd a new node, the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
/* Get the start and end of the zone */
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
@@ -5106,6 +5127,10 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
unsigned long zone_start_pfn, zone_end_pfn;
+ /* When hotadd a new node, the node should be empty */
+ if (!node_start_pfn && !node_end_pfn)
+ return 0;
+
zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
diff --git a/mm/page_owner.c b/mm/page_owner.c
index bd5f842..983c3a1 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -76,6 +76,13 @@ void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
+gfp_t __get_page_owner_gfp(struct page *page)
+{
+ struct page_ext *page_ext = lookup_page_ext(page);
+
+ return page_ext->gfp_mask;
+}
+
static ssize_t
print_page_owner(char __user *buf, size_t count, unsigned long pfn,
struct page *page, struct page_ext *page_ext)
diff --git a/mm/shmem.c b/mm/shmem.c
index 4caf8ed..dbe0c1e 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -3363,8 +3363,8 @@ put_path:
* shmem_kernel_file_setup - get an unlinked file living in tmpfs which must be
* kernel internal. There will be NO LSM permission checks against the
* underlying inode. So users of this interface must do LSM checks at a
- * higher layer. The one user is the big_key implementation. LSM checks
- * are provided at the key level rather than the inode level.
+ * higher layer. The users are the big_key and shm implementations. LSM
+ * checks are provided at the key or shm level rather than the inode.
* @name: name for dentry (to be seen in /proc/<pid>/maps
* @size: size to be set for the file
* @flags: VM_NORESERVE suppresses pre-accounting of the entire object size
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 3e5f8f2..8683110 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -37,8 +37,7 @@ struct kmem_cache *kmem_cache;
SLAB_TRACE | SLAB_DESTROY_BY_RCU | SLAB_NOLEAKTRACE | \
SLAB_FAILSLAB)
-#define SLAB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
- SLAB_CACHE_DMA | SLAB_NOTRACK)
+#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | SLAB_NOTRACK)
/*
* Merge control. If this is set then no merging of slab caches will occur.
diff --git a/mm/vmscan.c b/mm/vmscan.c
index e61445d..8286938 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -973,22 +973,18 @@ static unsigned long shrink_page_list(struct list_head *page_list,
* caller can stall after page list has been processed.
*
* 2) Global or new memcg reclaim encounters a page that is
- * not marked for immediate reclaim or the caller does not
- * have __GFP_IO. In this case mark the page for immediate
+ * not marked for immediate reclaim, or the caller does not
+ * have __GFP_FS (or __GFP_IO if it's simply going to swap,
+ * not to fs). In this case mark the page for immediate
* reclaim and continue scanning.
*
- * __GFP_IO is checked because a loop driver thread might
+ * Require may_enter_fs because we would wait on fs, which
+ * may not have submitted IO yet. And the loop driver might
* enter reclaim, and deadlock if it waits on a page for
* which it is needed to do the write (loop masks off
* __GFP_IO|__GFP_FS for this reason); but more thought
* would probably show more reasons.
*
- * Don't require __GFP_FS, since we're not going into the
- * FS, just waiting on its writeback completion. Worryingly,
- * ext4 gfs2 and xfs allocate pages with
- * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
- * may_enter_fs here is liable to OOM on them.
- *
* 3) Legacy memcg encounters a page that is not already marked
* PageReclaim. memcg does not have any dirty pages
* throttling so we could easily OOM just because too many
@@ -1005,7 +1001,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
/* Case 2 above */
} else if (sane_reclaim(sc) ||
- !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+ !PageReclaim(page) || !may_enter_fs) {
/*
* This is slightly racy - end_page_writeback()
* might have just cleared PageReclaim, then