summary | refs | log | tree | commit | diff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r-- mm/Kconfig 32
-rw-r--r-- mm/Makefile 7
-rw-r--r-- mm/backing-dev.c 117
-rw-r--r-- mm/cleancache.c 2
-rw-r--r-- mm/cma.c 2
-rw-r--r-- mm/compaction.c 181
-rw-r--r-- mm/debug.c 4
-rw-r--r-- mm/fadvise.c 10
-rw-r--r-- mm/filemap.c 30
-rw-r--r-- mm/filemap_xip.c 478
-rw-r--r-- mm/fremap.c 283
-rw-r--r-- mm/gup.c 242
-rw-r--r-- mm/huge_memory.c 156
-rw-r--r-- mm/hugetlb.c 160
-rw-r--r-- mm/hugetlb_cgroup.c 2
-rw-r--r-- mm/internal.h 28
-rw-r--r-- mm/interval_tree.c 34
-rw-r--r-- mm/iov_iter.c 17
-rw-r--r-- mm/kasan/Makefile 8
-rw-r--r-- mm/kasan/kasan.c 516
-rw-r--r-- mm/kasan/kasan.h 75
-rw-r--r-- mm/kasan/report.c 269
-rw-r--r-- mm/kmemleak.c 6
-rw-r--r-- mm/ksm.c 2
-rw-r--r-- mm/list_lru.c 467
-rw-r--r-- mm/madvise.c 32
-rw-r--r-- mm/memcontrol.c 1073
-rw-r--r-- mm/memory-failure.c 13
-rw-r--r-- mm/memory.c 355
-rw-r--r-- mm/mempolicy.c 286
-rw-r--r-- mm/migrate.c 45
-rw-r--r-- mm/mincore.c 175
-rw-r--r-- mm/mm_init.c 4
-rw-r--r-- mm/mmap.c 100
-rw-r--r-- mm/mmzone.c 4
-rw-r--r-- mm/mprotect.c 50
-rw-r--r-- mm/mremap.c 2
-rw-r--r-- mm/msync.c 5
-rw-r--r-- mm/nommu.c 114
-rw-r--r-- mm/oom_kill.c 169
-rw-r--r-- mm/page-writeback.c 46
-rw-r--r-- mm/page_alloc.c 462
-rw-r--r-- mm/page_counter.c 7
-rw-r--r-- mm/page_io.c 9
-rw-r--r-- mm/page_owner.c 26
-rw-r--r-- mm/pagewalk.c 238
-rw-r--r-- mm/percpu.c 6
-rw-r--r-- mm/pgtable-generic.c 2
-rw-r--r-- mm/process_vm_access.c 7
-rw-r--r-- mm/readahead.c 4
-rw-r--r-- mm/rmap.c 237
-rw-r--r-- mm/shmem.c 31
-rw-r--r-- mm/slab.c 17
-rw-r--r-- mm/slab.h 67
-rw-r--r-- mm/slab_common.c 323
-rw-r--r-- mm/slob.c 2
-rw-r--r-- mm/slub.c 232
-rw-r--r-- mm/swap.c 6
-rw-r--r-- mm/swap_state.c 6
-rw-r--r-- mm/truncate.c 2
-rw-r--r-- mm/util.c 48
-rw-r--r-- mm/vmalloc.c 16
-rw-r--r-- mm/vmscan.c 121
-rw-r--r-- mm/vmstat.c 130
-rw-r--r-- mm/workingset.c 9
-rw-r--r-- mm/zbud.c 3
-rw-r--r-- mm/zpool.c 6
-rw-r--r-- mm/zsmalloc.c 239
-rw-r--r-- mm/zswap.c 5
69 files changed, 4535 insertions, 3327 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 4395b12..a03131b 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -129,28 +129,28 @@ config SPARSEMEM_VMEMMAP
efficient option when sufficient kernel resources are available.
config HAVE_MEMBLOCK
- boolean
+ bool
config HAVE_MEMBLOCK_NODE_MAP
- boolean
+ bool
config HAVE_MEMBLOCK_PHYS_MAP
- boolean
+ bool
config HAVE_GENERIC_RCU_GUP
- boolean
+ bool
config ARCH_DISCARD_MEMBLOCK
- boolean
+ bool
config NO_BOOTMEM
- boolean
+ bool
config MEMORY_ISOLATION
- boolean
+ bool
config MOVABLE_NODE
- boolean "Enable to assign a node which has only movable memory"
+ bool "Enable to assign a node which has only movable memory"
depends on HAVE_MEMBLOCK
depends on NO_BOOTMEM
depends on X86_64
@@ -228,12 +228,12 @@ config SPLIT_PTLOCK_CPUS
default "4"
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
- boolean
+ bool
#
# support for memory balloon
config MEMORY_BALLOON
- boolean
+ bool
#
# support for memory balloon compaction
@@ -276,7 +276,7 @@ config MIGRATION
allocation instead of reclaiming.
config ARCH_ENABLE_HUGEPAGE_MIGRATION
- boolean
+ bool
config PHYS_ADDR_T_64BIT
def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT
@@ -602,6 +602,16 @@ config PGTABLE_MAPPING
You can check speed with zsmalloc benchmark:
https://github.com/spartacus06/zsmapbench
+config ZSMALLOC_STAT
+ bool "Export zsmalloc statistics"
+ depends on ZSMALLOC
+ select DEBUG_FS
+ help
+ This option enables code in the zsmalloc to collect various
+ statistics about whats happening in zsmalloc and exports that
+ information to userspace via debugfs.
+ If unsure, say N.
+
config GENERIC_EARLY_IOREMAP
bool
diff --git a/mm/Makefile b/mm/Makefile
index 4bf586e..3c1caa2 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -2,8 +2,11 @@
# Makefile for the linux memory manager.
#
+KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slub.o := n
+
mmu-y := nommu.o
-mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o memory.o mincore.o \
+mmu-$(CONFIG_MMU) := gup.o highmem.o memory.o mincore.o \
mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
vmalloc.o pagewalk.o pgtable-generic.o
@@ -49,9 +52,9 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
obj-$(CONFIG_SLAB) += slab.o
obj-$(CONFIG_SLUB) += slub.o
obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
+obj-$(CONFIG_KASAN) += kasan/
obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
-obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 0ae0df5..6dc4580 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -14,19 +14,10 @@
static atomic_long_t bdi_seq = ATOMIC_LONG_INIT(0);
-struct backing_dev_info default_backing_dev_info = {
- .name = "default",
- .ra_pages = VM_MAX_READAHEAD * 1024 / PAGE_CACHE_SIZE,
- .state = 0,
- .capabilities = BDI_CAP_MAP_COPY,
-};
-EXPORT_SYMBOL_GPL(default_backing_dev_info);
-
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
-EXPORT_SYMBOL_GPL(noop_backing_dev_info);
static struct class *bdi_class;
@@ -40,17 +31,6 @@ LIST_HEAD(bdi_list);
/* bdi_wq serves all asynchronous writeback tasks */
struct workqueue_struct *bdi_wq;
-static void bdi_lock_two(struct bdi_writeback *wb1, struct bdi_writeback *wb2)
-{
- if (wb1 < wb2) {
- spin_lock(&wb1->list_lock);
- spin_lock_nested(&wb2->list_lock, 1);
- } else {
- spin_lock(&wb2->list_lock);
- spin_lock_nested(&wb1->list_lock, 1);
- }
-}
-
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>
#include <linux/seq_file.h>
@@ -69,10 +49,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
unsigned long background_thresh;
unsigned long dirty_thresh;
unsigned long bdi_thresh;
- unsigned long nr_dirty, nr_io, nr_more_io;
+ unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
struct inode *inode;
- nr_dirty = nr_io = nr_more_io = 0;
+ nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
spin_lock(&wb->list_lock);
list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
nr_dirty++;
@@ -80,6 +60,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_io++;
list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
nr_more_io++;
+ list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+ if (inode->i_state & I_DIRTY_TIME)
+ nr_dirty_time++;
spin_unlock(&wb->list_lock);
global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -98,6 +81,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
"b_dirty: %10lu\n"
"b_io: %10lu\n"
"b_more_io: %10lu\n"
+ "b_dirty_time: %10lu\n"
"bdi_list: %10u\n"
"state: %10lx\n",
(unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -111,6 +95,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_dirty,
nr_io,
nr_more_io,
+ nr_dirty_time,
!list_empty(&bdi->bdi_list), bdi->state);
#undef K
@@ -264,9 +249,6 @@ static int __init default_bdi_init(void)
if (!bdi_wq)
return -ENOMEM;
- err = bdi_init(&default_backing_dev_info);
- if (!err)
- bdi_register(&default_backing_dev_info, NULL, "default");
err = bdi_init(&noop_backing_dev_info);
return err;
@@ -355,19 +337,19 @@ EXPORT_SYMBOL(bdi_register_dev);
*/
static void bdi_wb_shutdown(struct backing_dev_info *bdi)
{
- if (!bdi_cap_writeback_dirty(bdi))
+ /* Make sure nobody queues further work */
+ spin_lock_bh(&bdi->wb_lock);
+ if (!test_and_clear_bit(BDI_registered, &bdi->state)) {
+ spin_unlock_bh(&bdi->wb_lock);
return;
+ }
+ spin_unlock_bh(&bdi->wb_lock);
/*
* Make sure nobody finds us on the bdi_list anymore
*/
bdi_remove_from_list(bdi);
- /* Make sure nobody queues further work */
- spin_lock_bh(&bdi->wb_lock);
- clear_bit(BDI_registered, &bdi->state);
- spin_unlock_bh(&bdi->wb_lock);
-
/*
* Drain work list and shutdown the delayed_work. At this point,
* @bdi->bdi_list is empty telling bdi_Writeback_workfn() that @bdi
@@ -375,37 +357,22 @@ static void bdi_wb_shutdown(struct backing_dev_info *bdi)
*/
mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
flush_delayed_work(&bdi->wb.dwork);
- WARN_ON(!list_empty(&bdi->work_list));
- WARN_ON(delayed_work_pending(&bdi->wb.dwork));
}
/*
- * This bdi is going away now, make sure that no super_blocks point to it
+ * Called when the device behind @bdi has been removed or ejected.
+ *
+ * We can't really do much here except for reducing the dirty ratio at
+ * the moment. In the future we should be able to set a flag so that
+ * the filesystem can handle errors at mark_inode_dirty time instead
+ * of only at writeback time.
*/
-static void bdi_prune_sb(struct backing_dev_info *bdi)
-{
- struct super_block *sb;
-
- spin_lock(&sb_lock);
- list_for_each_entry(sb, &super_blocks, s_list) {
- if (sb->s_bdi == bdi)
- sb->s_bdi = &default_backing_dev_info;
- }
- spin_unlock(&sb_lock);
-}
-
void bdi_unregister(struct backing_dev_info *bdi)
{
- if (bdi->dev) {
- bdi_set_min_ratio(bdi, 0);
- trace_writeback_bdi_unregister(bdi);
- bdi_prune_sb(bdi);
+ if (WARN_ON_ONCE(!bdi->dev))
+ return;
- bdi_wb_shutdown(bdi);
- bdi_debug_unregister(bdi);
- device_unregister(bdi->dev);
- bdi->dev = NULL;
- }
+ bdi_set_min_ratio(bdi, 0);
}
EXPORT_SYMBOL(bdi_unregister);
@@ -418,6 +385,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
INIT_LIST_HEAD(&wb->b_dirty);
INIT_LIST_HEAD(&wb->b_io);
INIT_LIST_HEAD(&wb->b_more_io);
+ INIT_LIST_HEAD(&wb->b_dirty_time);
spin_lock_init(&wb->list_lock);
INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
}
@@ -474,37 +442,19 @@ void bdi_destroy(struct backing_dev_info *bdi)
{
int i;
- /*
- * Splice our entries to the default_backing_dev_info. This
- * condition shouldn't happen. @wb must be empty at this point and
- * dirty inodes on it might cause other issues. This workaround is
- * added by ce5f8e779519 ("writeback: splice dirty inode entries to
- * default bdi on bdi_destroy()") without root-causing the issue.
- *
- * http://lkml.kernel.org/g/1253038617-30204-11-git-send-email-jens.axboe@oracle.com
- * http://thread.gmane.org/gmane.linux.file-systems/35341/focus=35350
- *
- * We should probably add WARN_ON() to find out whether it still
- * happens and track it down if so.
- */
- if (bdi_has_dirty_io(bdi)) {
- struct bdi_writeback *dst = &default_backing_dev_info.wb;
-
- bdi_lock_two(&bdi->wb, dst);
- list_splice(&bdi->wb.b_dirty, &dst->b_dirty);
- list_splice(&bdi->wb.b_io, &dst->b_io);
- list_splice(&bdi->wb.b_more_io, &dst->b_more_io);
- spin_unlock(&bdi->wb.list_lock);
- spin_unlock(&dst->list_lock);
- }
-
- bdi_unregister(bdi);
+ bdi_wb_shutdown(bdi);
+ WARN_ON(!list_empty(&bdi->work_list));
WARN_ON(delayed_work_pending(&bdi->wb.dwork));
+ if (bdi->dev) {
+ bdi_debug_unregister(bdi);
+ device_unregister(bdi->dev);
+ bdi->dev = NULL;
+ }
+
for (i = 0; i < NR_BDI_STAT_ITEMS; i++)
percpu_counter_destroy(&bdi->bdi_stat[i]);
-
fprop_local_destroy_percpu(&bdi->completions);
}
EXPORT_SYMBOL(bdi_destroy);
@@ -513,13 +463,12 @@ EXPORT_SYMBOL(bdi_destroy);
* For use from filesystems to quickly init and register a bdi associated
* with dirty writeback
*/
-int bdi_setup_and_register(struct backing_dev_info *bdi, char *name,
- unsigned int cap)
+int bdi_setup_and_register(struct backing_dev_info *bdi, char *name)
{
int err;
bdi->name = name;
- bdi->capabilities = cap;
+ bdi->capabilities = 0;
err = bdi_init(bdi);
if (err)
return err;
diff --git a/mm/cleancache.c b/mm/cleancache.c
index d0eac43..053bcd8 100644
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -25,7 +25,7 @@
static struct cleancache_ops *cleancache_ops __read_mostly;
/*
- * Counters available via /sys/kernel/debug/frontswap (if debugfs is
+ * Counters available via /sys/kernel/debug/cleancache (if debugfs is
* properly configured. These are for information only so are not protected
* against increment races.
*/
diff --git a/mm/cma.c b/mm/cma.c
index a85ae28..75016fd 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -199,6 +199,7 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
cma->order_per_bit = order_per_bit;
*res_cma = cma;
cma_area_count++;
+ totalcma_pages += (size / PAGE_SIZE);
return 0;
}
@@ -337,7 +338,6 @@ int __init cma_declare_contiguous(phys_addr_t base,
if (ret)
goto err;
- totalcma_pages += (size / PAGE_SIZE);
pr_info("Reserved %ld MiB at %pa\n", (unsigned long)size / SZ_1M,
&base);
return 0;
diff --git a/mm/compaction.c b/mm/compaction.c
index 546e571..8c0d945 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,7 @@
#include <linux/sysfs.h>
#include <linux/balloon_compaction.h>
#include <linux/page-isolation.h>
+#include <linux/kasan.h>
#include "internal.h"
#ifdef CONFIG_COMPACTION
@@ -34,6 +35,17 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#endif
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+#ifdef CONFIG_TRACEPOINTS
+static const char *const compaction_status_string[] = {
+ "deferred",
+ "skipped",
+ "continue",
+ "partial",
+ "complete",
+ "no_suitable_page",
+ "not_suitable_zone",
+};
+#endif
#define CREATE_TRACE_POINTS
#include <trace/events/compaction.h>
@@ -61,6 +73,7 @@ static void map_pages(struct list_head *list)
list_for_each_entry(page, list, lru) {
arch_alloc_page(page, 0);
kernel_map_pages(page, 1, 1);
+ kasan_alloc_pages(page, 0);
}
}
@@ -113,6 +126,77 @@ static struct page *pageblock_pfn_to_page(unsigned long start_pfn,
}
#ifdef CONFIG_COMPACTION
+
+/* Do not skip compaction more than 64 times */
+#define COMPACT_MAX_DEFER_SHIFT 6
+
+/*
+ * Compaction is deferred when compaction fails to result in a page
+ * allocation success. 1 << compact_defer_limit compactions are skipped up
+ * to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
+ */
+void defer_compaction(struct zone *zone, int order)
+{
+ zone->compact_considered = 0;
+ zone->compact_defer_shift++;
+
+ if (order < zone->compact_order_failed)
+ zone->compact_order_failed = order;
+
+ if (zone->compact_defer_shift > COMPACT_MAX_DEFER_SHIFT)
+ zone->compact_defer_shift = COMPACT_MAX_DEFER_SHIFT;
+
+ trace_mm_compaction_defer_compaction(zone, order);
+}
+
+/* Returns true if compaction should be skipped this time */
+bool compaction_deferred(struct zone *zone, int order)
+{
+ unsigned long defer_limit = 1UL << zone->compact_defer_shift;
+
+ if (order < zone->compact_order_failed)
+ return false;
+
+ /* Avoid possible overflow */
+ if (++zone->compact_considered > defer_limit)
+ zone->compact_considered = defer_limit;
+
+ if (zone->compact_considered >= defer_limit)
+ return false;
+
+ trace_mm_compaction_deferred(zone, order);
+
+ return true;
+}
+
+/*
+ * Update defer tracking counters after successful compaction of given order,
+ * which means an allocation either succeeded (alloc_success == true) or is
+ * expected to succeed.
+ */
+void compaction_defer_reset(struct zone *zone, int order,
+ bool alloc_success)
+{
+ if (alloc_success) {
+ zone->compact_considered = 0;
+ zone->compact_defer_shift = 0;
+ }
+ if (order >= zone->compact_order_failed)
+ zone->compact_order_failed = order + 1;
+
+ trace_mm_compaction_defer_reset(zone, order);
+}
+
+/* Returns true if restarting compaction after many failures */
+bool compaction_restarting(struct zone *zone, int order)
+{
+ if (order < zone->compact_order_failed)
+ return false;
+
+ return zone->compact_defer_shift == COMPACT_MAX_DEFER_SHIFT &&
+ zone->compact_considered >= 1UL << zone->compact_defer_shift;
+}
+
/* Returns true if the pageblock should be scanned for pages to isolate. */
static inline bool isolation_suitable(struct compact_control *cc,
struct page *page)
@@ -408,6 +492,13 @@ static unsigned long isolate_freepages_block(struct compact_control *cc,
/* If a page was split, advance to the end of it */
if (isolated) {
+ cc->nr_freepages += isolated;
+ if (!strict &&
+ cc->nr_migratepages <= cc->nr_freepages) {
+ blockpfn += isolated;
+ break;
+ }
+
blockpfn += isolated - 1;
cursor += isolated - 1;
continue;
@@ -421,11 +512,12 @@ isolate_fail:
}
+ trace_mm_compaction_isolate_freepages(*start_pfn, blockpfn,
+ nr_scanned, total_isolated);
+
/* Record how far we have got within the block */
*start_pfn = blockpfn;
- trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated);
-
/*
* If strict isolation is requested by CMA then check that all the
* pages requested were isolated. If there were any failures, 0 is
@@ -581,6 +673,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long flags = 0;
bool locked = false;
struct page *page = NULL, *valid_page = NULL;
+ unsigned long start_pfn = low_pfn;
/*
* Ensure that there are not too many pages isolated from the LRU
@@ -741,7 +834,8 @@ isolate_success:
if (low_pfn == end_pfn)
update_pageblock_skip(cc, valid_page, nr_isolated, true);
- trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated);
+ trace_mm_compaction_isolate_migratepages(start_pfn, low_pfn,
+ nr_scanned, nr_isolated);
count_compact_events(COMPACTMIGRATE_SCANNED, nr_scanned);
if (nr_isolated)
@@ -814,7 +908,6 @@ static void isolate_freepages(struct compact_control *cc)
unsigned long isolate_start_pfn; /* exact pfn we start at */
unsigned long block_end_pfn; /* end of current pageblock */
unsigned long low_pfn; /* lowest pfn scanner is able to scan */
- int nr_freepages = cc->nr_freepages;
struct list_head *freelist = &cc->freepages;
/*
@@ -839,11 +932,11 @@ static void isolate_freepages(struct compact_control *cc)
* pages on cc->migratepages. We stop searching if the migrate
* and free page scanners meet or enough free pages are isolated.
*/
- for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages;
+ for (; block_start_pfn >= low_pfn &&
+ cc->nr_migratepages > cc->nr_freepages;
block_end_pfn = block_start_pfn,
block_start_pfn -= pageblock_nr_pages,
isolate_start_pfn = block_start_pfn) {
- unsigned long isolated;
/*
* This can iterate a massively long zone without finding any
@@ -868,9 +961,8 @@ static void isolate_freepages(struct compact_control *cc)
continue;
/* Found a block suitable for isolating free pages from. */
- isolated = isolate_freepages_block(cc, &isolate_start_pfn,
+ isolate_freepages_block(cc, &isolate_start_pfn,
block_end_pfn, freelist, false);
- nr_freepages += isolated;
/*
* Remember where the free scanner should restart next time,
@@ -902,8 +994,6 @@ static void isolate_freepages(struct compact_control *cc)
*/
if (block_start_pfn < low_pfn)
cc->free_pfn = cc->migrate_pfn;
-
- cc->nr_freepages = nr_freepages;
}
/*
@@ -1015,8 +1105,10 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
low_pfn = isolate_migratepages_block(cc, low_pfn, end_pfn,
isolate_mode);
- if (!low_pfn || cc->contended)
+ if (!low_pfn || cc->contended) {
+ acct_isolated(zone, cc);
return ISOLATE_ABORT;
+ }
/*
* Either we isolated something and proceed with migration. Or
@@ -1037,7 +1129,7 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone,
return cc->nr_migratepages ? ISOLATE_SUCCESS : ISOLATE_NONE;
}
-static int compact_finished(struct zone *zone, struct compact_control *cc,
+static int __compact_finished(struct zone *zone, struct compact_control *cc,
const int migratetype)
{
unsigned int order;
@@ -1088,11 +1180,24 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
return COMPACT_PARTIAL;
/* Job done if allocation would set block type */
- if (cc->order >= pageblock_order && area->nr_free)
+ if (order >= pageblock_order && area->nr_free)
return COMPACT_PARTIAL;
}
- return COMPACT_CONTINUE;
+ return COMPACT_NO_SUITABLE_PAGE;
+}
+
+static int compact_finished(struct zone *zone, struct compact_control *cc,
+ const int migratetype)
+{
+ int ret;
+
+ ret = __compact_finished(zone, cc, migratetype);
+ trace_mm_compaction_finished(zone, cc->order, ret);
+ if (ret == COMPACT_NO_SUITABLE_PAGE)
+ ret = COMPACT_CONTINUE;
+
+ return ret;
}
/*
@@ -1102,7 +1207,7 @@ static int compact_finished(struct zone *zone, struct compact_control *cc,
* COMPACT_PARTIAL - If the allocation would succeed without compaction
* COMPACT_CONTINUE - If compaction should run now
*/
-unsigned long compaction_suitable(struct zone *zone, int order,
+static unsigned long __compaction_suitable(struct zone *zone, int order,
int alloc_flags, int classzone_idx)
{
int fragindex;
@@ -1146,11 +1251,24 @@ unsigned long compaction_suitable(struct zone *zone, int order,
*/
fragindex = fragmentation_index(zone, order);
if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold)
- return COMPACT_SKIPPED;
+ return COMPACT_NOT_SUITABLE_ZONE;
return COMPACT_CONTINUE;
}
+unsigned long compaction_suitable(struct zone *zone, int order,
+ int alloc_flags, int classzone_idx)
+{
+ unsigned long ret;
+
+ ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx);
+ trace_mm_compaction_suitable(zone, order, ret);
+ if (ret == COMPACT_NOT_SUITABLE_ZONE)
+ ret = COMPACT_SKIPPED;
+
+ return ret;
+}
+
static int compact_zone(struct zone *zone, struct compact_control *cc)
{
int ret;
@@ -1197,7 +1315,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc)
zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn;
}
- trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn);
+ trace_mm_compaction_begin(start_pfn, cc->migrate_pfn,
+ cc->free_pfn, end_pfn, sync);
migrate_prep_local();
@@ -1299,7 +1418,8 @@ out:
zone->compact_cached_free_pfn = free_pfn;
}
- trace_mm_compaction_end(ret);
+ trace_mm_compaction_end(start_pfn, cc->migrate_pfn,
+ cc->free_pfn, end_pfn, sync, ret);
return ret;
}
@@ -1335,22 +1455,20 @@ int sysctl_extfrag_threshold = 500;
/**
* try_to_compact_pages - Direct compact to satisfy a high-order allocation
- * @zonelist: The zonelist used for the current allocation
- * @order: The order of the current allocation
* @gfp_mask: The GFP mask of the current allocation
- * @nodemask: The allowed nodes to allocate from
+ * @order: The order of the current allocation
+ * @alloc_flags: The allocation flags of the current allocation
+ * @ac: The context of current allocation
* @mode: The migration mode for async, sync light, or sync migration
* @contended: Return value that determines if compaction was aborted due to
* need_resched() or lock contention
*
* This is the main entry point for direct page compaction.
*/
-unsigned long try_to_compact_pages(struct zonelist *zonelist,
- int order, gfp_t gfp_mask, nodemask_t *nodemask,
- enum migrate_mode mode, int *contended,
- int alloc_flags, int classzone_idx)
+unsigned long try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
+ int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended)
{
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
int may_enter_fs = gfp_mask & __GFP_FS;
int may_perform_io = gfp_mask & __GFP_IO;
struct zoneref *z;
@@ -1364,9 +1482,11 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
if (!order || !may_enter_fs || !may_perform_io)
return COMPACT_SKIPPED;
+ trace_mm_compaction_try_to_compact_pages(order, gfp_mask, mode);
+
/* Compact each zone in the list */
- for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx,
- nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
+ ac->nodemask) {
int status;
int zone_contended;
@@ -1374,7 +1494,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
continue;
status = compact_zone_order(zone, order, gfp_mask, mode,
- &zone_contended, alloc_flags, classzone_idx);
+ &zone_contended, alloc_flags,
+ ac->classzone_idx);
rc = max(status, rc);
/*
* It takes at least one zone that wasn't lock contended
@@ -1384,7 +1505,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist,
/* If a normal allocation would succeed, stop compacting */
if (zone_watermark_ok(zone, order, low_wmark_pages(zone),
- classzone_idx, alloc_flags)) {
+ ac->classzone_idx, alloc_flags)) {
/*
* We think the allocation will succeed in this zone,
* but it is not certain, hence the false. The caller
diff --git a/mm/debug.c b/mm/debug.c
index 0e58f32..3eb3ac2 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -130,7 +130,6 @@ static const struct trace_print_flags vmaflags_names[] = {
{VM_ACCOUNT, "account" },
{VM_NORESERVE, "noreserve" },
{VM_HUGETLB, "hugetlb" },
- {VM_NONLINEAR, "nonlinear" },
#if defined(CONFIG_X86)
{VM_PAT, "pat" },
#elif defined(CONFIG_PPC)
@@ -174,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
"get_unmapped_area %p\n"
#endif
"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
- "pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
+ "pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -207,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
mm->pgd, atomic_read(&mm->mm_users),
atomic_read(&mm->mm_count),
atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+ mm_nr_pmds((struct mm_struct *)mm),
mm->map_count,
mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 2ad7adf..4a3907c 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -28,6 +28,7 @@
SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
{
struct fd f = fdget(fd);
+ struct inode *inode;
struct address_space *mapping;
struct backing_dev_info *bdi;
loff_t endbyte; /* inclusive */
@@ -39,7 +40,8 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
if (!f.file)
return -EBADF;
- if (S_ISFIFO(file_inode(f.file)->i_mode)) {
+ inode = file_inode(f.file);
+ if (S_ISFIFO(inode->i_mode)) {
ret = -ESPIPE;
goto out;
}
@@ -50,7 +52,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
goto out;
}
- if (mapping->a_ops->get_xip_mem) {
+ if (IS_DAX(inode)) {
switch (advice) {
case POSIX_FADV_NORMAL:
case POSIX_FADV_RANDOM:
@@ -73,7 +75,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
else
endbyte--; /* inclusive */
- bdi = mapping->backing_dev_info;
+ bdi = inode_to_bdi(mapping->host);
switch (advice) {
case POSIX_FADV_NORMAL:
@@ -113,7 +115,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
case POSIX_FADV_NOREUSE:
break;
case POSIX_FADV_DONTNEED:
- if (!bdi_write_congested(mapping->backing_dev_info))
+ if (!bdi_write_congested(bdi))
__filemap_fdatawrite_range(mapping, offset, endbyte,
WB_SYNC_NONE);
diff --git a/mm/filemap.c b/mm/filemap.c
index 673e458..ad72420 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -211,7 +211,7 @@ void __delete_from_page_cache(struct page *page, void *shadow)
*/
if (PageDirty(page) && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_RECLAIMABLE);
}
}
@@ -1695,8 +1695,7 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
loff_t *ppos = &iocb->ki_pos;
loff_t pos = *ppos;
- /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (file->f_flags & O_DIRECT) {
+ if (io_is_direct(file)) {
struct address_space *mapping = file->f_mapping;
struct inode *inode = mapping->host;
size_t count = iov_iter_count(iter);
@@ -1723,9 +1722,11 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
* we've already read everything we wanted to, or if
* there was a short read because we hit EOF, go ahead
* and return. Otherwise fallthrough to buffered io for
- * the rest of the read.
+ * the rest of the read. Buffered reads will not work for
+ * DAX files, so don't bother trying.
*/
- if (retval < 0 || !iov_iter_count(iter) || *ppos >= size) {
+ if (retval < 0 || !iov_iter_count(iter) || *ppos >= size ||
+ IS_DAX(inode)) {
file_accessed(file);
goto out;
}
@@ -2087,7 +2088,6 @@ const struct vm_operations_struct generic_file_vm_ops = {
.fault = filemap_fault,
.map_pages = filemap_map_pages,
.page_mkwrite = filemap_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
};
/* This is used for a general mmap of a disk file */
@@ -2565,7 +2565,7 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
size_t count = iov_iter_count(from);
/* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
+ current->backing_dev_info = inode_to_bdi(inode);
err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
if (err)
goto out;
@@ -2583,18 +2583,20 @@ ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;
- /* coalesce the iovecs and go direct-to-BIO for O_DIRECT */
- if (unlikely(file->f_flags & O_DIRECT)) {
+ if (io_is_direct(file)) {
loff_t endbyte;
written = generic_file_direct_write(iocb, from, pos);
- if (written < 0 || written == count)
- goto out;
-
/*
- * direct-io write to a hole: fall through to buffered I/O
- * for completing the rest of the request.
+ * If the write stopped short of completing, fall back to
+ * buffered writes. Some filesystems do this for writes to
+ * holes, for example. For DAX files, a buffered write will
+ * not succeed (even if it did, DAX does not handle dirty
+ * page-cache pages correctly).
*/
+ if (written < 0 || written == count || IS_DAX(inode))
+ goto out;
+
pos += written;
count -= written;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c
deleted file mode 100644
index 0d105ae..0000000
--- a/mm/filemap_xip.c
+++ /dev/null
@@ -1,478 +0,0 @@
-/*
- * linux/mm/filemap_xip.c
- *
- * Copyright (C) 2005 IBM Corporation
- * Author: Carsten Otte <cotte@de.ibm.com>
- *
- * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
- *
- */
-
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include <linux/export.h>
-#include <linux/uio.h>
-#include <linux/rmap.h>
-#include <linux/mmu_notifier.h>
-#include <linux/sched.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-#include <linux/gfp.h>
-#include <asm/tlbflush.h>
-#include <asm/io.h>
-
-/*
- * We do use our own empty page to avoid interference with other users
- * of ZERO_PAGE(), such as /dev/zero
- */
-static DEFINE_MUTEX(xip_sparse_mutex);
-static seqcount_t xip_sparse_seq = SEQCNT_ZERO(xip_sparse_seq);
-static struct page *__xip_sparse_page;
-
-/* called under xip_sparse_mutex */
-static struct page *xip_sparse_page(void)
-{
- if (!__xip_sparse_page) {
- struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
-
- if (page)
- __xip_sparse_page = page;
- }
- return __xip_sparse_page;
-}
-
-/*
- * This is a file read routine for execute in place files, and uses
- * the mapping->a_ops->get_xip_mem() function for the actual low-level
- * stuff.
- *
- * Note the struct file* is not used at all. It may be NULL.
- */
-static ssize_t
-do_xip_mapping_read(struct address_space *mapping,
- struct file_ra_state *_ra,
- struct file *filp,
- char __user *buf,
- size_t len,
- loff_t *ppos)
-{
- struct inode *inode = mapping->host;
- pgoff_t index, end_index;
- unsigned long offset;
- loff_t isize, pos;
- size_t copied = 0, error = 0;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- pos = *ppos;
- index = pos >> PAGE_CACHE_SHIFT;
- offset = pos & ~PAGE_CACHE_MASK;
-
- isize = i_size_read(inode);
- if (!isize)
- goto out;
-
- end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
- do {
- unsigned long nr, left;
- void *xip_mem;
- unsigned long xip_pfn;
- int zero = 0;
-
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_CACHE_SIZE;
- if (index >= end_index) {
- if (index > end_index)
- goto out;
- nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
- if (nr <= offset) {
- goto out;
- }
- }
- nr = nr - offset;
- if (nr > len - copied)
- nr = len - copied;
-
- error = mapping->a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(error)) {
- if (error == -ENODATA) {
- /* sparse */
- zero = 1;
- } else
- goto out;
- }
-
- /* If users can be writing to this page using arbitrary
- * virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
- */
- if (mapping_writably_mapped(mapping))
- /* address based flush */ ;
-
- /*
- * Ok, we have the mem, so now we can copy it to user space...
- *
- * The actor routine returns how many bytes were actually used..
- * NOTE! This may not be the same as how much of a user buffer
- * we filled up (we may be padding etc), so we can only update
- * "pos" here (the actor routine has to update the user buffer
- * pointers and the remaining count).
- */
- if (!zero)
- left = __copy_to_user(buf+copied, xip_mem+offset, nr);
- else
- left = __clear_user(buf + copied, nr);
-
- if (left) {
- error = -EFAULT;
- goto out;
- }
-
- copied += (nr - left);
- offset += (nr - left);
- index += offset >> PAGE_CACHE_SHIFT;
- offset &= ~PAGE_CACHE_MASK;
- } while (copied < len);
-
-out:
- *ppos = pos + copied;
- if (filp)
- file_accessed(filp);
-
- return (copied ? copied : error);
-}
-
-ssize_t
-xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
-{
- if (!access_ok(VERIFY_WRITE, buf, len))
- return -EFAULT;
-
- return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
- buf, len, ppos);
-}
-EXPORT_SYMBOL_GPL(xip_file_read);
-
-/*
- * __xip_unmap is invoked from xip_unmap and xip_write
- *
- * This function walks all vmas of the address_space and unmaps the
- * __xip_sparse_page when found at pgoff.
- */
-static void __xip_unmap(struct address_space * mapping, unsigned long pgoff)
-{
- struct vm_area_struct *vma;
- struct page *page;
- unsigned count;
- int locked = 0;
-
- count = read_seqcount_begin(&xip_sparse_seq);
-
- page = __xip_sparse_page;
- if (!page)
- return;
-
-retry:
- i_mmap_lock_read(mapping);
- vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
- pte_t *pte, pteval;
- spinlock_t *ptl;
- struct mm_struct *mm = vma->vm_mm;
- unsigned long address = vma->vm_start +
- ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
-
- BUG_ON(address < vma->vm_start || address >= vma->vm_end);
- pte = page_check_address(page, mm, address, &ptl, 1);
- if (pte) {
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush(vma, address, pte);
- page_remove_rmap(page);
- dec_mm_counter(mm, MM_FILEPAGES);
- BUG_ON(pte_dirty(pteval));
- pte_unmap_unlock(pte, ptl);
- /* must invalidate_page _before_ freeing the page */
- mmu_notifier_invalidate_page(mm, address);
- page_cache_release(page);
- }
- }
- i_mmap_unlock_read(mapping);
-
- if (locked) {
- mutex_unlock(&xip_sparse_mutex);
- } else if (read_seqcount_retry(&xip_sparse_seq, count)) {
- mutex_lock(&xip_sparse_mutex);
- locked = 1;
- goto retry;
- }
-}
-
-/*
- * xip_fault() is invoked via the vma operations vector for a
- * mapped memory region to read in file data during a page fault.
- *
- * This function is derived from filemap_fault, but used for execute in place
- */
-static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
-{
- struct file *file = vma->vm_file;
- struct address_space *mapping = file->f_mapping;
- struct inode *inode = mapping->host;
- pgoff_t size;
- void *xip_mem;
- unsigned long xip_pfn;
- struct page *page;
- int error;
-
- /* XXX: are VM_FAULT_ codes OK? */
-again:
- size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
- if (vmf->pgoff >= size)
- return VM_FAULT_SIGBUS;
-
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
- &xip_mem, &xip_pfn);
- if (likely(!error))
- goto found;
- if (error != -ENODATA)
- return VM_FAULT_OOM;
-
- /* sparse block */
- if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
- (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
- (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
- int err;
-
- /* maybe shared writable, allocate new block */
- mutex_lock(&xip_sparse_mutex);
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
- &xip_mem, &xip_pfn);
- mutex_unlock(&xip_sparse_mutex);
- if (error)
- return VM_FAULT_SIGBUS;
- /* unmap sparse mappings at pgoff from all other vmas */
- __xip_unmap(mapping, vmf->pgoff);
-
-found:
- err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
- xip_pfn);
- if (err == -ENOMEM)
- return VM_FAULT_OOM;
- /*
- * err == -EBUSY is fine, we've raced against another thread
- * that faulted-in the same page
- */
- if (err != -EBUSY)
- BUG_ON(err);
- return VM_FAULT_NOPAGE;
- } else {
- int err, ret = VM_FAULT_OOM;
-
- mutex_lock(&xip_sparse_mutex);
- write_seqcount_begin(&xip_sparse_seq);
- error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(!error)) {
- write_seqcount_end(&xip_sparse_seq);
- mutex_unlock(&xip_sparse_mutex);
- goto again;
- }
- if (error != -ENODATA)
- goto out;
- /* not shared and writable, use xip_sparse_page() */
- page = xip_sparse_page();
- if (!page)
- goto out;
- err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
- page);
- if (err == -ENOMEM)
- goto out;
-
- ret = VM_FAULT_NOPAGE;
-out:
- write_seqcount_end(&xip_sparse_seq);
- mutex_unlock(&xip_sparse_mutex);
-
- return ret;
- }
-}
-
-static const struct vm_operations_struct xip_file_vm_ops = {
- .fault = xip_file_fault,
- .page_mkwrite = filemap_page_mkwrite,
- .remap_pages = generic_file_remap_pages,
-};
-
-int xip_file_mmap(struct file * file, struct vm_area_struct * vma)
-{
- BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
-
- file_accessed(file);
- vma->vm_ops = &xip_file_vm_ops;
- vma->vm_flags |= VM_MIXEDMAP;
- return 0;
-}
-EXPORT_SYMBOL_GPL(xip_file_mmap);
-
-static ssize_t
-__xip_file_write(struct file *filp, const char __user *buf,
- size_t count, loff_t pos, loff_t *ppos)
-{
- struct address_space * mapping = filp->f_mapping;
- const struct address_space_operations *a_ops = mapping->a_ops;
- struct inode *inode = mapping->host;
- long status = 0;
- size_t bytes;
- ssize_t written = 0;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- do {
- unsigned long index;
- unsigned long offset;
- size_t copied;
- void *xip_mem;
- unsigned long xip_pfn;
-
- offset = (pos & (PAGE_CACHE_SIZE -1)); /* Within page */
- index = pos >> PAGE_CACHE_SHIFT;
- bytes = PAGE_CACHE_SIZE - offset;
- if (bytes > count)
- bytes = count;
-
- status = a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (status == -ENODATA) {
- /* we allocate a new page unmap it */
- mutex_lock(&xip_sparse_mutex);
- status = a_ops->get_xip_mem(mapping, index, 1,
- &xip_mem, &xip_pfn);
- mutex_unlock(&xip_sparse_mutex);
- if (!status)
- /* unmap page at pgoff from all other vmas */
- __xip_unmap(mapping, index);
- }
-
- if (status)
- break;
-
- copied = bytes -
- __copy_from_user_nocache(xip_mem + offset, buf, bytes);
-
- if (likely(copied > 0)) {
- status = copied;
-
- if (status >= 0) {
- written += status;
- count -= status;
- pos += status;
- buf += status;
- }
- }
- if (unlikely(copied != bytes))
- if (status >= 0)
- status = -EFAULT;
- if (status < 0)
- break;
- } while (count);
- *ppos = pos;
- /*
- * No need to use i_size_read() here, the i_size
- * cannot change under us because we hold i_mutex.
- */
- if (pos > inode->i_size) {
- i_size_write(inode, pos);
- mark_inode_dirty(inode);
- }
-
- return written ? written : status;
-}
-
-ssize_t
-xip_file_write(struct file *filp, const char __user *buf, size_t len,
- loff_t *ppos)
-{
- struct address_space *mapping = filp->f_mapping;
- struct inode *inode = mapping->host;
- size_t count;
- loff_t pos;
- ssize_t ret;
-
- mutex_lock(&inode->i_mutex);
-
- if (!access_ok(VERIFY_READ, buf, len)) {
- ret=-EFAULT;
- goto out_up;
- }
-
- pos = *ppos;
- count = len;
-
- /* We can write back this queue in page reclaim */
- current->backing_dev_info = mapping->backing_dev_info;
-
- ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
- if (ret)
- goto out_backing;
- if (count == 0)
- goto out_backing;
-
- ret = file_remove_suid(filp);
- if (ret)
- goto out_backing;
-
- ret = file_update_time(filp);
- if (ret)
- goto out_backing;
-
- ret = __xip_file_write (filp, buf, count, pos, ppos);
-
- out_backing:
- current->backing_dev_info = NULL;
- out_up:
- mutex_unlock(&inode->i_mutex);
- return ret;
-}
-EXPORT_SYMBOL_GPL(xip_file_write);
-
-/*
- * truncate a page used for execute in place
- * functionality is analog to block_truncate_page but does use get_xip_mem
- * to get the page instead of page cache
- */
-int
-xip_truncate_page(struct address_space *mapping, loff_t from)
-{
- pgoff_t index = from >> PAGE_CACHE_SHIFT;
- unsigned offset = from & (PAGE_CACHE_SIZE-1);
- unsigned blocksize;
- unsigned length;
- void *xip_mem;
- unsigned long xip_pfn;
- int err;
-
- BUG_ON(!mapping->a_ops->get_xip_mem);
-
- blocksize = 1 << mapping->host->i_blkbits;
- length = offset & (blocksize - 1);
-
- /* Block boundary? Nothing to do */
- if (!length)
- return 0;
-
- length = blocksize - length;
-
- err = mapping->a_ops->get_xip_mem(mapping, index, 0,
- &xip_mem, &xip_pfn);
- if (unlikely(err)) {
- if (err == -ENODATA)
- /* Hole? No need to truncate */
- return 0;
- else
- return err;
- }
- memset(xip_mem + offset, 0, length);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xip_truncate_page);
diff --git a/mm/fremap.c b/mm/fremap.c
deleted file mode 100644
index 2805d71..0000000
--- a/mm/fremap.c
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * linux/mm/fremap.c
- *
- * Explicit pagetable population and nonlinear (random) mappings support.
- *
- * started by Ingo Molnar, Copyright (C) 2002, 2003
- */
-#include <linux/export.h>
-#include <linux/backing-dev.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/pagemap.h>
-#include <linux/swapops.h>
-#include <linux/rmap.h>
-#include <linux/syscalls.h>
-#include <linux/mmu_notifier.h>
-
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-#include "internal.h"
-
-static int mm_counter(struct page *page)
-{
- return PageAnon(page) ? MM_ANONPAGES : MM_FILEPAGES;
-}
-
-static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, pte_t *ptep)
-{
- pte_t pte = *ptep;
- struct page *page;
- swp_entry_t entry;
-
- if (pte_present(pte)) {
- flush_cache_page(vma, addr, pte_pfn(pte));
- pte = ptep_clear_flush_notify(vma, addr, ptep);
- page = vm_normal_page(vma, addr, pte);
- if (page) {
- if (pte_dirty(pte))
- set_page_dirty(page);
- update_hiwater_rss(mm);
- dec_mm_counter(mm, mm_counter(page));
- page_remove_rmap(page);
- page_cache_release(page);
- }
- } else { /* zap_pte() is not called when pte_none() */
- if (!pte_file(pte)) {
- update_hiwater_rss(mm);
- entry = pte_to_swp_entry(pte);
- if (non_swap_entry(entry)) {
- if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
- dec_mm_counter(mm, mm_counter(page));
- }
- } else {
- free_swap_and_cache(entry);
- dec_mm_counter(mm, MM_SWAPENTS);
- }
- }
- pte_clear_not_present_full(mm, addr, ptep, 0);
- }
-}
-
-/*
- * Install a file pte to a given virtual memory address, release any
- * previously existing mapping.
- */
-static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long addr, unsigned long pgoff, pgprot_t prot)
-{
- int err = -ENOMEM;
- pte_t *pte, ptfile;
- spinlock_t *ptl;
-
- pte = get_locked_pte(mm, addr, &ptl);
- if (!pte)
- goto out;
-
- ptfile = pgoff_to_pte(pgoff);
-
- if (!pte_none(*pte))
- zap_pte(mm, vma, addr, pte);
-
- set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
- /*
- * We don't need to run update_mmu_cache() here because the "file pte"
- * being installed by install_file_pte() is not a real pte - it's a
- * non-present entry (like a swap entry), noting what file offset should
- * be mapped there when there's a fault (in a non-linear vma where
- * that's not obvious).
- */
- pte_unmap_unlock(pte, ptl);
- err = 0;
-out:
- return err;
-}
-
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
- unsigned long size, pgoff_t pgoff)
-{
- struct mm_struct *mm = vma->vm_mm;
- int err;
-
- do {
- err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
- if (err)
- return err;
-
- size -= PAGE_SIZE;
- addr += PAGE_SIZE;
- pgoff++;
- } while (size);
-
- return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
-/**
- * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
- * @start: start of the remapped virtual memory range
- * @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range (see NOTE)
- * @pgoff: to-be-mapped page of the backing store file
- * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
- *
- * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
- * (shared backing store file).
- *
- * This syscall works purely via pagetables, so it's the most efficient
- * way to map the same (large) file into a given virtual window. Unlike
- * mmap()/mremap() it does not create any new vmas. The new mappings are
- * also safe across swapout.
- *
- * NOTE: the @prot parameter right now is ignored (but must be zero),
- * and the vma's default protection is used. Arbitrary protections
- * might be implemented in the future.
- */
-SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
- unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
-{
- struct mm_struct *mm = current->mm;
- struct address_space *mapping;
- struct vm_area_struct *vma;
- int err = -EINVAL;
- int has_write_lock = 0;
- vm_flags_t vm_flags = 0;
-
- pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
- "See Documentation/vm/remap_file_pages.txt.\n",
- current->comm, current->pid);
-
- if (prot)
- return err;
- /*
- * Sanitize the syscall parameters:
- */
- start = start & PAGE_MASK;
- size = size & PAGE_MASK;
-
- /* Does the address range wrap, or is the span zero-sized? */
- if (start + size <= start)
- return err;
-
- /* Does pgoff wrap? */
- if (pgoff + (size >> PAGE_SHIFT) < pgoff)
- return err;
-
- /* Can we represent this offset inside this architecture's pte's? */
-#if PTE_FILE_MAX_BITS < BITS_PER_LONG
- if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
- return err;
-#endif
-
- /* We need down_write() to change vma->vm_flags. */
- down_read(&mm->mmap_sem);
- retry:
- vma = find_vma(mm, start);
-
- /*
- * Make sure the vma is shared, that it supports prefaulting,
- * and that the remapped range is valid and fully within
- * the single existing vma.
- */
- if (!vma || !(vma->vm_flags & VM_SHARED))
- goto out;
-
- if (!vma->vm_ops || !vma->vm_ops->remap_pages)
- goto out;
-
- if (start < vma->vm_start || start + size > vma->vm_end)
- goto out;
-
- /* Must set VM_NONLINEAR before any pages are populated. */
- if (!(vma->vm_flags & VM_NONLINEAR)) {
- /*
- * vm_private_data is used as a swapout cursor
- * in a VM_NONLINEAR vma.
- */
- if (vma->vm_private_data)
- goto out;
-
- /* Don't need a nonlinear mapping, exit success */
- if (pgoff == linear_page_index(vma, start)) {
- err = 0;
- goto out;
- }
-
- if (!has_write_lock) {
-get_write_lock:
- up_read(&mm->mmap_sem);
- down_write(&mm->mmap_sem);
- has_write_lock = 1;
- goto retry;
- }
- mapping = vma->vm_file->f_mapping;
- /*
- * page_mkclean doesn't work on nonlinear vmas, so if
- * dirty pages need to be accounted, emulate with linear
- * vmas.
- */
- if (mapping_cap_account_dirty(mapping)) {
- unsigned long addr;
- struct file *file = get_file(vma->vm_file);
- /* mmap_region may free vma; grab the info now */
- vm_flags = vma->vm_flags;
-
- addr = mmap_region(file, start, size, vm_flags, pgoff);
- fput(file);
- if (IS_ERR_VALUE(addr)) {
- err = addr;
- } else {
- BUG_ON(addr != start);
- err = 0;
- }
- goto out_freed;
- }
- i_mmap_lock_write(mapping);
- flush_dcache_mmap_lock(mapping);
- vma->vm_flags |= VM_NONLINEAR;
- vma_interval_tree_remove(vma, &mapping->i_mmap);
- vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
- flush_dcache_mmap_unlock(mapping);
- i_mmap_unlock_write(mapping);
- }
-
- if (vma->vm_flags & VM_LOCKED) {
- /*
- * drop PG_Mlocked flag for over-mapped range
- */
- if (!has_write_lock)
- goto get_write_lock;
- vm_flags = vma->vm_flags;
- munlock_vma_pages_range(vma, start, start + size);
- vma->vm_flags = vm_flags;
- }
-
- mmu_notifier_invalidate_range_start(mm, start, start + size);
- err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
- mmu_notifier_invalidate_range_end(mm, start, start + size);
-
- /*
- * We can't clear VM_NONLINEAR because we'd have to do
- * it after ->populate completes, and that would prevent
- * downgrading the lock. (Locks can't be upgraded).
- */
-
-out:
- if (vma)
- vm_flags = vma->vm_flags;
-out_freed:
- if (likely(!has_write_lock))
- up_read(&mm->mmap_sem);
- else
- up_write(&mm->mmap_sem);
- if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
- mm_populate(start, size);
-
- return err;
-}
diff --git a/mm/gup.c b/mm/gup.c
index 8dd50ce..a6e24e2 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -55,7 +55,7 @@ retry:
*/
if (likely(!(flags & FOLL_MIGRATION)))
goto no_page;
- if (pte_none(pte) || pte_file(pte))
+ if (pte_none(pte))
goto no_page;
entry = pte_to_swp_entry(pte);
if (!is_migration_entry(entry))
@@ -64,7 +64,7 @@ retry:
migration_entry_wait(mm, pmd, address);
goto retry;
}
- if ((flags & FOLL_NUMA) && pte_numa(pte))
+ if ((flags & FOLL_NUMA) && pte_protnone(pte))
goto no_page;
if ((flags & FOLL_WRITE) && !pte_write(pte)) {
pte_unmap_unlock(ptep, ptl);
@@ -167,10 +167,10 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pud_none(*pud))
return no_page_table(vma, flags);
if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
- if (flags & FOLL_GET)
- return NULL;
- page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE);
- return page;
+ page = follow_huge_pud(mm, address, pud, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
}
if (unlikely(pud_bad(*pud)))
return no_page_table(vma, flags);
@@ -179,21 +179,12 @@ struct page *follow_page_mask(struct vm_area_struct *vma,
if (pmd_none(*pmd))
return no_page_table(vma, flags);
if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
- page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE);
- if (flags & FOLL_GET) {
- /*
- * Refcount on tail pages are not well-defined and
- * shouldn't be taken. The caller should handle a NULL
- * return when trying to follow tail pages.
- */
- if (PageHead(page))
- get_page(page);
- else
- page = NULL;
- }
- return page;
+ page = follow_huge_pmd(mm, address, pmd, flags);
+ if (page)
+ return page;
+ return no_page_table(vma, flags);
}
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
return no_page_table(vma, flags);
if (pmd_trans_huge(*pmd)) {
if (flags & FOLL_SPLIT) {
@@ -584,6 +575,185 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
return 0;
}
+static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
+ struct mm_struct *mm,
+ unsigned long start,
+ unsigned long nr_pages,
+ int write, int force,
+ struct page **pages,
+ struct vm_area_struct **vmas,
+ int *locked, bool notify_drop,
+ unsigned int flags)
+{
+ long ret, pages_done;
+ bool lock_dropped;
+
+ if (locked) {
+ /* if VM_FAULT_RETRY can be returned, vmas become invalid */
+ BUG_ON(vmas);
+ /* check caller initialized locked */
+ BUG_ON(*locked != 1);
+ }
+
+ if (pages)
+ flags |= FOLL_GET;
+ if (write)
+ flags |= FOLL_WRITE;
+ if (force)
+ flags |= FOLL_FORCE;
+
+ pages_done = 0;
+ lock_dropped = false;
+ for (;;) {
+ ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+ vmas, locked);
+ if (!locked)
+ /* VM_FAULT_RETRY couldn't trigger, bypass */
+ return ret;
+
+ /* VM_FAULT_RETRY cannot return errors */
+ if (!*locked) {
+ BUG_ON(ret < 0);
+ BUG_ON(ret >= nr_pages);
+ }
+
+ if (!pages)
+ /* If it's a prefault don't insist harder */
+ return ret;
+
+ if (ret > 0) {
+ nr_pages -= ret;
+ pages_done += ret;
+ if (!nr_pages)
+ break;
+ }
+ if (*locked) {
+ /* VM_FAULT_RETRY didn't trigger */
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+ /* VM_FAULT_RETRY triggered, so seek to the faulting offset */
+ pages += ret;
+ start += ret << PAGE_SHIFT;
+
+ /*
+ * Repeat on the address that fired VM_FAULT_RETRY
+ * without FAULT_FLAG_ALLOW_RETRY but with
+ * FAULT_FLAG_TRIED.
+ */
+ *locked = 1;
+ lock_dropped = true;
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+ pages, NULL, NULL);
+ if (ret != 1) {
+ BUG_ON(ret > 1);
+ if (!pages_done)
+ pages_done = ret;
+ break;
+ }
+ nr_pages--;
+ pages_done++;
+ if (!nr_pages)
+ break;
+ pages++;
+ start += PAGE_SIZE;
+ }
+ if (notify_drop && lock_dropped && *locked) {
+ /*
+ * We must let the caller know we temporarily dropped the lock
+ * and so the critical section protected by it was lost.
+ */
+ up_read(&mm->mmap_sem);
+ *locked = 0;
+ }
+ return pages_done;
+}
+
+/*
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
+ * get_user_pages_locked() is suitable to replace the form:
+ *
+ * down_read(&mm->mmap_sem);
+ * do_something()
+ * get_user_pages(tsk, mm, ..., pages, NULL);
+ * up_read(&mm->mmap_sem);
+ *
+ * to:
+ *
+ * int locked = 1;
+ * down_read(&mm->mmap_sem);
+ * do_something()
+ * get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ * if (locked)
+ * up_read(&mm->mmap_sem);
+ */
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked)
+{
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, NULL, locked, true, FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+/*
+ * Same as get_user_pages_unlocked(...., FOLL_TOUCH) but it allows to
+ * pass additional gup_flags as last parameter (like FOLL_HWPOISON).
+ *
+ * NOTE: here FOLL_TOUCH is not set implicitly and must be set by the
+ * caller if required (just like with __get_user_pages). "FOLL_GET",
+ * "FOLL_WRITE" and "FOLL_FORCE" are set implicitly as needed
+ * according to the parameters "pages", "write", "force"
+ * respectively.
+ */
+__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ unsigned int gup_flags)
+{
+ long ret;
+ int locked = 1;
+ down_read(&mm->mmap_sem);
+ ret = __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, NULL, &locked, false, gup_flags);
+ if (locked)
+ up_read(&mm->mmap_sem);
+ return ret;
+}
+EXPORT_SYMBOL(__get_user_pages_unlocked);
+
+/*
+ * get_user_pages_unlocked() is suitable to replace the form:
+ *
+ * down_read(&mm->mmap_sem);
+ * get_user_pages(tsk, mm, ..., pages, NULL);
+ * up_read(&mm->mmap_sem);
+ *
+ * with:
+ *
+ * get_user_pages_unlocked(tsk, mm, ..., pages);
+ *
+ * It is functionally equivalent to get_user_pages_fast so
+ * get_user_pages_fast should be used instead, if the two parameters
+ * "tsk" and "mm" are respectively equal to current and current->mm,
+ * or if "force" shall be set to 1 (get_user_pages_fast misses the
+ * "force" parameter).
+ */
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm,
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
+ force, pages, FOLL_TOUCH);
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
/*
* get_user_pages() - pin user pages in memory
* @tsk: the task_struct to use for page fault accounting, or
@@ -633,22 +803,18 @@ int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
* use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
+ *
+ * get_user_pages should be phased out in favor of
+ * get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
+ * should use get_user_pages because it cannot pass
+ * FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages, int write,
int force, struct page **pages, struct vm_area_struct **vmas)
{
- int flags = FOLL_TOUCH;
-
- if (pages)
- flags |= FOLL_GET;
- if (write)
- flags |= FOLL_WRITE;
- if (force)
- flags |= FOLL_FORCE;
-
- return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas,
- NULL);
+ return __get_user_pages_locked(tsk, mm, start, nr_pages, write, force,
+ pages, vmas, NULL, false, FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
@@ -740,10 +906,10 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
/*
* Similar to the PMD case below, NUMA hinting must take slow
- * path
+ * path using the pte_protnone check.
*/
if (!pte_present(pte) || pte_special(pte) ||
- pte_numa(pte) || (write && !pte_write(pte)))
+ pte_protnone(pte) || (write && !pte_write(pte)))
goto pte_unmap;
VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
@@ -926,7 +1092,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
pmdp = pmd_offset(&pud, addr);
do {
- pmd_t pmd = ACCESS_ONCE(*pmdp);
+ pmd_t pmd = READ_ONCE(*pmdp);
next = pmd_addr_end(addr, end);
if (pmd_none(pmd) || pmd_trans_splitting(pmd))
@@ -938,7 +1104,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
* slowpath for accounting purposes and so that they
* can be serialised against THP migration.
*/
- if (pmd_numa(pmd))
+ if (pmd_protnone(pmd))
return 0;
if (!gup_huge_pmd(pmd, pmdp, addr, next, write,
@@ -1077,10 +1243,8 @@ int get_user_pages_fast(unsigned long start, int nr_pages, int write,
start += nr << PAGE_SHIFT;
pages += nr;
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start,
- nr_pages - nr, write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
+ ret = get_user_pages_unlocked(current, mm, start,
+ nr_pages - nr, write, 0, pages);
/* Have to be a bit careful with return values */
if (nr > 0) {
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 817a875..fc00c8c 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -171,12 +171,7 @@ static int start_khugepaged(void)
}
static atomic_t huge_zero_refcount;
-static struct page *huge_zero_page __read_mostly;
-
-static inline bool is_huge_zero_page(struct page *page)
-{
- return ACCESS_ONCE(huge_zero_page) == page;
-}
+struct page *huge_zero_page __read_mostly;
static inline bool is_huge_zero_pmd(pmd_t pmd)
{
@@ -766,15 +761,6 @@ static inline gfp_t alloc_hugepage_gfpmask(int defrag, gfp_t extra_gfp)
return (GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT)) | extra_gfp;
}
-static inline struct page *alloc_hugepage_vma(int defrag,
- struct vm_area_struct *vma,
- unsigned long haddr, int nd,
- gfp_t extra_gfp)
-{
- return alloc_pages_vma(alloc_hugepage_gfpmask(defrag, extra_gfp),
- HPAGE_PMD_ORDER, vma, haddr, nd);
-}
-
/* Caller must hold page table lock. */
static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd,
@@ -795,6 +781,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pmd_t *pmd,
unsigned int flags)
{
+ gfp_t gfp;
struct page *page;
unsigned long haddr = address & HPAGE_PMD_MASK;
@@ -829,8 +816,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
}
return 0;
}
- page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
+ gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
if (unlikely(!page)) {
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1118,10 +1105,12 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
spin_unlock(ptl);
alloc:
if (transparent_hugepage_enabled(vma) &&
- !transparent_hugepage_debug_cow())
- new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma),
- vma, haddr, numa_node_id(), 0);
- else
+ !transparent_hugepage_debug_cow()) {
+ gfp_t gfp;
+
+ gfp = alloc_hugepage_gfpmask(transparent_hugepage_defrag(vma), 0);
+ new_page = alloc_hugepage_vma(gfp, vma, haddr, HPAGE_PMD_ORDER);
+ } else
new_page = NULL;
if (unlikely(!new_page)) {
@@ -1222,7 +1211,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
return ERR_PTR(-EFAULT);
/* Full NUMA hinting faults to serialise migration in fault paths */
- if ((flags & FOLL_NUMA) && pmd_numa(*pmd))
+ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
goto out;
page = pmd_page(*pmd);
@@ -1273,6 +1262,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
bool migrated = false;
int flags = 0;
+ /* A PROT_NONE fault should not end up here */
+ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
ptl = pmd_lock(mm, pmdp);
if (unlikely(!pmd_same(pmd, *pmdp)))
goto out_unlock;
@@ -1283,8 +1275,9 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
* check_same as the page may no longer be mapped.
*/
if (unlikely(pmd_trans_migrating(*pmdp))) {
+ page = pmd_page(*pmdp);
spin_unlock(ptl);
- wait_migrate_huge_page(vma->anon_vma, pmdp);
+ wait_on_page_locked(page);
goto out;
}
@@ -1352,7 +1345,7 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* Migrate the THP to the requested node, returns with page unlocked
- * and pmd_numa cleared.
+ * and access rights restored.
*/
spin_unlock(ptl);
migrated = migrate_misplaced_transhuge_page(mm, vma,
@@ -1365,9 +1358,8 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
clear_pmdnuma:
BUG_ON(!PageLocked(page));
- pmd = pmd_mknonnuma(pmd);
+ pmd = pmd_modify(pmd, vma->vm_page_prot);
set_pmd_at(mm, haddr, pmdp, pmd);
- VM_BUG_ON(pmd_numa(*pmdp));
update_mmu_cache_pmd(vma, addr, pmdp);
unlock_page(page);
out_unlock:
@@ -1423,26 +1415,6 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
return ret;
}
-int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- spinlock_t *ptl;
- int ret = 0;
-
- if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
- /*
- * All logical pages in the range are present
- * if backed by a huge page.
- */
- spin_unlock(ptl);
- memset(vec, 1, (end - addr) >> PAGE_SHIFT);
- ret = 1;
- }
-
- return ret;
-}
-
int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
unsigned long old_addr,
unsigned long new_addr, unsigned long old_end,
@@ -1510,29 +1482,24 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
if (__pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
pmd_t entry;
- ret = 1;
- if (!prot_numa) {
+
+ /*
+ * Avoid trapping faults against the zero page. The read-only
+ * data is likely to be read-cached on the local CPU and
+ * local/remote hits to the zero page are not interesting.
+ */
+ if (prot_numa && is_huge_zero_pmd(*pmd)) {
+ spin_unlock(ptl);
+ return 0;
+ }
+
+ if (!prot_numa || !pmd_protnone(*pmd)) {
+ ret = 1;
entry = pmdp_get_and_clear_notify(mm, addr, pmd);
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
entry = pmd_modify(entry, newprot);
ret = HPAGE_PMD_NR;
set_pmd_at(mm, addr, pmd, entry);
BUG_ON(pmd_write(entry));
- } else {
- struct page *page = pmd_page(*pmd);
-
- /*
- * Do not trap faults against the zero page. The
- * read-only data is likely to be read-cached on the
- * local CPU cache and it is less useful to know about
- * local vs remote hits on the zero page.
- */
- if (!is_huge_zero_page(page) &&
- !pmd_numa(*pmd)) {
- pmdp_set_numa(mm, addr, pmd);
- ret = HPAGE_PMD_NR;
- }
}
spin_unlock(ptl);
}
@@ -1797,9 +1764,9 @@ static int __split_huge_page_map(struct page *page,
pte_t *pte, entry;
BUG_ON(PageCompound(page+i));
/*
- * Note that pmd_numa is not transferred deliberately
- * to avoid any possibility that pte_numa leaks to
- * a PROT_NONE VMA by accident.
+ * Note that NUMA hinting access restrictions are not
+ * transferred to avoid any possibility of altering
+ * permissions across VMAs.
*/
entry = mk_pte(page + i, vma->vm_page_prot);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -2148,7 +2115,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
{
struct page *page;
pte_t *_pte;
- int referenced = 0, none = 0;
+ int none = 0;
+ bool referenced = false, writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
_pte++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
@@ -2158,7 +2126,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
else
goto out;
}
- if (!pte_present(pteval) || !pte_write(pteval))
+ if (!pte_present(pteval))
goto out;
page = vm_normal_page(vma, address, pteval);
if (unlikely(!page))
@@ -2168,9 +2136,6 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!PageAnon(page), page);
VM_BUG_ON_PAGE(!PageSwapBacked(page), page);
- /* cannot use mapcount: can't collapse if there's a gup pin */
- if (page_count(page) != 1)
- goto out;
/*
* We can do it before isolate_lru_page because the
* page can't be freed from under us. NOTE: PG_lock
@@ -2179,6 +2144,29 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
*/
if (!trylock_page(page))
goto out;
+
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ if (pte_write(pteval)) {
+ writable = true;
+ } else {
+ if (PageSwapCache(page) && !reuse_swap_page(page)) {
+ unlock_page(page);
+ goto out;
+ }
+ /*
+ * Page is not in the swap cache. It can be collapsed
+ * into a THP.
+ */
+ }
+
/*
* Isolate the page to avoid collapsing an hugepage
* currently in use by the VM.
@@ -2195,9 +2183,9 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
/* If there is no mapped pte young don't collapse the page */
if (pte_young(pteval) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
- referenced = 1;
+ referenced = true;
}
- if (likely(referenced))
+ if (likely(referenced && writable))
return 1;
out:
release_pte_pages(pte, _pte);
@@ -2550,11 +2538,12 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, referenced = 0, none = 0;
+ int ret = 0, none = 0;
struct page *page;
unsigned long _address;
spinlock_t *ptl;
int node = NUMA_NO_NODE;
+ bool writable = false, referenced = false;
VM_BUG_ON(address & ~HPAGE_PMD_MASK);
@@ -2573,8 +2562,11 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
else
goto out_unmap;
}
- if (!pte_present(pteval) || !pte_write(pteval))
+ if (!pte_present(pteval))
goto out_unmap;
+ if (pte_write(pteval))
+ writable = true;
+
page = vm_normal_page(vma, _address, pteval);
if (unlikely(!page))
goto out_unmap;
@@ -2591,14 +2583,18 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
VM_BUG_ON_PAGE(PageCompound(page), page);
if (!PageLRU(page) || PageLocked(page) || !PageAnon(page))
goto out_unmap;
- /* cannot use mapcount: can't collapse if there's a gup pin */
- if (page_count(page) != 1)
+ /*
+ * cannot use mapcount: can't collapse if there's a gup pin.
+ * The page must only be referenced by the scanned process
+ * and page swap cache.
+ */
+ if (page_count(page) != 1 + !!PageSwapCache(page))
goto out_unmap;
if (pte_young(pteval) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
- referenced = 1;
+ referenced = true;
}
- if (referenced)
+ if (referenced && writable)
ret = 1;
out_unmap:
pte_unmap_unlock(pte, ptl);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 85032de..0a9ac6c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -35,7 +35,7 @@
#include <linux/node.h>
#include "internal.h"
-unsigned long hugepages_treat_as_movable;
+int hugepages_treat_as_movable;
int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
@@ -2657,9 +2657,10 @@ again:
goto unlock;
/*
- * HWPoisoned hugepage is already unmapped and dropped reference
+ * Migrating hugepage or HWPoisoned hugepage is already
+ * unmapped and its refcount is dropped, so just clear pte here.
*/
- if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ if (unlikely(!pte_present(pte))) {
huge_pte_clear(mm, address, ptep);
goto unlock;
}
@@ -3134,6 +3135,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
struct page *pagecache_page = NULL;
struct hstate *h = hstate_vma(vma);
struct address_space *mapping;
+ int need_wait_lock = 0;
address &= huge_page_mask(h);
@@ -3172,6 +3174,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
ret = 0;
/*
+ * entry could be a migration/hwpoison entry at this point, so this
+ * check prevents the kernel from going below assuming that we have
+ * an active hugepage in pagecache. This goto expects the 2nd page fault,
+ * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
+ * handle it.
+ */
+ if (!pte_present(entry))
+ goto out_mutex;
+
+ /*
* If we are going to COW the mapping later, we examine the pending
* reservations for this page now. This will ensure that any
* allocations necessary to record that reservation occur outside the
@@ -3190,30 +3202,31 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}
+ ptl = huge_pte_lock(h, mm, ptep);
+
+ /* Check for a racing update before calling hugetlb_cow */
+ if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
+ goto out_ptl;
+
/*
* hugetlb_cow() requires page locks of pte_page(entry) and
* pagecache_page, so here we need take the former one
* when page != pagecache_page or !pagecache_page.
- * Note that locking order is always pagecache_page -> page,
- * so no worry about deadlock.
*/
page = pte_page(entry);
- get_page(page);
if (page != pagecache_page)
- lock_page(page);
-
- ptl = huge_pte_lockptr(h, mm, ptep);
- spin_lock(ptl);
- /* Check for a racing update before calling hugetlb_cow */
- if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
- goto out_ptl;
+ if (!trylock_page(page)) {
+ need_wait_lock = 1;
+ goto out_ptl;
+ }
+ get_page(page);
if (flags & FAULT_FLAG_WRITE) {
if (!huge_pte_write(entry)) {
ret = hugetlb_cow(mm, vma, address, ptep, entry,
pagecache_page, ptl);
- goto out_ptl;
+ goto out_put_page;
}
entry = huge_pte_mkdirty(entry);
}
@@ -3221,7 +3234,10 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (huge_ptep_set_access_flags(vma, address, ptep, entry,
flags & FAULT_FLAG_WRITE))
update_mmu_cache(vma, address, ptep);
-
+out_put_page:
+ if (page != pagecache_page)
+ unlock_page(page);
+ put_page(page);
out_ptl:
spin_unlock(ptl);
@@ -3229,12 +3245,17 @@ out_ptl:
unlock_page(pagecache_page);
put_page(pagecache_page);
}
- if (page != pagecache_page)
- unlock_page(page);
- put_page(page);
-
out_mutex:
mutex_unlock(&htlb_fault_mutex_table[hash]);
+ /*
+ * Generally it's safe to hold a refcount while waiting for a page lock.
+ * But here we only wait in order to defer the next page fault and avoid
+ * a busy loop; the page is not used again after it is unlocked and
+ * before we return from the current page fault. So we are safe from
+ * accessing a freed page, even though we wait here without a refcount.
+ */
+ if (need_wait_lock)
+ wait_on_page_locked(page);
return ret;
}
@@ -3364,7 +3385,26 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
spin_unlock(ptl);
continue;
}
- if (!huge_pte_none(huge_ptep_get(ptep))) {
+ pte = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+ spin_unlock(ptl);
+ continue;
+ }
+ if (unlikely(is_hugetlb_entry_migration(pte))) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (is_write_migration_entry(entry)) {
+ pte_t newpte;
+
+ make_migration_entry_read(&entry);
+ newpte = swp_entry_to_pte(entry);
+ set_huge_pte_at(mm, address, ptep, newpte);
+ pages++;
+ }
+ spin_unlock(ptl);
+ continue;
+ }
+ if (!huge_pte_none(pte)) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(huge_pte_modify(pte, newprot));
pte = arch_make_huge_pte(pte, vma, NULL, 0);
@@ -3558,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
if (saddr) {
spte = huge_pte_offset(svma->vm_mm, saddr);
if (spte) {
+ mm_inc_nr_pmds(mm);
get_page(virt_to_page(spte));
break;
}
@@ -3569,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
spin_lock(ptl);
- if (pud_none(*pud))
+ if (pud_none(*pud)) {
pud_populate(mm, pud,
(pmd_t *)((unsigned long)spte & PAGE_MASK));
- else
+ } else {
put_page(virt_to_page(spte));
+ mm_dec_nr_pmds(mm); /* lost the race: undo the charge taken when spte was found */
+ }
spin_unlock(ptl);
out:
pte = (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3604,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
pud_clear(pud);
put_page(virt_to_page(ptep));
+ mm_dec_nr_pmds(mm);
*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
return 1;
}
@@ -3660,42 +3704,64 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
return (pte_t *) pmd;
}
-struct page *
-follow_huge_pmd(struct mm_struct *mm, unsigned long address,
- pmd_t *pmd, int write)
-{
- struct page *page;
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
- page = pte_page(*(pte_t *)pmd);
- if (page)
- page += ((address & ~PMD_MASK) >> PAGE_SHIFT);
- return page;
+/*
+ * These functions are overridable if your architecture needs its own
+ * behavior.
+ */
+struct page * __weak
+follow_huge_addr(struct mm_struct *mm, unsigned long address,
+ int write)
+{
+ return ERR_PTR(-EINVAL);
}
-struct page *
-follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
+struct page * __weak
+follow_huge_pmd(struct mm_struct *mm, unsigned long address,
+ pmd_t *pmd, int flags)
{
- struct page *page;
-
- page = pte_page(*(pte_t *)pud);
- if (page)
- page += ((address & ~PUD_MASK) >> PAGE_SHIFT);
+ struct page *page = NULL;
+ spinlock_t *ptl;
+retry:
+ ptl = pmd_lockptr(mm, pmd);
+ spin_lock(ptl);
+ /*
+ * make sure that the address range covered by this pmd is not
+ * unmapped by other threads.
+ */
+ if (!pmd_huge(*pmd))
+ goto out;
+ if (pmd_present(*pmd)) {
+ page = pte_page(*(pte_t *)pmd) +
+ ((address & ~PMD_MASK) >> PAGE_SHIFT);
+ if (flags & FOLL_GET)
+ get_page(page);
+ } else {
+ if (is_hugetlb_entry_migration(huge_ptep_get((pte_t *)pmd))) {
+ spin_unlock(ptl);
+ __migration_entry_wait(mm, (pte_t *)pmd, ptl);
+ goto retry;
+ }
+ /*
+ * hwpoisoned entry is treated as no_page_table in
+ * follow_page_mask().
+ */
+ }
+out:
+ spin_unlock(ptl);
return page;
}
-#else /* !CONFIG_ARCH_WANT_GENERAL_HUGETLB */
-
-/* Can be overriden by architectures */
struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
- pud_t *pud, int write)
+ pud_t *pud, int flags)
{
- BUG();
- return NULL;
-}
+ if (flags & FOLL_GET)
+ return NULL;
-#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+ return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
+}
#ifdef CONFIG_MEMORY_FAILURE
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 037e1c0..6e00574 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -279,7 +279,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of,
return -EINVAL;
buf = strstrip(buf);
- ret = page_counter_memparse(buf, &nr_pages);
+ ret = page_counter_memparse(buf, "-1", &nr_pages);
if (ret)
return ret;
diff --git a/mm/internal.h b/mm/internal.h
index efad241..a96da5b 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -110,6 +110,28 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
*/
/*
+ * Structure for holding the mostly immutable allocation parameters passed
+ * between functions involved in allocations, including the alloc_pages*
+ * family of functions.
+ *
+ * nodemask, migratetype and high_zoneidx are initialized only once in
+ * __alloc_pages_nodemask() and then never change.
+ *
+ * zonelist, preferred_zone and classzone_idx are set first in
+ * __alloc_pages_nodemask() for the fast path, and might be later changed
+ * in __alloc_pages_slowpath(). All other functions pass the whole structure
+ * by a const pointer.
+ */
+struct alloc_context {
+ struct zonelist *zonelist;
+ nodemask_t *nodemask;
+ struct zone *preferred_zone;
+ int classzone_idx;
+ int migratetype;
+ enum zone_type high_zoneidx;
+};
+
+/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
*
@@ -329,8 +351,10 @@ extern int mminit_loglevel;
#define mminit_dprintk(level, prefix, fmt, arg...) \
do { \
if (level < mminit_loglevel) { \
- printk(level <= MMINIT_WARNING ? KERN_WARNING : KERN_DEBUG); \
- printk(KERN_CONT "mminit::" prefix " " fmt, ##arg); \
+ if (level <= MMINIT_WARNING) \
+ printk(KERN_WARNING "mminit::" prefix " " fmt, ##arg); \
+ else \
+ printk(KERN_DEBUG "mminit::" prefix " " fmt, ##arg); \
} \
} while (0)
diff --git a/mm/interval_tree.c b/mm/interval_tree.c
index 8da581f..f2c2492 100644
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
}
-INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
- unsigned long, shared.linear.rb_subtree_last,
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
+ unsigned long, shared.rb_subtree_last,
vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
/* Insert node immediately after prev in the interval tree */
@@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
VM_BUG_ON_VMA(vma_start_pgoff(node) != vma_start_pgoff(prev), node);
- if (!prev->shared.linear.rb.rb_right) {
+ if (!prev->shared.rb.rb_right) {
parent = prev;
- link = &prev->shared.linear.rb.rb_right;
+ link = &prev->shared.rb.rb_right;
} else {
- parent = rb_entry(prev->shared.linear.rb.rb_right,
- struct vm_area_struct, shared.linear.rb);
- if (parent->shared.linear.rb_subtree_last < last)
- parent->shared.linear.rb_subtree_last = last;
- while (parent->shared.linear.rb.rb_left) {
- parent = rb_entry(parent->shared.linear.rb.rb_left,
- struct vm_area_struct, shared.linear.rb);
- if (parent->shared.linear.rb_subtree_last < last)
- parent->shared.linear.rb_subtree_last = last;
+ parent = rb_entry(prev->shared.rb.rb_right,
+ struct vm_area_struct, shared.rb);
+ if (parent->shared.rb_subtree_last < last)
+ parent->shared.rb_subtree_last = last;
+ while (parent->shared.rb.rb_left) {
+ parent = rb_entry(parent->shared.rb.rb_left,
+ struct vm_area_struct, shared.rb);
+ if (parent->shared.rb_subtree_last < last)
+ parent->shared.rb_subtree_last = last;
}
- link = &parent->shared.linear.rb.rb_left;
+ link = &parent->shared.rb.rb_left;
}
- node->shared.linear.rb_subtree_last = last;
- rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
- rb_insert_augmented(&node->shared.linear.rb, root,
+ node->shared.rb_subtree_last = last;
+ rb_link_node(&node->shared.rb, &parent->shared.rb, link);
+ rb_insert_augmented(&node->shared.rb, root,
&vma_interval_tree_augment);
}
diff --git a/mm/iov_iter.c b/mm/iov_iter.c
index a1599ca..8277320 100644
--- a/mm/iov_iter.c
+++ b/mm/iov_iter.c
@@ -501,18 +501,31 @@ size_t iov_iter_single_seg_count(const struct iov_iter *i)
EXPORT_SYMBOL(iov_iter_single_seg_count);
void iov_iter_kvec(struct iov_iter *i, int direction,
- const struct kvec *iov, unsigned long nr_segs,
+ const struct kvec *kvec, unsigned long nr_segs,
size_t count)
{
BUG_ON(!(direction & ITER_KVEC));
i->type = direction;
- i->kvec = (struct kvec *)iov;
+ i->kvec = kvec;
i->nr_segs = nr_segs;
i->iov_offset = 0;
i->count = count;
}
EXPORT_SYMBOL(iov_iter_kvec);
+void iov_iter_bvec(struct iov_iter *i, int direction,
+ const struct bio_vec *bvec, unsigned long nr_segs,
+ size_t count)
+{
+ BUG_ON(!(direction & ITER_BVEC));
+ i->type = direction;
+ i->bvec = bvec;
+ i->nr_segs = nr_segs;
+ i->iov_offset = 0;
+ i->count = count;
+}
+EXPORT_SYMBOL(iov_iter_bvec);
+
unsigned long iov_iter_alignment(const struct iov_iter *i)
{
unsigned long res = 0;
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
new file mode 100644
index 0000000..bd837b8
--- /dev/null
+++ b/mm/kasan/Makefile
@@ -0,0 +1,8 @@
+KASAN_SANITIZE := n
+
+CFLAGS_REMOVE_kasan.o = -pg
+# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
+# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
+CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+
+obj-y := kasan.o report.o
diff --git a/mm/kasan/kasan.c b/mm/kasan/kasan.c
new file mode 100644
index 0000000..78fee63
--- /dev/null
+++ b/mm/kasan/kasan.c
@@ -0,0 +1,516 @@
+/*
+ * This file contains shadow memory manipulation code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some code borrowed from https://github.com/xairy/linux by
+ * Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'address'.
+ * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
+ */
+static void kasan_poison_shadow(const void *address, size_t size, u8 value)
+{
+ void *shadow_start, *shadow_end;
+
+ shadow_start = kasan_mem_to_shadow(address);
+ shadow_end = kasan_mem_to_shadow(address + size);
+
+ memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size)
+{
+ kasan_poison_shadow(address, size, 0);
+
+ if (size & KASAN_SHADOW_MASK) {
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+ *shadow = size & KASAN_SHADOW_MASK;
+ }
+}
+
+
+/*
+ * All functions below are always inlined so the compiler can
+ * perform better optimizations in each of __asan_loadX/__asan_storeX
+ * depending on memory access size X.
+ */
+
+static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+{
+ s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(shadow_value)) {
+ s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+ return unlikely(last_accessible_byte >= shadow_value);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_2(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(*shadow_addr)) {
+ if (memory_is_poisoned_1(addr + 1))
+ return true;
+
+ if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
+ return false;
+
+ return unlikely(*(u8 *)shadow_addr);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_4(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(*shadow_addr)) {
+ if (memory_is_poisoned_1(addr + 3))
+ return true;
+
+ if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
+ return false;
+
+ return unlikely(*(u8 *)shadow_addr);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_8(unsigned long addr)
+{
+ u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(*shadow_addr)) {
+ if (memory_is_poisoned_1(addr + 7))
+ return true;
+
+ if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
+ return false;
+
+ return unlikely(*(u8 *)shadow_addr);
+ }
+
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+{
+ u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
+
+ if (unlikely(*shadow_addr)) {
+ u16 shadow_first_bytes = *(u16 *)shadow_addr;
+ s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK;
+
+ if (unlikely(shadow_first_bytes))
+ return true;
+
+ if (likely(!last_byte))
+ return false;
+
+ return memory_is_poisoned_1(addr + 15);
+ }
+
+ return false;
+}
+
+static __always_inline unsigned long bytes_is_zero(const u8 *start,
+ size_t size)
+{
+ while (size) {
+ if (unlikely(*start))
+ return (unsigned long)start;
+ start++;
+ size--;
+ }
+
+ return 0;
+}
+
+static __always_inline unsigned long memory_is_zero(const void *start,
+ const void *end)
+{
+ unsigned int words;
+ unsigned long ret;
+ unsigned int prefix = (unsigned long)start % 8;
+
+ if (end - start <= 16)
+ return bytes_is_zero(start, end - start);
+
+ if (prefix) {
+ prefix = 8 - prefix;
+ ret = bytes_is_zero(start, prefix);
+ if (unlikely(ret))
+ return ret;
+ start += prefix;
+ }
+
+ words = (end - start) / 8;
+ while (words) {
+ if (unlikely(*(u64 *)start))
+ return bytes_is_zero(start, 8);
+ start += 8;
+ words--;
+ }
+
+ return bytes_is_zero(start, (end - start) % 8);
+}
+
+static __always_inline bool memory_is_poisoned_n(unsigned long addr,
+ size_t size)
+{
+ unsigned long ret;
+
+ ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
+ kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+
+ if (unlikely(ret)) {
+ unsigned long last_byte = addr + size - 1;
+ s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+
+ if (unlikely(ret != (unsigned long)last_shadow ||
+ ((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ return true;
+ }
+ return false;
+}
+
+static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+{
+ if (__builtin_constant_p(size)) {
+ switch (size) {
+ case 1:
+ return memory_is_poisoned_1(addr);
+ case 2:
+ return memory_is_poisoned_2(addr);
+ case 4:
+ return memory_is_poisoned_4(addr);
+ case 8:
+ return memory_is_poisoned_8(addr);
+ case 16:
+ return memory_is_poisoned_16(addr);
+ default:
+ BUILD_BUG();
+ }
+ }
+
+ return memory_is_poisoned_n(addr, size);
+}
+
+
+static __always_inline void check_memory_region(unsigned long addr,
+ size_t size, bool write)
+{
+ struct kasan_access_info info;
+
+ if (unlikely(size == 0))
+ return;
+
+ if (unlikely((void *)addr <
+ kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+ info.access_addr = (void *)addr;
+ info.access_size = size;
+ info.is_write = write;
+ info.ip = _RET_IP_;
+ kasan_report_user_access(&info);
+ return;
+ }
+
+ if (likely(!memory_is_poisoned(addr, size)))
+ return;
+
+ kasan_report(addr, size, write, _RET_IP_);
+}
+
+void __asan_loadN(unsigned long addr, size_t size);
+void __asan_storeN(unsigned long addr, size_t size);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+ __asan_storeN((unsigned long)addr, len);
+
+ return __memset(addr, c, len);
+}
+
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+ __asan_loadN((unsigned long)src, len);
+ __asan_storeN((unsigned long)dest, len);
+
+ return __memmove(dest, src, len);
+}
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ __asan_loadN((unsigned long)src, len);
+ __asan_storeN((unsigned long)dest, len);
+
+ return __memcpy(dest, src, len);
+}
+
+void kasan_alloc_pages(struct page *page, unsigned int order)
+{
+ if (likely(!PageHighMem(page)))
+ kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+ if (likely(!PageHighMem(page)))
+ kasan_poison_shadow(page_address(page),
+ PAGE_SIZE << order,
+ KASAN_FREE_PAGE);
+}
+
+void kasan_poison_slab(struct page *page)
+{
+ kasan_poison_shadow(page_address(page),
+ PAGE_SIZE << compound_order(page),
+ KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+{
+ kasan_unpoison_shadow(object, cache->object_size);
+}
+
+void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+{
+ kasan_poison_shadow(object,
+ round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
+ KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_slab_alloc(struct kmem_cache *cache, void *object)
+{
+ kasan_kmalloc(cache, object, cache->object_size);
+}
+
+void kasan_slab_free(struct kmem_cache *cache, void *object)
+{
+ unsigned long size = cache->object_size;
+ unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+
+ /* RCU slabs could be legally used after free within the RCU period */
+ if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+ return;
+
+ kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
+void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
+{
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+
+ if (unlikely(object == NULL))
+ return;
+
+ redzone_start = round_up((unsigned long)(object + size),
+ KASAN_SHADOW_SCALE_SIZE);
+ redzone_end = round_up((unsigned long)object + cache->object_size,
+ KASAN_SHADOW_SCALE_SIZE);
+
+ kasan_unpoison_shadow(object, size);
+ kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_KMALLOC_REDZONE);
+}
+EXPORT_SYMBOL(kasan_kmalloc);
+
+void kasan_kmalloc_large(const void *ptr, size_t size)
+{
+ struct page *page;
+ unsigned long redzone_start;
+ unsigned long redzone_end;
+
+ if (unlikely(ptr == NULL))
+ return;
+
+ page = virt_to_page(ptr);
+ redzone_start = round_up((unsigned long)(ptr + size),
+ KASAN_SHADOW_SCALE_SIZE);
+ redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+
+ kasan_unpoison_shadow(ptr, size);
+ kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_PAGE_REDZONE);
+}
+
+void kasan_krealloc(const void *object, size_t size)
+{
+ struct page *page;
+
+ if (unlikely(object == ZERO_SIZE_PTR))
+ return;
+
+ page = virt_to_head_page(object);
+
+ if (unlikely(!PageSlab(page)))
+ kasan_kmalloc_large(object, size);
+ else
+ kasan_kmalloc(page->slab_cache, object, size);
+}
+
+void kasan_kfree_large(const void *ptr)
+{
+ struct page *page = virt_to_page(ptr);
+
+ kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+ KASAN_FREE_PAGE);
+}
+
+int kasan_module_alloc(void *addr, size_t size)
+{
+ void *ret;
+ size_t shadow_size;
+ unsigned long shadow_start;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+ shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
+ PAGE_SIZE);
+
+ if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+ return -EINVAL;
+
+ ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+ shadow_start + shadow_size,
+ GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+ PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+ __builtin_return_address(0));
+ return ret ? 0 : -ENOMEM;
+}
+
+void kasan_module_free(void *addr)
+{
+ vfree(kasan_mem_to_shadow(addr));
+}
+
+static void register_global(struct kasan_global *global)
+{
+ size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+
+ kasan_unpoison_shadow(global->beg, global->size);
+
+ kasan_poison_shadow(global->beg + aligned_size,
+ global->size_with_redzone - aligned_size,
+ KASAN_GLOBAL_REDZONE);
+}
+
+void __asan_register_globals(struct kasan_global *globals, size_t size)
+{
+ int i;
+
+ for (i = 0; i < size; i++)
+ register_global(&globals[i]);
+}
+EXPORT_SYMBOL(__asan_register_globals);
+
+void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+{
+}
+EXPORT_SYMBOL(__asan_unregister_globals);
+
+#define DEFINE_ASAN_LOAD_STORE(size) \
+ void __asan_load##size(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, false); \
+ } \
+ EXPORT_SYMBOL(__asan_load##size); \
+ __alias(__asan_load##size) \
+ void __asan_load##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_load##size##_noabort); \
+ void __asan_store##size(unsigned long addr) \
+ { \
+ check_memory_region(addr, size, true); \
+ } \
+ EXPORT_SYMBOL(__asan_store##size); \
+ __alias(__asan_store##size) \
+ void __asan_store##size##_noabort(unsigned long); \
+ EXPORT_SYMBOL(__asan_store##size##_noabort)
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+DEFINE_ASAN_LOAD_STORE(16);
+
+void __asan_loadN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, false);
+}
+EXPORT_SYMBOL(__asan_loadN);
+
+__alias(__asan_loadN)
+void __asan_loadN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_loadN_noabort);
+
+void __asan_storeN(unsigned long addr, size_t size)
+{
+ check_memory_region(addr, size, true);
+}
+EXPORT_SYMBOL(__asan_storeN);
+
+__alias(__asan_storeN)
+void __asan_storeN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_storeN_noabort);
+
+/* to shut up compiler complaints */
+void __asan_handle_no_return(void) {}
+EXPORT_SYMBOL(__asan_handle_no_return);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int kasan_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+ pr_err("WARNING: KASan doesn't support memory hot-add\n");
+ pr_err("Memory hot-add will be disabled\n");
+
+ hotplug_memory_notifier(kasan_mem_notifier, 0);
+
+ return 0;
+}
+
+module_init(kasan_memhotplug_init);
+#endif
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
new file mode 100644
index 0000000..4986b0a
--- /dev/null
+++ b/mm/kasan/kasan.h
@@ -0,0 +1,75 @@
+#ifndef __MM_KASAN_KASAN_H
+#define __MM_KASAN_KASAN_H
+
+#include <linux/kasan.h>
+
+#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
+#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
+
+/*
+ * Shadow byte values that the report code maps to a bug class
+ * (each value is defined exactly once).
+ */
+#define KASAN_FREE_PAGE 0xFF /* page was freed */
+#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
+#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
+#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
+
+/*
+ * Stack redzone shadow values
+ * (Those are compiler's ABI, don't change them)
+ */
+#define KASAN_STACK_LEFT 0xF1
+#define KASAN_STACK_MID 0xF2
+#define KASAN_STACK_RIGHT 0xF3
+#define KASAN_STACK_PARTIAL 0xF4
+
+/* Don't break randconfig/all*config builds */
+#ifndef KASAN_ABI_VERSION
+#define KASAN_ABI_VERSION 1
+#endif
+
+struct kasan_access_info { /* one bad access, as handed to the report functions */
+ const void *access_addr; /* address of the reported access */
+ const void *first_bad_addr; /* filled in by the report code from find_first_bad_addr() */
+ size_t access_size; /* size of the access, printed as "of size %zu" */
+ bool is_write; /* true is reported as "Write", false as "Read" */
+ unsigned long ip; /* code address printed with %pS in the report header */
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_source_location {
+ const char *filename; /* source file name */
+ int line_no; /* line number within filename */
+ int column_no; /* column number within the line */
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_global {
+ const void *beg; /* Address of the beginning of the global variable. */
+ size_t size; /* Size of the global variable. */
+ size_t size_with_redzone; /* Size of the variable + size of the red zone. 32 bytes aligned */
+ const void *name; /* presumably the variable's name -- TODO confirm against the compiler ABI */
+ const void *module_name; /* Name of the module where the global variable is declared. */
+ unsigned long has_dynamic_init; /* This is needed for C++ */
+#if KASAN_ABI_VERSION >= 4
+ struct kasan_source_location *location; /* member exists only when KASAN_ABI_VERSION >= 4 */
+#endif
+};
+
+void kasan_report_error(struct kasan_access_info *info);
+void kasan_report_user_access(struct kasan_access_info *info);
+
+/*
+ * Convert a shadow address back into the memory address it covers:
+ * subtract KASAN_SHADOW_OFFSET, then shift left by
+ * KASAN_SHADOW_SCALE_SHIFT.
+ */
+static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
+{
+	unsigned long offset = (unsigned long)shadow_addr - KASAN_SHADOW_OFFSET;
+
+	return (void *)(offset << KASAN_SHADOW_SCALE_SHIFT);
+}
+
+/* True only while the current task's kasan_depth counter is zero. */
+static inline bool kasan_enabled(void)
+{
+	return current->kasan_depth == 0;
+}
+
+void kasan_report(unsigned long addr, size_t size,
+ bool is_write, unsigned long ip);
+
+#endif
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
new file mode 100644
index 0000000..680ceed
--- /dev/null
+++ b/mm/kasan/report.c
@@ -0,0 +1,269 @@
+/*
+ * This file contains error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some of code borrowed from https://github.com/xairy/linux by
+ * Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/* Shadow layout customization. */
+#define SHADOW_BYTES_PER_BLOCK 1
+#define SHADOW_BLOCKS_PER_ROW 16
+#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
+#define SHADOW_ROWS_AROUND_ADDR 2
+
+/*
+ * Step forward from @addr in KASAN_SHADOW_SCALE_SIZE increments until the
+ * shadow byte for the current position is non-zero or the position reaches
+ * @addr + @size. Returns the position at which the walk stopped.
+ */
+static const void *find_first_bad_addr(const void *addr, size_t size)
+{
+	const void *end = addr + size;
+	const void *cur = addr;
+	u8 shadow = *(u8 *)kasan_mem_to_shadow(cur);
+
+	while (shadow == 0 && cur < end) {
+		cur += KASAN_SHADOW_SCALE_SIZE;
+		shadow = *(u8 *)kasan_mem_to_shadow(cur);
+	}
+
+	return cur;
+}
+
+/* Map a shadow byte to the bug class named in the report header. */
+static const char *shadow_bug_type(u8 shadow_val)
+{
+	switch (shadow_val) {
+	case KASAN_FREE_PAGE:
+	case KASAN_KMALLOC_FREE:
+		return "use after free";
+	case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+	case KASAN_PAGE_REDZONE:
+	case KASAN_KMALLOC_REDZONE:
+	case KASAN_GLOBAL_REDZONE:
+		return "out of bounds access";
+	case KASAN_STACK_LEFT:
+	case KASAN_STACK_MID:
+	case KASAN_STACK_RIGHT:
+	case KASAN_STACK_PARTIAL:
+		return "out of bounds on stack";
+	default:
+		return "unknown crash";
+	}
+}
+
+/*
+ * Print the two header lines of a report: the bug class with the code
+ * location and accessed address, then the direction, size and task.
+ *
+ * Side effect: stores the result of find_first_bad_addr() in
+ * @info->first_bad_addr.
+ */
+static void print_error_description(struct kasan_access_info *info)
+{
+	const char *direction = info->is_write ? "Write" : "Read";
+	const char *bug_type;
+
+	info->first_bad_addr = find_first_bad_addr(info->access_addr,
+						info->access_size);
+
+	/* The bug class is derived from the shadow byte of the first bad address. */
+	bug_type = shadow_bug_type(
+			*(u8 *)kasan_mem_to_shadow(info->first_bad_addr));
+
+	pr_err("BUG: KASan: %s in %pS at addr %p\n",
+		bug_type, (void *)info->ip,
+		info->access_addr);
+	pr_err("%s of size %zu by task %s/%d\n",
+		direction,
+		info->access_size, current->comm, task_pid_nr(current));
+}
+
+/*
+ * True if @addr lies in [_stext, _end) or in [MODULES_VADDR, MODULES_END).
+ */
+static inline bool kernel_or_module_addr(const void *addr)
+{
+	if (addr >= (void *)_stext && addr < (void *)_end)
+		return true;
+
+	return addr >= (void *)MODULES_VADDR && addr < (void *)MODULES_END;
+}
+
+/*
+ * True if @addr falls within init_thread_union.stack. Note that the upper
+ * bound is inclusive: the address one past the end of the array also
+ * matches.
+ */
+static inline bool init_task_stack_addr(const void *addr)
+{
+	const void *stack_lo = (void *)&init_thread_union.stack;
+	const void *stack_hi = stack_lo + sizeof(init_thread_union.stack);
+
+	return addr >= stack_lo && addr <= stack_hi;
+}
+
+/*
+ * Describe what the accessed address belongs to.
+ *
+ * For an address in [PAGE_OFFSET, high_memory): a slab page is reported via
+ * object_err() and the function returns; any other page is dumped with
+ * dump_page(). A kernel or module address outside the init task stack is
+ * reported as a variable. Except on the slab path, the function ends with a
+ * stack dump.
+ */
+static void print_address_description(struct kasan_access_info *info)
+{
+	const void *addr = info->access_addr;
+	bool in_lowmem = addr >= (void *)PAGE_OFFSET && addr < high_memory;
+
+	if (in_lowmem) {
+		struct page *page = virt_to_head_page(addr);
+
+		if (PageSlab(page)) {
+			struct kmem_cache *cache = page->slab_cache;
+			void *base = page_address(page);
+			void *object = virt_to_obj(cache, base, addr);
+			void *last_object = base + page->objects * cache->size;
+
+			/* An address that hit the padding is clamped to last_object. */
+			if (unlikely(object > last_object))
+				object = last_object;
+
+			object_err(cache, page, object,
+				"kasan: bad access detected");
+			return;
+		}
+		dump_page(page, "kasan: bad access detected");
+	}
+
+	if (kernel_or_module_addr(addr) && !init_task_stack_addr(addr))
+		pr_err("Address belongs to variable %pS\n", addr);
+
+	dump_stack();
+}
+
+/* True if @guilty lies within the SHADOW_BYTES_PER_ROW bytes starting at @row. */
+static bool row_is_guilty(const void *row, const void *guilty)
+{
+	const void *row_end = row + SHADOW_BYTES_PER_ROW;
+
+	return guilty >= row && guilty < row_end;
+}
+
+/*
+ * Field width used to print the '^' marker under the shadow byte @shadow in
+ * the row that starts at @row.
+ */
+static int shadow_pointer_offset(const void *row, const void *shadow)
+{
+	/*
+	 * A row prefix such as ">ff00ff00ff00ff00: " is
+	 * 3 + (BITS_PER_LONG/8)*2 characters long.
+	 */
+	const int prefix_len = 3 + (BITS_PER_LONG/8)*2;
+	long idx = shadow - row;
+
+	/* Two hex digits per byte, plus one separator per block, plus one. */
+	return prefix_len + idx*2 + idx / SHADOW_BYTES_PER_BLOCK + 1;
+}
+
+/*
+ * Hex-dump the shadow rows around the shadow byte of @addr:
+ * SHADOW_ROWS_AROUND_ADDR rows before, the row containing it (prefixed with
+ * '>'), and SHADOW_ROWS_AROUND_ADDR rows after. A '^' line is printed under
+ * the row that holds the shadow byte.
+ */
+static void print_shadow_for_address(const void *addr)
+{
+	const void *shadow = kasan_mem_to_shadow(addr);
+	const void *row;
+	int idx;
+
+	/* Align down to a row boundary, then back up by the leading rows. */
+	row = (void *)round_down((unsigned long)shadow, SHADOW_BYTES_PER_ROW);
+	row -= SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+
+	pr_err("Memory state around the buggy address:\n");
+
+	for (idx = -SHADOW_ROWS_AROUND_ADDR; idx <= SHADOW_ROWS_AROUND_ADDR;
+	     idx++, row += SHADOW_BYTES_PER_ROW) {
+		char prefix[4 + (BITS_PER_LONG/8)*2];
+		const void *kaddr = kasan_shadow_to_mem(row);
+
+		snprintf(prefix, sizeof(prefix),
+			(idx == 0) ? ">%p: " : " %p: ", kaddr);
+
+		/* The dump itself is bracketed by disable/enable for the current task. */
+		kasan_disable_current();
+		print_hex_dump(KERN_ERR, prefix,
+			DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
+			row, SHADOW_BYTES_PER_ROW, 0);
+		kasan_enable_current();
+
+		if (!row_is_guilty(row, shadow))
+			continue;
+
+		pr_err("%*c\n",
+			shadow_pointer_offset(row, shadow),
+			'^');
+	}
+}
+
+/* Taken, with interrupts saved and disabled, around every report printed below. */
+static DEFINE_SPINLOCK(report_lock);
+
+/*
+ * Print a full report for @info between two separator lines: the error
+ * description, the address description, and the shadow dump around
+ * @info->first_bad_addr (which the error description step fills in).
+ */
+void kasan_report_error(struct kasan_access_info *info)
+{
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&report_lock, irq_flags);
+
+	pr_err("================================="
+		"=================================\n");
+
+	print_error_description(info);
+	print_address_description(info);
+	print_shadow_for_address(info->first_bad_addr);
+
+	pr_err("================================="
+		"=================================\n");
+
+	spin_unlock_irqrestore(&report_lock, irq_flags);
+}
+
+/*
+ * Print a user-memory-access report for @info between two separator lines:
+ * the address, the direction/size/task line, and a stack dump. Runs under
+ * report_lock with interrupts saved and disabled.
+ */
+void kasan_report_user_access(struct kasan_access_info *info)
+{
+	const char *direction = info->is_write ? "Write" : "Read";
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&report_lock, irq_flags);
+
+	pr_err("================================="
+		"=================================\n");
+	pr_err("BUG: KASan: user-memory-access on address %p\n",
+		info->access_addr);
+	pr_err("%s of size %zu by task %s/%d\n",
+		direction,
+		info->access_size, current->comm, task_pid_nr(current));
+	dump_stack();
+	pr_err("================================="
+		"=================================\n");
+
+	spin_unlock_irqrestore(&report_lock, irq_flags);
+}
+
+/*
+ * Package an access (@addr, @size, @is_write, @ip) into a
+ * kasan_access_info and pass it to kasan_report_error(). Returns without
+ * reporting when kasan_enabled() is false for the current task.
+ */
+void kasan_report(unsigned long addr, size_t size,
+		bool is_write, unsigned long ip)
+{
+	struct kasan_access_info info = {
+		.access_addr	= (void *)addr,
+		.access_size	= size,
+		.is_write	= is_write,
+		.ip		= ip,
+	};
+
+	if (likely(!kasan_enabled()))
+		return;
+
+	kasan_report_error(&info);
+}
+
+
+/*
+ * Generators for the fixed-width report entry points. Each generated
+ * __asan_report_{load,store}<N>_noabort() passes the address, the width N,
+ * the read/write flag and _RET_IP_ to kasan_report(), and is exported.
+ */
+#define DEFINE_ASAN_REPORT_LOAD(sz)					\
+void __asan_report_load##sz##_noabort(unsigned long addr)		\
+{									\
+	kasan_report(addr, sz, false, _RET_IP_);			\
+}									\
+EXPORT_SYMBOL(__asan_report_load##sz##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(sz)					\
+void __asan_report_store##sz##_noabort(unsigned long addr)		\
+{									\
+	kasan_report(addr, sz, true, _RET_IP_);				\
+}									\
+EXPORT_SYMBOL(__asan_report_store##sz##_noabort)
+
+/* Instantiate both directions for the 1, 2, 4, 8 and 16 byte widths. */
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+/* Variable-size read report: forwards to kasan_report() with is_write false. */
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+	const bool is_write = false;
+
+	kasan_report(addr, size, is_write, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+/* Variable-size write report: forwards to kasan_report() with is_write true. */
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+	const bool is_write = true;
+
+	kasan_report(addr, size, is_write, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index 3cda50c..5405aff 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -98,6 +98,7 @@
#include <asm/processor.h>
#include <linux/atomic.h>
+#include <linux/kasan.h>
#include <linux/kmemcheck.h>
#include <linux/kmemleak.h>
#include <linux/memory_hotplug.h>
@@ -1113,7 +1114,10 @@ static bool update_checksum(struct kmemleak_object *object)
if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
return false;
+ kasan_disable_current();
object->checksum = crc32(0, (void *)object->pointer, object->size);
+ kasan_enable_current();
+
return object->checksum != old_csum;
}
@@ -1164,7 +1168,9 @@ static void scan_block(void *_start, void *_end,
BYTES_PER_POINTER))
continue;
+ kasan_disable_current();
pointer = *ptr;
+ kasan_enable_current();
object = find_and_get_object(pointer, 1);
if (!object)
diff --git a/mm/ksm.c b/mm/ksm.c
index 15647fb..4162dce 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -1748,7 +1748,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
*/
if (*vm_flags & (VM_MERGEABLE | VM_SHARED | VM_MAYSHARE |
VM_PFNMAP | VM_IO | VM_DONTEXPAND |
- VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
+ VM_HUGETLB | VM_MIXEDMAP))
return 0; /* just ignore the advice */
#ifdef VM_SAO
diff --git a/mm/list_lru.c b/mm/list_lru.c
index f1a0db1..909eca2 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,18 +9,100 @@
#include <linux/mm.h>
#include <linux/list_lru.h>
#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/memcontrol.h>
+
+#ifdef CONFIG_MEMCG_KMEM
+/* Global list of registered list_lrus; guarded by list_lrus_mutex. */
+static LIST_HEAD(list_lrus);
+static DEFINE_MUTEX(list_lrus_mutex);
+
+/* Add @lru to the global list under the mutex. */
+static void list_lru_register(struct list_lru *lru)
+{
+	mutex_lock(&list_lrus_mutex);
+	list_add(&lru->list, &list_lrus);
+	mutex_unlock(&list_lrus_mutex);
+}
+
+/* Remove @lru from the global list under the mutex. */
+static void list_lru_unregister(struct list_lru *lru)
+{
+	mutex_lock(&list_lrus_mutex);
+	list_del(&lru->list);
+	mutex_unlock(&list_lrus_mutex);
+}
+#else
+/* Without CONFIG_MEMCG_KMEM there is no global list: both are no-ops. */
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_MEMCG_KMEM
+/* An lru counts as memcg aware when node 0 has a per-memcg array. */
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+	return lru->node[0].memcg_lrus != NULL;
+}
+
+/*
+ * Pick the list for memcg index @idx on @nlru. Falls back to the node's own
+ * list when the node has no per-memcg array or @idx is negative.
+ */
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+	/*
+	 * The lock protects the array of per cgroup lists from relocation
+	 * (see memcg_update_list_lru_node).
+	 */
+	lockdep_assert_held(&nlru->lock);
+
+	if (!nlru->memcg_lrus || idx < 0)
+		return &nlru->lru;
+
+	return nlru->memcg_lrus->lru[idx];
+}
+
+/*
+ * Pick the list for the object at @ptr: the list of the memcg returned by
+ * mem_cgroup_from_kmem(), or the node's own list when the node has no
+ * per-memcg array or no memcg is returned.
+ */
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+	struct mem_cgroup *memcg = NULL;
+
+	if (nlru->memcg_lrus)
+		memcg = mem_cgroup_from_kmem(ptr);
+
+	if (!memcg)
+		return &nlru->lru;
+
+	return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+}
+#else
+/* !CONFIG_MEMCG_KMEM: no lru is ever memcg aware. */
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+	return false;
+}
+
+/* !CONFIG_MEMCG_KMEM: @idx is ignored, the node's own list is the only list. */
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+	return &nlru->lru;
+}
+
+/* !CONFIG_MEMCG_KMEM: @ptr is ignored, the node's own list is the only list. */
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+	return &nlru->lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
bool list_lru_add(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l = list_lru_from_kmem(nlru, item);
if (list_empty(item)) {
- list_add_tail(item, &nlru->list);
- if (nlru->nr_items++ == 0)
- node_set(nid, lru->active_nodes);
+ list_add_tail(item, &l->list);
+ l->nr_items++;
spin_unlock(&nlru->lock);
return true;
}
@@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
{
int nid = page_to_nid(virt_to_page(item));
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
spin_lock(&nlru->lock);
+ l = list_lru_from_kmem(nlru, item);
if (!list_empty(item)) {
list_del_init(item);
- if (--nlru->nr_items == 0)
- node_clear(nid, lru->active_nodes);
- WARN_ON_ONCE(nlru->nr_items < 0);
+ l->nr_items--;
spin_unlock(&nlru->lock);
return true;
}
@@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
}
EXPORT_SYMBOL_GPL(list_lru_del);
-unsigned long
-list_lru_count_node(struct list_lru *lru, int nid)
+/* Unlink @item (leaving it re-initialised) and drop @list's item count by one. */
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
+{
+	list->nr_items--;
+	list_del_init(item);
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate);
+
+/* Move @item onto @head and drop @list's item count by one. */
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+			   struct list_head *head)
+{
+	list->nr_items--;
+	list_move(item, head);
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate_move);
+
+static unsigned long __list_lru_count_one(struct list_lru *lru,
+ int nid, int memcg_idx)
{
- unsigned long count = 0;
struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
+ unsigned long count;
spin_lock(&nlru->lock);
- WARN_ON_ONCE(nlru->nr_items < 0);
- count += nlru->nr_items;
+ l = list_lru_from_memcg_idx(nlru, memcg_idx);
+ count = l->nr_items;
spin_unlock(&nlru->lock);
return count;
}
+
+/* Item count of the list selected by node @nid and the cache id of @memcg. */
+unsigned long list_lru_count_one(struct list_lru *lru,
+				 int nid, struct mem_cgroup *memcg)
+{
+	int memcg_idx = memcg_cache_id(memcg);
+
+	return __list_lru_count_one(lru, nid, memcg_idx);
+}
+EXPORT_SYMBOL_GPL(list_lru_count_one);
+
+/*
+ * Total item count on node @nid: the node's own list (index -1) plus, for a
+ * memcg aware lru, every per-memcg list.
+ */
+unsigned long list_lru_count_node(struct list_lru *lru, int nid)
+{
+	long total;
+	int memcg_idx;
+
+	total = __list_lru_count_one(lru, nid, -1);
+	if (!list_lru_memcg_aware(lru))
+		return total;
+
+	for_each_memcg_cache_index(memcg_idx)
+		total += __list_lru_count_one(lru, nid, memcg_idx);
+
+	return total;
+}
EXPORT_SYMBOL_GPL(list_lru_count_node);
-unsigned long
-list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
- void *cb_arg, unsigned long *nr_to_walk)
+static unsigned long
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+ list_lru_walk_cb isolate, void *cb_arg,
+ unsigned long *nr_to_walk)
{
- struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_node *nlru = &lru->node[nid];
+ struct list_lru_one *l;
struct list_head *item, *n;
unsigned long isolated = 0;
spin_lock(&nlru->lock);
+ l = list_lru_from_memcg_idx(nlru, memcg_idx);
restart:
- list_for_each_safe(item, n, &nlru->list) {
+ list_for_each_safe(item, n, &l->list) {
enum lru_status ret;
/*
@@ -85,14 +206,11 @@ restart:
break;
--*nr_to_walk;
- ret = isolate(item, &nlru->lock, cb_arg);
+ ret = isolate(item, l, &nlru->lock, cb_arg);
switch (ret) {
case LRU_REMOVED_RETRY:
assert_spin_locked(&nlru->lock);
case LRU_REMOVED:
- if (--nlru->nr_items == 0)
- node_clear(nid, lru->active_nodes);
- WARN_ON_ONCE(nlru->nr_items < 0);
isolated++;
/*
* If the lru lock has been dropped, our list
@@ -103,7 +221,7 @@ restart:
goto restart;
break;
case LRU_ROTATE:
- list_move_tail(item, &nlru->list);
+ list_move_tail(item, &l->list);
break;
case LRU_SKIP:
break;
@@ -122,31 +240,322 @@ restart:
spin_unlock(&nlru->lock);
return isolated;
}
+
+/*
+ * Walk the list selected by node @nid and the cache id of @memcg; returns
+ * whatever __list_lru_walk_one() returns for it.
+ */
+unsigned long
+list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+		  list_lru_walk_cb isolate, void *cb_arg,
+		  unsigned long *nr_to_walk)
+{
+	int memcg_idx = memcg_cache_id(memcg);
+
+	return __list_lru_walk_one(lru, nid, memcg_idx,
+				   isolate, cb_arg, nr_to_walk);
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_one);
+
+/*
+ * Walk all lists on node @nid: first the node's own list (index -1), then,
+ * for a memcg aware lru, each per-memcg list until *@nr_to_walk reaches
+ * zero. Returns the sum of the per-list results.
+ */
+unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
+				 list_lru_walk_cb isolate, void *cb_arg,
+				 unsigned long *nr_to_walk)
+{
+	long isolated;
+	int memcg_idx;
+
+	isolated = __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
+				       nr_to_walk);
+
+	/* Budget spent, or nothing but the node list to walk. */
+	if (*nr_to_walk == 0 || !list_lru_memcg_aware(lru))
+		return isolated;
+
+	for_each_memcg_cache_index(memcg_idx) {
+		isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+						isolate, cb_arg, nr_to_walk);
+		if (*nr_to_walk == 0)
+			break;
+	}
+
+	return isolated;
+}
EXPORT_SYMBOL_GPL(list_lru_walk_node);
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
+/* Put @l into its empty state: zero items and an empty list head. */
+static void init_one_lru(struct list_lru_one *l)
+{
+	l->nr_items = 0;
+	INIT_LIST_HEAD(&l->list);
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+/* kfree() the entries of @memcg_lrus in slots [begin, end); @end is exclusive. */
+static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
+					  int begin, int end)
+{
+	int idx = begin;
+
+	while (idx < end) {
+		kfree(memcg_lrus->lru[idx]);
+		idx++;
+	}
+}
+
+/*
+ * Allocate and initialise a list_lru_one for every slot in [begin, end) of
+ * @memcg_lrus.
+ *
+ * Returns 0 on success, or -ENOMEM if an allocation fails; in that case
+ * every entry set up by this call is freed again before returning.
+ */
+static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
+				      int begin, int end)
+{
+	int i;
+
+	for (i = begin; i < end; i++) {
+		struct list_lru_one *l;
+
+		l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
+		if (!l)
+			goto fail;
+
+		init_one_lru(l);
+		memcg_lrus->lru[i] = l;
+	}
+	return 0;
+fail:
+	/*
+	 * Slots [begin, i) were populated before the failed allocation. The
+	 * destroy helper treats its end index as exclusive, so it must be
+	 * given i; passing i - 1 would leak the entry in slot i - 1.
+	 */
+	__memcg_destroy_list_lru_node(memcg_lrus, begin, i);
+	return -ENOMEM;
+}
+
+/*
+ * Give @nlru a per-memcg array with memcg_nr_cache_ids slots and populate
+ * every slot. Returns 0 on success or -ENOMEM; on failure the array is
+ * freed.
+ */
+static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+{
+	int nr_ids = memcg_nr_cache_ids;
+
+	nlru->memcg_lrus = kmalloc(nr_ids * sizeof(void *), GFP_KERNEL);
+	if (!nlru->memcg_lrus)
+		return -ENOMEM;
+
+	if (!__memcg_init_list_lru_node(nlru->memcg_lrus, 0, nr_ids))
+		return 0;
+
+	kfree(nlru->memcg_lrus);
+	return -ENOMEM;
+}
+
+/* Free all memcg_nr_cache_ids entries of @nlru's per-memcg array, then the array. */
+static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
+{
+	struct list_lru_memcg *memcg_lrus = nlru->memcg_lrus;
+
+	__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
+	kfree(memcg_lrus);
+}
+
+/*
+ * Grow @nlru's per-memcg array from @old_size to @new_size slots.
+ *
+ * A new array is allocated, its slots [old_size, new_size) are populated,
+ * the first @old_size pointers are copied over from the old array, and the
+ * new array is published under the node lock before the old array is
+ * freed. Returns 0 on success or -ENOMEM, in which case @nlru is unchanged.
+ */
+static int memcg_update_list_lru_node(struct list_lru_node *nlru,
+				      int old_size, int new_size)
+{
+	struct list_lru_memcg *old_lrus, *new_lrus;
+
+	BUG_ON(old_size > new_size);
+
+	old_lrus = nlru->memcg_lrus;
+	new_lrus = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+	if (!new_lrus)
+		return -ENOMEM;
+
+	if (__memcg_init_list_lru_node(new_lrus, old_size, new_size)) {
+		kfree(new_lrus);
+		return -ENOMEM;
+	}
+
+	memcpy(new_lrus, old_lrus, old_size * sizeof(void *));
+
+	/*
+	 * The lock guarantees that we won't race with a reader
+	 * (see list_lru_from_memcg_idx).
+	 *
+	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+	 * we have to use IRQ-safe primitives here to avoid deadlock.
+	 */
+	spin_lock_irq(&nlru->lock);
+	nlru->memcg_lrus = new_lrus;
+	spin_unlock_irq(&nlru->lock);
+
+	kfree(old_lrus);
+	return 0;
+}
+
+/* Undo a grow on @nlru by freeing the entries in slots [old_size, new_size). */
+static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
+					      int old_size, int new_size)
+{
+	struct list_lru_memcg *memcg_lrus = nlru->memcg_lrus;
+
+	/*
+	 * The array itself is not shrunk back to the old size, because an
+	 * allocation failure could not be handled here.
+	 */
+	__memcg_destroy_list_lru_node(memcg_lrus, old_size, new_size);
+}
+
+/*
+ * Set up the per-memcg arrays of every node of @lru. When @memcg_aware is
+ * false each node's array pointer is simply set to NULL. Returns 0 on
+ * success or -ENOMEM, after destroying the nodes that were already set up.
+ */
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		if (!memcg_aware) {
+			lru->node[i].memcg_lrus = NULL;
+			continue;
+		}
+		if (memcg_init_list_lru_node(&lru->node[i]))
+			goto fail;
+	}
+	return 0;
+fail:
+	/* Node i failed: unwind nodes i - 1 down to 0. */
+	while (--i >= 0)
+		memcg_destroy_list_lru_node(&lru->node[i]);
+	return -ENOMEM;
+}
+
+/* Destroy the per-memcg arrays of every node; no-op for a non memcg aware lru. */
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+	int nid;
+
+	if (list_lru_memcg_aware(lru)) {
+		for (nid = 0; nid < nr_node_ids; nid++)
+			memcg_destroy_list_lru_node(&lru->node[nid]);
+	}
+}
+
+/*
+ * Grow the per-memcg arrays of every node of @lru from @old_size to
+ * @new_size. Returns 0 on success (or for a non memcg aware lru), or
+ * -ENOMEM after cancelling the update on the nodes already grown.
+ */
+static int memcg_update_list_lru(struct list_lru *lru,
+				 int old_size, int new_size)
+{
+	int nid;
+
+	if (!list_lru_memcg_aware(lru))
+		return 0;
+
+	for (nid = 0; nid < nr_node_ids; nid++) {
+		if (memcg_update_list_lru_node(&lru->node[nid],
+					       old_size, new_size))
+			goto fail;
+	}
+	return 0;
+fail:
+	/* Node nid failed: roll back nodes nid - 1 down to 0. */
+	while (--nid >= 0)
+		memcg_cancel_update_list_lru_node(&lru->node[nid],
+						  old_size, new_size);
+	return -ENOMEM;
+}
+
+/* Cancel a grow on every node of @lru; no-op for a non memcg aware lru. */
+static void memcg_cancel_update_list_lru(struct list_lru *lru,
+					 int old_size, int new_size)
+{
+	int nid;
+
+	if (list_lru_memcg_aware(lru)) {
+		for (nid = 0; nid < nr_node_ids; nid++)
+			memcg_cancel_update_list_lru_node(&lru->node[nid],
+							  old_size, new_size);
+	}
+}
+
+/*
+ * Grow every registered list_lru from memcg_nr_cache_ids to @new_size
+ * slots, holding list_lrus_mutex throughout. On the first failure the lrus
+ * updated before the failing one are rolled back and the error is returned;
+ * otherwise returns 0.
+ */
+int memcg_update_all_list_lrus(int new_size)
+{
+	int old_size = memcg_nr_cache_ids;
+	struct list_lru *lru;
+	int ret = 0;
+
+	mutex_lock(&list_lrus_mutex);
+
+	list_for_each_entry(lru, &list_lrus, list) {
+		ret = memcg_update_list_lru(lru, old_size, new_size);
+		if (ret)
+			break;
+	}
+
+	if (ret) {
+		/* lru is the entry that failed; walk back over the earlier ones. */
+		list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+			memcg_cancel_update_list_lru(lru, old_size, new_size);
+	}
+
+	mutex_unlock(&list_lrus_mutex);
+	return ret;
+}
+
+/*
+ * On one node, splice everything from the list at @src_idx onto the list at
+ * @dst_idx and transfer the item count, leaving the source empty.
+ */
+static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
+				      int src_idx, int dst_idx)
+{
+	struct list_lru_one *from, *to;
+
+	/*
+	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+	 * we have to use IRQ-safe primitives here to avoid deadlock.
+	 */
+	spin_lock_irq(&nlru->lock);
+
+	from = list_lru_from_memcg_idx(nlru, src_idx);
+	to = list_lru_from_memcg_idx(nlru, dst_idx);
+
+	list_splice_init(&from->list, &to->list);
+	to->nr_items += from->nr_items;
+	from->nr_items = 0;
+
+	spin_unlock_irq(&nlru->lock);
+}
+
+/* Drain @src_idx into @dst_idx on every node; no-op for a non memcg aware lru. */
+static void memcg_drain_list_lru(struct list_lru *lru,
+				 int src_idx, int dst_idx)
+{
+	int nid;
+
+	if (list_lru_memcg_aware(lru)) {
+		for (nid = 0; nid < nr_node_ids; nid++)
+			memcg_drain_list_lru_node(&lru->node[nid],
+						  src_idx, dst_idx);
+	}
+}
+
+/* Drain @src_idx into @dst_idx on every registered list_lru, under list_lrus_mutex. */
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+{
+	struct list_lru *lru;
+
+	mutex_lock(&list_lrus_mutex);
+
+	list_for_each_entry(lru, &list_lrus, list) {
+		memcg_drain_list_lru(lru, src_idx, dst_idx);
+	}
+
+	mutex_unlock(&list_lrus_mutex);
+}
+#else
+/* !CONFIG_MEMCG_KMEM: nothing to set up, always succeeds. */
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+	return 0;
+}
+
+/* !CONFIG_MEMCG_KMEM: nothing to tear down. */
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+ struct lock_class_key *key)
{
int i;
size_t size = sizeof(*lru->node) * nr_node_ids;
+ int err = -ENOMEM;
+
+ memcg_get_cache_ids();
lru->node = kzalloc(size, GFP_KERNEL);
if (!lru->node)
- return -ENOMEM;
+ goto out;
- nodes_clear(lru->active_nodes);
for (i = 0; i < nr_node_ids; i++) {
spin_lock_init(&lru->node[i].lock);
if (key)
lockdep_set_class(&lru->node[i].lock, key);
- INIT_LIST_HEAD(&lru->node[i].list);
- lru->node[i].nr_items = 0;
+ init_one_lru(&lru->node[i].lru);
}
- return 0;
+
+ err = memcg_init_list_lru(lru, memcg_aware);
+ if (err) {
+ kfree(lru->node);
+ goto out;
+ }
+
+ list_lru_register(lru);
+out:
+ memcg_put_cache_ids();
+ return err;
}
-EXPORT_SYMBOL_GPL(list_lru_init_key);
+EXPORT_SYMBOL_GPL(__list_lru_init);
void list_lru_destroy(struct list_lru *lru)
{
+ /* Already destroyed or not yet initialized? */
+ if (!lru->node)
+ return;
+
+ memcg_get_cache_ids();
+
+ list_lru_unregister(lru);
+
+ memcg_destroy_list_lru(lru);
kfree(lru->node);
+ lru->node = NULL;
+
+ memcg_put_cache_ids();
}
EXPORT_SYMBOL_GPL(list_lru_destroy);
diff --git a/mm/madvise.c b/mm/madvise.c
index a271adc..d551475 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -155,7 +155,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
pte = *(orig_pte + ((index - start) / PAGE_SIZE));
pte_unmap_unlock(orig_pte, ptl);
- if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+ if (pte_present(pte) || pte_none(pte))
continue;
entry = pte_to_swp_entry(pte);
if (unlikely(non_swap_entry(entry)))
@@ -222,21 +222,24 @@ static long madvise_willneed(struct vm_area_struct *vma,
struct file *file = vma->vm_file;
#ifdef CONFIG_SWAP
- if (!file || mapping_cap_swap_backed(file->f_mapping)) {
+ if (!file) {
*prev = vma;
- if (!file)
- force_swapin_readahead(vma, start, end);
- else
- force_shm_swapin_readahead(vma, start, end,
- file->f_mapping);
+ force_swapin_readahead(vma, start, end);
return 0;
}
-#endif
+ if (shmem_mapping(file->f_mapping)) {
+ *prev = vma;
+ force_shm_swapin_readahead(vma, start, end,
+ file->f_mapping);
+ return 0;
+ }
+#else
if (!file)
return -EBADF;
+#endif
- if (file->f_mapping->a_ops->get_xip_mem) {
+ if (IS_DAX(file_inode(file))) {
/* no bad return value, but ignore advice */
return 0;
}
@@ -278,14 +281,7 @@ static long madvise_dontneed(struct vm_area_struct *vma,
if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
return -EINVAL;
- if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
- struct zap_details details = {
- .nonlinear_vma = vma,
- .last_index = ULONG_MAX,
- };
- zap_page_range(vma, start, end - start, &details);
- } else
- zap_page_range(vma, start, end - start, NULL);
+ zap_page_range(vma, start, end - start, NULL);
return 0;
}
@@ -303,7 +299,7 @@ static long madvise_remove(struct vm_area_struct *vma,
*prev = NULL; /* tell sys_madvise we drop mmap_sem */
- if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+ if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
return -EINVAL;
f = vma->vm_file;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2f6893c..d18d3a6 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -72,22 +72,13 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
#define MEM_CGROUP_RECLAIM_RETRIES 5
static struct mem_cgroup *root_mem_cgroup __read_mostly;
+/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;
-
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
#else
#define do_swap_account 0
#endif
-
static const char * const mem_cgroup_stat_names[] = {
"cache",
"rss",
@@ -97,14 +88,6 @@ static const char * const mem_cgroup_stat_names[] = {
"swap",
};
-enum mem_cgroup_events_index {
- MEM_CGROUP_EVENTS_PGPGIN, /* # of pages paged in */
- MEM_CGROUP_EVENTS_PGPGOUT, /* # of pages paged out */
- MEM_CGROUP_EVENTS_PGFAULT, /* # of page-faults */
- MEM_CGROUP_EVENTS_PGMAJFAULT, /* # of major page-faults */
- MEM_CGROUP_EVENTS_NSTATS,
-};
-
static const char * const mem_cgroup_events_names[] = {
"pgpgin",
"pgpgout",
@@ -138,7 +121,7 @@ enum mem_cgroup_events_target {
struct mem_cgroup_stat_cpu {
long count[MEM_CGROUP_STAT_NSTATS];
- unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
+ unsigned long events[MEMCG_NR_EVENTS];
unsigned long nr_page_events;
unsigned long targets[MEM_CGROUP_NTARGETS];
};
@@ -284,6 +267,10 @@ struct mem_cgroup {
struct page_counter memsw;
struct page_counter kmem;
+ /* Normal memory consumption range */
+ unsigned long low;
+ unsigned long high;
+
unsigned long soft_limit;
/* vmpressure notifications */
@@ -325,9 +312,11 @@ struct mem_cgroup {
/*
* set > 0 if pages under this cgroup are moving to other cgroup.
*/
- atomic_t moving_account;
+ atomic_t moving_account;
/* taken only while moving_account > 0 */
- spinlock_t move_lock;
+ spinlock_t move_lock;
+ struct task_struct *move_lock_task;
+ unsigned long move_lock_flags;
/*
* percpu counter.
*/
@@ -343,11 +332,10 @@ struct mem_cgroup {
struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
- /* analogous to slab_common's slab_caches list, but per-memcg;
- * protected by memcg_slab_mutex */
- struct list_head memcg_slab_caches;
- /* Index in the kmem_cache->memcg_params->memcg_caches array */
+ /* Index in the kmem_cache->memcg_params.memcg_caches array */
int kmemcg_id;
+ bool kmem_acct_activated;
+ bool kmem_acct_active;
#endif
int last_scanned_node;
@@ -366,29 +354,26 @@ struct mem_cgroup {
};
#ifdef CONFIG_MEMCG_KMEM
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
- return memcg->kmemcg_id >= 0;
+ return memcg->kmem_acct_active;
}
#endif
/* Stuffs for move charges at task migration. */
/*
- * Types of charges to be moved. "move_charge_at_immitgrate" and
- * "immigrate_flags" are treated as a left-shifted bitmap of these types.
+ * Types of charges to be moved.
*/
-enum move_type {
- MOVE_CHARGE_TYPE_ANON, /* private anonymous page and swap of it */
- MOVE_CHARGE_TYPE_FILE, /* file page(including tmpfs) and swap of it */
- NR_MOVE_TYPE,
-};
+#define MOVE_ANON 0x1U
+#define MOVE_FILE 0x2U
+#define MOVE_MASK (MOVE_ANON | MOVE_FILE)
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
spinlock_t lock; /* for from, to */
struct mem_cgroup *from;
struct mem_cgroup *to;
- unsigned long immigrate_flags;
+ unsigned long flags;
unsigned long precharge;
unsigned long moved_charge;
unsigned long moved_swap;
@@ -399,16 +384,6 @@ static struct move_charge_struct {
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
-static bool move_anon(void)
-{
- return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
-}
-
-static bool move_file(void)
-{
- return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
-}
-
/*
* Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
* limit reclaim to prevent infinite loops, if they ever occur.
@@ -544,33 +519,35 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(tcp_proto_cgroup);
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
- if (!memcg_proto_activated(&memcg->tcp_mem))
- return;
- static_key_slow_dec(&memcg_socket_limit_enabled);
-}
-#else
-static void disarm_sock_keys(struct mem_cgroup *memcg)
-{
-}
#endif
#ifdef CONFIG_MEMCG_KMEM
/*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
* memcgs, and none but the 200th is kmem-limited, we'd have to have a
* 200 entry array for that.
*
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size. It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
*/
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+/* Take memcg_cache_ids_sem for reading. */
+void memcg_get_cache_ids(void)
+{
+	down_read(&memcg_cache_ids_sem);
+}
+
+/* Release the read side of memcg_cache_ids_sem. */
+void memcg_put_cache_ids(void)
+{
+	up_read(&memcg_cache_ids_sem);
+}
/*
* MIN_SIZE is different than 1, because we would like to avoid going through
@@ -596,32 +573,8 @@ int memcg_limited_groups_array_size;
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-static void memcg_free_cache_id(int id);
-
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
- if (memcg_kmem_is_active(memcg)) {
- static_key_slow_dec(&memcg_kmem_enabled_key);
- memcg_free_cache_id(memcg->kmemcg_id);
- }
- /*
- * This check can't live in kmem destruction function,
- * since the charges will outlive the cgroup
- */
- WARN_ON(page_counter_read(&memcg->kmem));
-}
-#else
-static void disarm_kmem_keys(struct mem_cgroup *memcg)
-{
-}
#endif /* CONFIG_MEMCG_KMEM */
-static void disarm_static_keys(struct mem_cgroup *memcg)
-{
- disarm_sock_keys(memcg);
- disarm_kmem_keys(memcg);
-}
-
static struct mem_cgroup_per_zone *
mem_cgroup_zone_zoneinfo(struct mem_cgroup *memcg, struct zone *zone)
{
@@ -1368,6 +1321,20 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
return inactive * inactive_ratio < active;
}
+bool mem_cgroup_lruvec_online(struct lruvec *lruvec)
+{
+ struct mem_cgroup_per_zone *mz;
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return true;
+
+ mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
+ memcg = mz->memcg;
+
+ return !!(memcg->css.flags & CSS_ONLINE);
+}
+
#define mem_cgroup_from_counter(counter, member) \
container_of(counter, struct mem_cgroup, member)
@@ -1560,7 +1527,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
* quickly exit and free its memory.
*/
if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- set_thread_flag(TIF_MEMDIE);
+ mark_tsk_oom_victim(current);
return;
}
@@ -1934,7 +1901,7 @@ bool mem_cgroup_oom_synchronize(bool handle)
if (!memcg)
return false;
- if (!handle)
+ if (!handle || oom_killer_disabled)
goto cleanup;
owait.memcg = memcg;
@@ -1980,34 +1947,33 @@ cleanup:
/**
* mem_cgroup_begin_page_stat - begin a page state statistics transaction
* @page: page that is going to change accounted state
- * @locked: &memcg->move_lock slowpath was taken
- * @flags: IRQ-state flags for &memcg->move_lock
*
* This function must mark the beginning of an accounted page state
* change to prevent double accounting when the page is concurrently
* being moved to another memcg:
*
- * memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ * memcg = mem_cgroup_begin_page_stat(page);
* if (TestClearPageState(page))
* mem_cgroup_update_page_stat(memcg, state, -1);
- * mem_cgroup_end_page_stat(memcg, locked, flags);
- *
- * The RCU lock is held throughout the transaction. The fast path can
- * get away without acquiring the memcg->move_lock (@locked is false)
- * because page moving starts with an RCU grace period.
- *
- * The RCU lock also protects the memcg from being freed when the page
- * state that is going to change is the only thing preventing the page
- * from being uncharged. E.g. end-writeback clearing PageWriteback(),
- * which allows migration to go ahead and uncharge the page before the
- * account transaction might be complete.
+ * mem_cgroup_end_page_stat(memcg);
*/
-struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
- bool *locked,
- unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page)
{
struct mem_cgroup *memcg;
+ unsigned long flags;
+ /*
+ * The RCU lock is held throughout the transaction. The fast
+ * path can get away without acquiring the memcg->move_lock
+ * because page moving starts with an RCU grace period.
+ *
+ * The RCU lock also protects the memcg from being freed when
+ * the page state that is going to change is the only thing
+ * preventing the page from being uncharged.
+ * E.g. end-writeback clearing PageWriteback(), which allows
+ * migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
+ */
rcu_read_lock();
if (mem_cgroup_disabled())
@@ -2017,16 +1983,22 @@ again:
if (unlikely(!memcg))
return NULL;
- *locked = false;
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
- spin_lock_irqsave(&memcg->move_lock, *flags);
+ spin_lock_irqsave(&memcg->move_lock, flags);
if (memcg != page->mem_cgroup) {
- spin_unlock_irqrestore(&memcg->move_lock, *flags);
+ spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
- *locked = true;
+
+ /*
+ * When charge migration first begins, we can have locked and
+ * unlocked page stat updates happening concurrently. Track
+ * the task who has the lock for mem_cgroup_end_page_stat().
+ */
+ memcg->move_lock_task = current;
+ memcg->move_lock_flags = flags;
return memcg;
}
@@ -2034,14 +2006,17 @@ again:
/**
* mem_cgroup_end_page_stat - finish a page state statistics transaction
* @memcg: the memcg that was accounted against
- * @locked: value received from mem_cgroup_begin_page_stat()
- * @flags: value received from mem_cgroup_begin_page_stat()
*/
-void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool *locked,
- unsigned long *flags)
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg)
{
- if (memcg && *locked)
- spin_unlock_irqrestore(&memcg->move_lock, *flags);
+ if (memcg && memcg->move_lock_task == current) {
+ unsigned long flags = memcg->move_lock_flags;
+
+ memcg->move_lock_task = NULL;
+ memcg->move_lock_flags = 0;
+
+ spin_unlock_irqrestore(&memcg->move_lock, flags);
+ }
rcu_read_unlock();
}
@@ -2134,17 +2109,6 @@ static void drain_local_stock(struct work_struct *dummy)
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
}
-static void __init memcg_stock_init(void)
-{
- int cpu;
-
- for_each_possible_cpu(cpu) {
- struct memcg_stock_pcp *stock =
- &per_cpu(memcg_stock, cpu);
- INIT_WORK(&stock->work, drain_local_stock);
- }
-}
-
/*
* Cache charges(val) to local per_cpu area.
* This will be consumed by consume_stock() function, later.
@@ -2294,6 +2258,8 @@ retry:
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
+ mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
+
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
@@ -2335,6 +2301,8 @@ retry:
if (fatal_signal_pending(current))
goto bypass;
+ mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
+
mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(nr_pages));
nomem:
if (!(gfp_mask & __GFP_NOFAIL))
@@ -2346,6 +2314,16 @@ done_restock:
css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
+ /*
+ * If the hierarchy is above the normal consumption range,
+ * make the charging task trim their excess contribution.
+ */
+ do {
+ if (page_counter_read(&memcg->memory) <= memcg->high)
+ continue;
+ mem_cgroup_events(memcg, MEMCG_HIGH, 1);
+ try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+ } while ((memcg = parent_mem_cgroup(memcg)));
done:
return ret;
}
@@ -2476,27 +2454,8 @@ static void commit_charge(struct page *page, struct mem_cgroup *memcg,
}
#ifdef CONFIG_MEMCG_KMEM
-/*
- * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or
- * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists.
- */
-static DEFINE_MUTEX(memcg_slab_mutex);
-
-/*
- * This is a bit cumbersome, but it is rarely used and avoids a backpointer
- * in the memcg_cache_params struct.
- */
-static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
-{
- struct kmem_cache *cachep;
-
- VM_BUG_ON(p->is_root_cache);
- cachep = p->root_cache;
- return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg));
-}
-
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
- unsigned long nr_pages)
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+ unsigned long nr_pages)
{
struct page_counter *counter;
int ret = 0;
@@ -2533,8 +2492,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
return ret;
}
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
- unsigned long nr_pages)
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages)
{
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_swap_account)
@@ -2560,18 +2518,19 @@ static int memcg_alloc_cache_id(void)
int id, size;
int err;
- id = ida_simple_get(&kmem_limited_groups,
+ id = ida_simple_get(&memcg_cache_ida,
0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
if (id < 0)
return id;
- if (id < memcg_limited_groups_array_size)
+ if (id < memcg_nr_cache_ids)
return id;
/*
* There's no space for the new id in memcg_caches arrays,
* so we have to grow them.
*/
+ down_write(&memcg_cache_ids_sem);
size = 2 * (id + 1);
if (size < MEMCG_CACHES_MIN_SIZE)
@@ -2579,12 +2538,16 @@ static int memcg_alloc_cache_id(void)
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- mutex_lock(&memcg_slab_mutex);
err = memcg_update_all_caches(size);
- mutex_unlock(&memcg_slab_mutex);
+ if (!err)
+ err = memcg_update_all_list_lrus(size);
+ if (!err)
+ memcg_nr_cache_ids = size;
+
+ up_write(&memcg_cache_ids_sem);
if (err) {
- ida_simple_remove(&kmem_limited_groups, id);
+ ida_simple_remove(&memcg_cache_ida, id);
return err;
}
return id;
@@ -2592,136 +2555,23 @@ static int memcg_alloc_cache_id(void)
static void memcg_free_cache_id(int id)
{
- ida_simple_remove(&kmem_limited_groups, id);
+ ida_simple_remove(&memcg_cache_ida, id);
}
-/*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
- */
-void memcg_update_array_size(int num)
-{
- memcg_limited_groups_array_size = num;
-}
-
-static void memcg_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache)
-{
- static char memcg_name_buf[NAME_MAX + 1]; /* protected by
- memcg_slab_mutex */
- struct kmem_cache *cachep;
- int id;
-
- lockdep_assert_held(&memcg_slab_mutex);
-
- id = memcg_cache_id(memcg);
-
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can try to
- * create the same cache, but only one of them may succeed.
- */
- if (cache_from_memcg_idx(root_cache, id))
- return;
-
- cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1);
- cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf);
- /*
- * If we could not create a memcg cache, do not complain, because
- * that's not critical at all as we can always proceed with the root
- * cache.
- */
- if (!cachep)
- return;
-
- list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-
- /*
- * Since readers won't lock (see cache_from_memcg_idx()), we need a
- * barrier here to ensure nobody will see the kmem_cache partially
- * initialized.
- */
- smp_wmb();
-
- BUG_ON(root_cache->memcg_params->memcg_caches[id]);
- root_cache->memcg_params->memcg_caches[id] = cachep;
-}
-
-static void memcg_unregister_cache(struct kmem_cache *cachep)
-{
- struct kmem_cache *root_cache;
- struct mem_cgroup *memcg;
- int id;
-
- lockdep_assert_held(&memcg_slab_mutex);
-
- BUG_ON(is_root_cache(cachep));
-
- root_cache = cachep->memcg_params->root_cache;
- memcg = cachep->memcg_params->memcg;
- id = memcg_cache_id(memcg);
-
- BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep);
- root_cache->memcg_params->memcg_caches[id] = NULL;
-
- list_del(&cachep->memcg_params->list);
-
- kmem_cache_destroy(cachep);
-}
-
-int __memcg_cleanup_cache_params(struct kmem_cache *s)
-{
- struct kmem_cache *c;
- int i, failed = 0;
-
- mutex_lock(&memcg_slab_mutex);
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
-
- memcg_unregister_cache(c);
-
- if (cache_from_memcg_idx(s, i))
- failed++;
- }
- mutex_unlock(&memcg_slab_mutex);
- return failed;
-}
-
-static void memcg_unregister_all_caches(struct mem_cgroup *memcg)
-{
- struct kmem_cache *cachep;
- struct memcg_cache_params *params, *tmp;
-
- if (!memcg_kmem_is_active(memcg))
- return;
-
- mutex_lock(&memcg_slab_mutex);
- list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) {
- cachep = memcg_params_to_cache(params);
- memcg_unregister_cache(cachep);
- }
- mutex_unlock(&memcg_slab_mutex);
-}
-
-struct memcg_register_cache_work {
+struct memcg_kmem_cache_create_work {
struct mem_cgroup *memcg;
struct kmem_cache *cachep;
struct work_struct work;
};
-static void memcg_register_cache_func(struct work_struct *w)
+static void memcg_kmem_cache_create_func(struct work_struct *w)
{
- struct memcg_register_cache_work *cw =
- container_of(w, struct memcg_register_cache_work, work);
+ struct memcg_kmem_cache_create_work *cw =
+ container_of(w, struct memcg_kmem_cache_create_work, work);
struct mem_cgroup *memcg = cw->memcg;
struct kmem_cache *cachep = cw->cachep;
- mutex_lock(&memcg_slab_mutex);
- memcg_register_cache(memcg, cachep);
- mutex_unlock(&memcg_slab_mutex);
+ memcg_create_kmem_cache(memcg, cachep);
css_put(&memcg->css);
kfree(cw);
@@ -2730,10 +2580,10 @@ static void memcg_register_cache_func(struct work_struct *w)
/*
* Enqueue the creation of a per-memcg kmem_cache.
*/
-static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
- struct memcg_register_cache_work *cw;
+ struct memcg_kmem_cache_create_work *cw;
cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
if (!cw)
@@ -2743,18 +2593,18 @@ static void __memcg_schedule_register_cache(struct mem_cgroup *memcg,
cw->memcg = memcg;
cw->cachep = cachep;
+ INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
- INIT_WORK(&cw->work, memcg_register_cache_func);
schedule_work(&cw->work);
}
-static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+ struct kmem_cache *cachep)
{
/*
* We need to stop accounting when we kmalloc, because if the
* corresponding kmalloc cache is not yet created, the first allocation
- * in __memcg_schedule_register_cache will recurse.
+ * in __memcg_schedule_kmem_cache_create will recurse.
*
* However, it is better to enclose the whole function. Depending on
* the debugging options enabled, INIT_WORK(), for instance, can
@@ -2763,24 +2613,10 @@ static void memcg_schedule_register_cache(struct mem_cgroup *memcg,
* the safest choice is to do it like this, wrapping the whole function.
*/
current->memcg_kmem_skip_account = 1;
- __memcg_schedule_register_cache(memcg, cachep);
+ __memcg_schedule_kmem_cache_create(memcg, cachep);
current->memcg_kmem_skip_account = 0;
}
-int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order)
-{
- unsigned int nr_pages = 1 << order;
-
- return memcg_charge_kmem(cachep->memcg_params->memcg, gfp, nr_pages);
-}
-
-void __memcg_uncharge_slab(struct kmem_cache *cachep, int order)
-{
- unsigned int nr_pages = 1 << order;
-
- memcg_uncharge_kmem(cachep->memcg_params->memcg, nr_pages);
-}
-
/*
* Return the kmem_cache we're supposed to use for a slab allocation.
* We try to use the current memcg's version of the cache.
@@ -2798,18 +2634,19 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
{
struct mem_cgroup *memcg;
struct kmem_cache *memcg_cachep;
+ int kmemcg_id;
- VM_BUG_ON(!cachep->memcg_params);
- VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+ VM_BUG_ON(!is_root_cache(cachep));
if (current->memcg_kmem_skip_account)
return cachep;
memcg = get_mem_cgroup_from_mm(current->mm);
- if (!memcg_kmem_is_active(memcg))
+ kmemcg_id = ACCESS_ONCE(memcg->kmemcg_id);
+ if (kmemcg_id < 0)
goto out;
- memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+ memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
if (likely(memcg_cachep))
return memcg_cachep;
@@ -2825,7 +2662,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep)
* could happen with the slab_mutex held. So it's better to
* defer everything.
*/
- memcg_schedule_register_cache(memcg, cachep);
+ memcg_schedule_kmem_cache_create(memcg, cachep);
out:
css_put(&memcg->css);
return cachep;
@@ -2834,7 +2671,7 @@ out:
void __memcg_kmem_put_cache(struct kmem_cache *cachep)
{
if (!is_root_cache(cachep))
- css_put(&cachep->memcg_params->memcg->css);
+ css_put(&cachep->memcg_params.memcg->css);
}
/*
@@ -2899,6 +2736,24 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
memcg_uncharge_kmem(memcg, 1 << order);
page->mem_cgroup = NULL;
}
+
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
+{
+ struct mem_cgroup *memcg = NULL;
+ struct kmem_cache *cachep;
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+ if (PageSlab(page)) {
+ cachep = page->slab_cache;
+ if (!is_root_cache(cachep))
+ memcg = cachep->memcg_params.memcg;
+ } else
+ /* page allocated by alloc_kmem_pages */
+ memcg = page->mem_cgroup;
+
+ return memcg;
+}
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3433,8 +3288,9 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
int err = 0;
int memcg_id;
- if (memcg_kmem_is_active(memcg))
- return 0;
+ BUG_ON(memcg->kmemcg_id >= 0);
+ BUG_ON(memcg->kmem_acct_activated);
+ BUG_ON(memcg->kmem_acct_active);
/*
* For simplicity, we won't allow this to be disabled. It also can't
@@ -3477,6 +3333,8 @@ static int memcg_activate_kmem(struct mem_cgroup *memcg,
* patched.
*/
memcg->kmemcg_id = memcg_id;
+ memcg->kmem_acct_activated = true;
+ memcg->kmem_acct_active = true;
out:
return err;
}
@@ -3533,7 +3391,7 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
int ret;
buf = strstrip(buf);
- ret = page_counter_memparse(buf, &nr_pages);
+ ret = page_counter_memparse(buf, "-1", &nr_pages);
if (ret)
return ret;
@@ -3609,7 +3467,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
{
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- if (val >= (1 << NR_MOVE_TYPE))
+ if (val & ~MOVE_MASK)
return -EINVAL;
/*
@@ -3687,6 +3545,10 @@ static int memcg_stat_show(struct seq_file *m, void *v)
struct mem_cgroup *mi;
unsigned int i;
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
+ MEM_CGROUP_STAT_NSTATS);
+ BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
+ MEM_CGROUP_EVENTS_NSTATS);
BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
@@ -3901,7 +3763,7 @@ static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
unsigned long usage;
int i, size, ret;
- ret = page_counter_memparse(args, &threshold);
+ ret = page_counter_memparse(args, "-1", &threshold);
if (ret)
return ret;
@@ -4152,9 +4014,59 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return mem_cgroup_sockets_init(memcg, ss);
}
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+ struct cgroup_subsys_state *css;
+ struct mem_cgroup *parent, *child;
+ int kmemcg_id;
+
+ if (!memcg->kmem_acct_active)
+ return;
+
+ /*
+ * Clear the 'active' flag before clearing memcg_caches arrays entries.
+ * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+ * guarantees no cache will be created for this cgroup after we are
+ * done (see memcg_create_kmem_cache()).
+ */
+ memcg->kmem_acct_active = false;
+
+ memcg_deactivate_kmem_caches(memcg);
+
+ kmemcg_id = memcg->kmemcg_id;
+ BUG_ON(kmemcg_id < 0);
+
+ parent = parent_mem_cgroup(memcg);
+ if (!parent)
+ parent = root_mem_cgroup;
+
+ /*
+ * Change kmemcg_id of this cgroup and all its descendants to the
+ * parent's id, and then move all entries from this cgroup's list_lrus
+ * to ones of the parent. After we have finished, all list_lrus
+ * corresponding to this cgroup are guaranteed to remain empty. The
+ * ordering is imposed by list_lru_node->lock taken by
+ * memcg_drain_all_list_lrus().
+ */
+ css_for_each_descendant_pre(css, &memcg->css) {
+ child = mem_cgroup_from_css(css);
+ BUG_ON(child->kmemcg_id != kmemcg_id);
+ child->kmemcg_id = parent->kmemcg_id;
+ if (!memcg->use_hierarchy)
+ break;
+ }
+ memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+ memcg_free_cache_id(kmemcg_id);
+}
+
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
- memcg_unregister_all_caches(memcg);
+ if (memcg->kmem_acct_activated) {
+ memcg_destroy_kmem_caches(memcg);
+ static_key_slow_dec(&memcg_kmem_enabled_key);
+ WARN_ON(page_counter_read(&memcg->kmem));
+ }
mem_cgroup_sockets_destroy(memcg);
}
#else
@@ -4163,6 +4075,10 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return 0;
}
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+}
+
static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
}
@@ -4391,7 +4307,7 @@ out_kfree:
return ret;
}
-static struct cftype mem_cgroup_files[] = {
+static struct cftype mem_cgroup_legacy_files[] = {
{
.name = "usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
@@ -4502,34 +4418,6 @@ static struct cftype mem_cgroup_files[] = {
{ }, /* terminate */
};
-#ifdef CONFIG_MEMCG_SWAP
-static struct cftype memsw_cgroup_files[] = {
- {
- .name = "memsw.usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.max_usage_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.limit_in_bytes",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
- .write = mem_cgroup_write,
- .read_u64 = mem_cgroup_read_u64,
- },
- {
- .name = "memsw.failcnt",
- .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
- .write = mem_cgroup_reset,
- .read_u64 = mem_cgroup_read_u64,
- },
- { }, /* terminate */
-};
-#endif
static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
struct mem_cgroup_per_node *pn;
@@ -4609,8 +4497,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
free_mem_cgroup_per_zone_info(memcg, node);
free_percpu(memcg->stat);
-
- disarm_static_keys(memcg);
kfree(memcg);
}
@@ -4625,29 +4511,6 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
}
EXPORT_SYMBOL(parent_mem_cgroup);
-static void __init mem_cgroup_soft_limit_tree_init(void)
-{
- struct mem_cgroup_tree_per_node *rtpn;
- struct mem_cgroup_tree_per_zone *rtpz;
- int tmp, node, zone;
-
- for_each_node(node) {
- tmp = node;
- if (!node_state(node, N_NORMAL_MEMORY))
- tmp = -1;
- rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
- BUG_ON(!rtpn);
-
- soft_limit_tree.rb_tree_per_node[node] = rtpn;
-
- for (zone = 0; zone < MAX_NR_ZONES; zone++) {
- rtpz = &rtpn->rb_tree_per_zone[zone];
- rtpz->rb_root = RB_ROOT;
- spin_lock_init(&rtpz->lock);
- }
- }
-}
-
static struct cgroup_subsys_state * __ref
mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
{
@@ -4667,6 +4530,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent_css == NULL) {
root_mem_cgroup = memcg;
page_counter_init(&memcg->memory, NULL);
+ memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
@@ -4682,7 +4546,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&memcg->event_list_lock);
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
- INIT_LIST_HEAD(&memcg->memcg_slab_caches);
#endif
return &memcg->css;
@@ -4713,6 +4576,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent->use_hierarchy) {
page_counter_init(&memcg->memory, &parent->memory);
+ memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4723,6 +4587,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
*/
} else {
page_counter_init(&memcg->memory, NULL);
+ memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
@@ -4768,6 +4633,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
spin_unlock(&memcg->event_list_lock);
vmpressure_cleanup(&memcg->vmpressure);
+
+ memcg_deactivate_kmem(memcg);
}
static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -4798,6 +4665,8 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+ memcg->low = 0;
+ memcg->high = PAGE_COUNTER_MAX;
memcg->soft_limit = PAGE_COUNTER_MAX;
}
@@ -4874,12 +4743,12 @@ static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
if (!page || !page_mapped(page))
return NULL;
if (PageAnon(page)) {
- /* we don't move shared anon */
- if (!move_anon())
+ if (!(mc.flags & MOVE_ANON))
return NULL;
- } else if (!move_file())
- /* we ignore mapcount for file pages */
- return NULL;
+ } else {
+ if (!(mc.flags & MOVE_FILE))
+ return NULL;
+ }
if (!get_page_unless_zero(page))
return NULL;
@@ -4893,7 +4762,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
struct page *page = NULL;
swp_entry_t ent = pte_to_swp_entry(ptent);
- if (!move_anon() || non_swap_entry(ent))
+ if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
return NULL;
/*
* Because lookup_swap_cache() updates some statistics counter,
@@ -4922,14 +4791,11 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
if (!vma->vm_file) /* anonymous vma */
return NULL;
- if (!move_file())
+ if (!(mc.flags & MOVE_FILE))
return NULL;
mapping = vma->vm_file->f_mapping;
- if (pte_none(ptent))
- pgoff = linear_page_index(vma, addr);
- else /* pte_file(ptent) is true */
- pgoff = pte_to_pgoff(ptent);
+ pgoff = linear_page_index(vma, addr);
/* page is moved even if it's not RSS of this task(page-faulted). */
#ifdef CONFIG_SWAP
@@ -4961,7 +4827,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
page = mc_handle_present_pte(vma, addr, ptent);
else if (is_swap_pte(ptent))
page = mc_handle_swap_pte(vma, addr, ptent, &ent);
- else if (pte_none(ptent) || pte_file(ptent))
+ else if (pte_none(ptent))
page = mc_handle_file_pte(vma, addr, ptent, &ent);
if (!page && !ent.val)
@@ -5004,7 +4870,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
page = pmd_page(pmd);
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
- if (!move_anon())
+ if (!(mc.flags & MOVE_ANON))
return ret;
if (page->mem_cgroup == mc.from) {
ret = MC_TARGET_PAGE;
@@ -5027,7 +4893,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
@@ -5053,20 +4919,13 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_count_precharge_walk = {
+ .pmd_entry = mem_cgroup_count_precharge_pte_range,
+ .mm = mm,
+ };
down_read(&mm->mmap_sem);
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- struct mm_walk mem_cgroup_count_precharge_walk = {
- .pmd_entry = mem_cgroup_count_precharge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_count_precharge_walk);
- }
+ walk_page_range(0, ~0UL, &mem_cgroup_count_precharge_walk);
up_read(&mm->mmap_sem);
precharge = mc.precharge;
@@ -5146,15 +5005,15 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
struct task_struct *p = cgroup_taskset_first(tset);
int ret = 0;
struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- unsigned long move_charge_at_immigrate;
+ unsigned long move_flags;
/*
* We are now commited to this value whatever it is. Changes in this
* tunable will only affect upcoming migrations, not the current one.
* So we need to save it, and keep it going.
*/
- move_charge_at_immigrate = memcg->move_charge_at_immigrate;
- if (move_charge_at_immigrate) {
+ move_flags = ACCESS_ONCE(memcg->move_charge_at_immigrate);
+ if (move_flags) {
struct mm_struct *mm;
struct mem_cgroup *from = mem_cgroup_from_task(p);
@@ -5174,7 +5033,7 @@ static int mem_cgroup_can_attach(struct cgroup_subsys_state *css,
spin_lock(&mc.lock);
mc.from = from;
mc.to = memcg;
- mc.immigrate_flags = move_charge_at_immigrate;
+ mc.flags = move_flags;
spin_unlock(&mc.lock);
/* We set mc.moving_task later */
@@ -5199,7 +5058,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
struct mm_walk *walk)
{
int ret = 0;
- struct vm_area_struct *vma = walk->private;
+ struct vm_area_struct *vma = walk->vma;
pte_t *pte;
spinlock_t *ptl;
enum mc_target_type target_type;
@@ -5295,7 +5154,10 @@ put: /* get_mctgt_type() gets the page */
static void mem_cgroup_move_charge(struct mm_struct *mm)
{
- struct vm_area_struct *vma;
+ struct mm_walk mem_cgroup_move_charge_walk = {
+ .pmd_entry = mem_cgroup_move_charge_pte_range,
+ .mm = mm,
+ };
lru_add_drain_all();
/*
@@ -5318,24 +5180,11 @@ retry:
cond_resched();
goto retry;
}
- for (vma = mm->mmap; vma; vma = vma->vm_next) {
- int ret;
- struct mm_walk mem_cgroup_move_charge_walk = {
- .pmd_entry = mem_cgroup_move_charge_pte_range,
- .mm = mm,
- .private = vma,
- };
- if (is_vm_hugetlb_page(vma))
- continue;
- ret = walk_page_range(vma->vm_start, vma->vm_end,
- &mem_cgroup_move_charge_walk);
- if (ret)
- /*
- * means we have consumed all precharges and failed in
- * doing additional charge. Just abandon here.
- */
- break;
- }
+ /*
+ * When we have consumed all precharges and failed in doing
+ * additional charge, the page walk just aborts.
+ */
+ walk_page_range(0, ~0UL, &mem_cgroup_move_charge_walk);
up_read(&mm->mmap_sem);
atomic_dec(&mc.from->moving_account);
}
@@ -5386,118 +5235,211 @@ static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
mem_cgroup_from_css(root_css)->use_hierarchy = true;
}
-struct cgroup_subsys memory_cgrp_subsys = {
- .css_alloc = mem_cgroup_css_alloc,
- .css_online = mem_cgroup_css_online,
- .css_offline = mem_cgroup_css_offline,
- .css_free = mem_cgroup_css_free,
- .css_reset = mem_cgroup_css_reset,
- .can_attach = mem_cgroup_can_attach,
- .cancel_attach = mem_cgroup_cancel_attach,
- .attach = mem_cgroup_move_task,
- .bind = mem_cgroup_bind,
- .legacy_cftypes = mem_cgroup_files,
- .early_init = 0,
-};
+static u64 memory_current_read(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return mem_cgroup_usage(mem_cgroup_from_css(css), false);
+}
-#ifdef CONFIG_MEMCG_SWAP
-static int __init enable_swap_account(char *s)
+static int memory_low_show(struct seq_file *m, void *v)
{
- if (!strcmp(s, "1"))
- really_do_swap_account = 1;
- else if (!strcmp(s, "0"))
- really_do_swap_account = 0;
- return 1;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long low = ACCESS_ONCE(memcg->low);
+
+ if (low == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
+
+ return 0;
}
-__setup("swapaccount=", enable_swap_account);
-static void __init memsw_file_init(void)
+static ssize_t memory_low_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
- memsw_cgroup_files));
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long low;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &low);
+ if (err)
+ return err;
+
+ memcg->low = low;
+
+ return nbytes;
}
-static void __init enable_swap_cgroup(void)
+static int memory_high_show(struct seq_file *m, void *v)
{
- if (!mem_cgroup_disabled() && really_do_swap_account) {
- do_swap_account = 1;
- memsw_file_init();
- }
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long high = ACCESS_ONCE(memcg->high);
+
+ if (high == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
+
+ return 0;
}
-#else
-static void __init enable_swap_cgroup(void)
+static ssize_t memory_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &high);
+ if (err)
+ return err;
+
+ memcg->high = high;
+
+ return nbytes;
}
-#endif
-#ifdef CONFIG_MEMCG_SWAP
-/**
- * mem_cgroup_swapout - transfer a memsw charge to swap
- * @page: page whose memsw charge to transfer
- * @entry: swap entry to move the charge to
- *
- * Transfer the memsw charge of @page to @entry.
- */
-void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+static int memory_max_show(struct seq_file *m, void *v)
{
- struct mem_cgroup *memcg;
- unsigned short oldid;
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
+ unsigned long max = ACCESS_ONCE(memcg->memory.limit);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page), page);
+ if (max == PAGE_COUNTER_MAX)
+ seq_puts(m, "infinity\n");
+ else
+ seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
- if (!do_swap_account)
- return;
+ return 0;
+}
- memcg = page->mem_cgroup;
+static ssize_t memory_max_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long max;
+ int err;
- /* Readahead page, never charged */
- if (!memcg)
- return;
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "infinity", &max);
+ if (err)
+ return err;
- oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
- VM_BUG_ON_PAGE(oldid, page);
- mem_cgroup_swap_statistics(memcg, true);
+ err = mem_cgroup_resize_limit(memcg, max);
+ if (err)
+ return err;
- page->mem_cgroup = NULL;
+ return nbytes;
+}
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memory, 1);
+static int memory_events_show(struct seq_file *m, void *v)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
- /* XXX: caller holds IRQ-safe mapping->tree_lock */
- VM_BUG_ON(!irqs_disabled());
+ seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
+ seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
+ seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
+ seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
- mem_cgroup_charge_statistics(memcg, page, -1);
- memcg_check_events(memcg, page);
+ return 0;
}
+static struct cftype memory_files[] = {
+ {
+ .name = "current",
+ .read_u64 = memory_current_read,
+ },
+ {
+ .name = "low",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_low_show,
+ .write = memory_low_write,
+ },
+ {
+ .name = "high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_high_show,
+ .write = memory_high_write,
+ },
+ {
+ .name = "max",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_max_show,
+ .write = memory_max_write,
+ },
+ {
+ .name = "events",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = memory_events_show,
+ },
+ { } /* terminate */
+};
+
+struct cgroup_subsys memory_cgrp_subsys = {
+ .css_alloc = mem_cgroup_css_alloc,
+ .css_online = mem_cgroup_css_online,
+ .css_offline = mem_cgroup_css_offline,
+ .css_free = mem_cgroup_css_free,
+ .css_reset = mem_cgroup_css_reset,
+ .can_attach = mem_cgroup_can_attach,
+ .cancel_attach = mem_cgroup_cancel_attach,
+ .attach = mem_cgroup_move_task,
+ .bind = mem_cgroup_bind,
+ .dfl_cftypes = memory_files,
+ .legacy_cftypes = mem_cgroup_legacy_files,
+ .early_init = 0,
+};
+
/**
- * mem_cgroup_uncharge_swap - uncharge a swap entry
- * @entry: swap entry to uncharge
+ * mem_cgroup_events - count memory events against a cgroup
+ * @memcg: the memory cgroup
+ * @idx: the event index
+ * @nr: the number of events to account for
+ */
+void mem_cgroup_events(struct mem_cgroup *memcg,
+ enum mem_cgroup_events_index idx,
+ unsigned int nr)
+{
+ this_cpu_add(memcg->stat->events[idx], nr);
+}
+
+/**
+ * mem_cgroup_low - check if memory consumption is below the normal range
+ * @root: the highest ancestor to consider
+ * @memcg: the memory cgroup to check
*
- * Drop the memsw charge associated with @entry.
+ * Returns %true if memory consumption of @memcg, and that of all
+ * configurable ancestors up to @root, is below the normal range.
*/
-void mem_cgroup_uncharge_swap(swp_entry_t entry)
+bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
{
- struct mem_cgroup *memcg;
- unsigned short id;
+ if (mem_cgroup_disabled())
+ return false;
- if (!do_swap_account)
- return;
+ /*
+ * The toplevel group doesn't have a configurable range, so
+ * it's never low when looked at directly, and it is not
+ * considered an ancestor when assessing the hierarchy.
+ */
- id = swap_cgroup_record(entry, 0);
- rcu_read_lock();
- memcg = mem_cgroup_lookup(id);
- if (memcg) {
- if (!mem_cgroup_is_root(memcg))
- page_counter_uncharge(&memcg->memsw, 1);
- mem_cgroup_swap_statistics(memcg, false);
- css_put(&memcg->css);
+ if (memcg == root_mem_cgroup)
+ return false;
+
+ if (page_counter_read(&memcg->memory) > memcg->low)
+ return false;
+
+ while (memcg != root) {
+ memcg = parent_mem_cgroup(memcg);
+
+ if (memcg == root_mem_cgroup)
+ break;
+
+ if (page_counter_read(&memcg->memory) > memcg->low)
+ return false;
}
- rcu_read_unlock();
+ return true;
}
-#endif
/**
* mem_cgroup_try_charge - try charging a page
@@ -5831,10 +5773,155 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
*/
static int __init mem_cgroup_init(void)
{
+ int cpu, node;
+
hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
- enable_swap_cgroup();
- mem_cgroup_soft_limit_tree_init();
- memcg_stock_init();
+
+ for_each_possible_cpu(cpu)
+ INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
+ drain_local_stock);
+
+ for_each_node(node) {
+ struct mem_cgroup_tree_per_node *rtpn;
+ int zone;
+
+ rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
+ node_online(node) ? node : NUMA_NO_NODE);
+
+ for (zone = 0; zone < MAX_NR_ZONES; zone++) {
+ struct mem_cgroup_tree_per_zone *rtpz;
+
+ rtpz = &rtpn->rb_tree_per_zone[zone];
+ rtpz->rb_root = RB_ROOT;
+ spin_lock_init(&rtpz->lock);
+ }
+ soft_limit_tree.rb_tree_per_node[node] = rtpn;
+ }
+
return 0;
}
subsys_initcall(mem_cgroup_init);
+
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short oldid;
+
+ VM_BUG_ON_PAGE(PageLRU(page), page);
+ VM_BUG_ON_PAGE(page_count(page), page);
+
+ if (!do_swap_account)
+ return;
+
+ memcg = page->mem_cgroup;
+
+ /* Readahead page, never charged */
+ if (!memcg)
+ return;
+
+ oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
+ VM_BUG_ON_PAGE(oldid, page);
+ mem_cgroup_swap_statistics(memcg, true);
+
+ page->mem_cgroup = NULL;
+
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memory, 1);
+
+ /* XXX: caller holds IRQ-safe mapping->tree_lock */
+ VM_BUG_ON(!irqs_disabled());
+
+ mem_cgroup_charge_statistics(memcg, page, -1);
+ memcg_check_events(memcg, page);
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+ struct mem_cgroup *memcg;
+ unsigned short id;
+
+ if (!do_swap_account)
+ return;
+
+ id = swap_cgroup_record(entry, 0);
+ rcu_read_lock();
+ memcg = mem_cgroup_lookup(id);
+ if (memcg) {
+ if (!mem_cgroup_is_root(memcg))
+ page_counter_uncharge(&memcg->memsw, 1);
+ mem_cgroup_swap_statistics(memcg, false);
+ css_put(&memcg->css);
+ }
+ rcu_read_unlock();
+}
+
+/* Remember the swapaccount= boot option. */
+#ifdef CONFIG_MEMCG_SWAP_ENABLED
+static int really_do_swap_account __initdata = 1;
+#else
+static int really_do_swap_account __initdata;
+#endif
+
+static int __init enable_swap_account(char *s)
+{
+ if (!strcmp(s, "1"))
+ really_do_swap_account = 1;
+ else if (!strcmp(s, "0"))
+ really_do_swap_account = 0;
+ return 1;
+}
+__setup("swapaccount=", enable_swap_account);
+
+static struct cftype memsw_cgroup_files[] = {
+ {
+ .name = "memsw.usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.max_usage_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.limit_in_bytes",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
+ .write = mem_cgroup_write,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ {
+ .name = "memsw.failcnt",
+ .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
+ .write = mem_cgroup_reset,
+ .read_u64 = mem_cgroup_read_u64,
+ },
+ { }, /* terminate */
+};
+
+static int __init mem_cgroup_swap_init(void)
+{
+ if (!mem_cgroup_disabled() && really_do_swap_account) {
+ do_swap_account = 1;
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
+ memsw_cgroup_files));
+ }
+ return 0;
+}
+subsys_initcall(mem_cgroup_swap_init);
+
+#endif /* CONFIG_MEMCG_SWAP */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index feb803b..d487f8d 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -242,15 +242,8 @@ void shake_page(struct page *p, int access)
* Only call shrink_node_slabs here (which would also shrink
* other caches) if access is not potentially fatal.
*/
- if (access) {
- int nr;
- int nid = page_to_nid(p);
- do {
- nr = shrink_node_slabs(GFP_KERNEL, nid, 1000, 1000);
- if (page_count(p) == 1)
- break;
- } while (nr > 10);
- }
+ if (access)
+ drop_slab_node(page_to_nid(p));
}
EXPORT_SYMBOL_GPL(shake_page);
@@ -1654,8 +1647,6 @@ static int __soft_offline_page(struct page *page, int flags)
* setting PG_hwpoison.
*/
if (!is_free_buddy_page(page))
- lru_add_drain_all();
- if (!is_free_buddy_page(page))
drain_all_pages(page_zone(page));
SetPageHWPoison(page);
if (!is_free_buddy_page(page))
diff --git a/mm/memory.c b/mm/memory.c
index 2c3536c..8068893 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
pmd = pmd_offset(pud, start);
pud_clear(pud);
pmd_free_tlb(tlb, pmd, start);
+ mm_dec_nr_pmds(tlb->mm);
}
static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -754,6 +755,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr,
if (HAVE_PTE_SPECIAL) {
if (likely(!pte_special(pte)))
goto check_pfn;
+ if (vma->vm_ops && vma->vm_ops->find_special_page)
+ return vma->vm_ops->find_special_page(vma, addr);
if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP))
return NULL;
if (!is_zero_pfn(pfn))
@@ -811,42 +814,40 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* pte contains position in swap or file, so copy. */
if (unlikely(!pte_present(pte))) {
- if (!pte_file(pte)) {
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (likely(!non_swap_entry(entry))) {
- if (swap_duplicate(entry) < 0)
- return entry.val;
-
- /* make sure dst_mm is on swapoff's mmlist. */
- if (unlikely(list_empty(&dst_mm->mmlist))) {
- spin_lock(&mmlist_lock);
- if (list_empty(&dst_mm->mmlist))
- list_add(&dst_mm->mmlist,
- &src_mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- rss[MM_SWAPENTS]++;
- } else if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
-
- if (PageAnon(page))
- rss[MM_ANONPAGES]++;
- else
- rss[MM_FILEPAGES]++;
-
- if (is_write_migration_entry(entry) &&
- is_cow_mapping(vm_flags)) {
- /*
- * COW mappings require pages in both
- * parent and child to be set to read.
- */
- make_migration_entry_read(&entry);
- pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
- pte = pte_swp_mksoft_dirty(pte);
- set_pte_at(src_mm, addr, src_pte, pte);
- }
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (likely(!non_swap_entry(entry))) {
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
+ /* make sure dst_mm is on swapoff's mmlist. */
+ if (unlikely(list_empty(&dst_mm->mmlist))) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ rss[MM_SWAPENTS]++;
+ } else if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
+
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]++;
+ else
+ rss[MM_FILEPAGES]++;
+
+ if (is_write_migration_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ /*
+ * COW mappings require pages in both
+ * parent and child to be set to read.
+ */
+ make_migration_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*src_pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
}
}
goto out_set_pte;
@@ -1020,11 +1021,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
* readonly mappings. The tradeoff is that copy_page_range is more
* efficient than faulting.
*/
- if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
- VM_PFNMAP | VM_MIXEDMAP))) {
- if (!vma->anon_vma)
- return 0;
- }
+ if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+ !vma->anon_vma)
+ return 0;
if (is_vm_hugetlb_page(vma))
return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1082,6 +1081,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
spinlock_t *ptl;
pte_t *start_pte;
pte_t *pte;
+ swp_entry_t entry;
again:
init_rss_vec(rss);
@@ -1107,28 +1107,12 @@ again:
if (details->check_mapping &&
details->check_mapping != page->mapping)
continue;
- /*
- * Each page->index must be checked when
- * invalidating or truncating nonlinear.
- */
- if (details->nonlinear_vma &&
- (page->index < details->first_index ||
- page->index > details->last_index))
- continue;
}
ptent = ptep_get_and_clear_full(mm, addr, pte,
tlb->fullmm);
tlb_remove_tlb_entry(tlb, pte, addr);
if (unlikely(!page))
continue;
- if (unlikely(details) && details->nonlinear_vma
- && linear_page_index(details->nonlinear_vma,
- addr) != page->index) {
- pte_t ptfile = pgoff_to_pte(page->index);
- if (pte_soft_dirty(ptent))
- ptfile = pte_file_mksoft_dirty(ptfile);
- set_pte_at(mm, addr, pte, ptfile);
- }
if (PageAnon(page))
rss[MM_ANONPAGES]--;
else {
@@ -1151,33 +1135,25 @@ again:
}
continue;
}
- /*
- * If details->check_mapping, we leave swap entries;
- * if details->nonlinear_vma, we leave file entries.
- */
+ /* If details->check_mapping, we leave swap entries. */
if (unlikely(details))
continue;
- if (pte_file(ptent)) {
- if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
- print_bad_pte(vma, addr, ptent, NULL);
- } else {
- swp_entry_t entry = pte_to_swp_entry(ptent);
- if (!non_swap_entry(entry))
- rss[MM_SWAPENTS]--;
- else if (is_migration_entry(entry)) {
- struct page *page;
+ entry = pte_to_swp_entry(ptent);
+ if (!non_swap_entry(entry))
+ rss[MM_SWAPENTS]--;
+ else if (is_migration_entry(entry)) {
+ struct page *page;
- page = migration_entry_to_page(entry);
+ page = migration_entry_to_page(entry);
- if (PageAnon(page))
- rss[MM_ANONPAGES]--;
- else
- rss[MM_FILEPAGES]--;
- }
- if (unlikely(!free_swap_and_cache(entry)))
- print_bad_pte(vma, addr, ptent, NULL);
+ if (PageAnon(page))
+ rss[MM_ANONPAGES]--;
+ else
+ rss[MM_FILEPAGES]--;
}
+ if (unlikely(!free_swap_and_cache(entry)))
+ print_bad_pte(vma, addr, ptent, NULL);
pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
} while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1277,7 +1253,7 @@ static void unmap_page_range(struct mmu_gather *tlb,
pgd_t *pgd;
unsigned long next;
- if (details && !details->check_mapping && !details->nonlinear_vma)
+ if (details && !details->check_mapping)
details = NULL;
BUG_ON(addr >= end);
@@ -1371,7 +1347,7 @@ void unmap_vmas(struct mmu_gather *tlb,
* @vma: vm_area_struct holding the applicable pages
* @start: starting address of pages to zap
* @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
*
* Caller must protect the VMA list
*/
@@ -1397,7 +1373,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
* @vma: vm_area_struct holding the applicable pages
* @address: starting address of pages to zap
* @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
*
* The range must fit into one VMA.
*/
@@ -1922,12 +1898,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
EXPORT_SYMBOL_GPL(apply_to_page_range);
/*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically. Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
+ * handle_pte_fault chooses page fault handler according to an entry which was
+ * read non-atomically. Before making any commitment, on those architectures
+ * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
+ * parts, do_swap_page must check under lock before unmapping the pte and
+ * proceeding (but do_wp_page is only called after already making such a check;
* and do_anonymous_page can safely check later on).
*/
static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
@@ -1990,6 +1965,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page,
vmf.pgoff = page->index;
vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE;
vmf.page = page;
+ vmf.cow_page = NULL;
ret = vma->vm_ops->page_mkwrite(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))
@@ -2033,7 +2009,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t entry;
int ret = 0;
int page_mkwrite = 0;
- struct page *dirty_page = NULL;
+ bool dirty_shared = false;
unsigned long mmun_start = 0; /* For mmu_notifiers */
unsigned long mmun_end = 0; /* For mmu_notifiers */
struct mem_cgroup *memcg;
@@ -2084,6 +2060,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(old_page);
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
+ page_cache_get(old_page);
/*
* Only catch write-faults on shared writable pages,
* read-only shared pages can get COWed by
@@ -2091,7 +2068,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
*/
if (vma->vm_ops && vma->vm_ops->page_mkwrite) {
int tmp;
- page_cache_get(old_page);
+
pte_unmap_unlock(page_table, ptl);
tmp = do_page_mkwrite(vma, old_page, address);
if (unlikely(!tmp || (tmp &
@@ -2111,11 +2088,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
unlock_page(old_page);
goto unlock;
}
-
page_mkwrite = 1;
}
- dirty_page = old_page;
- get_page(dirty_page);
+
+ dirty_shared = true;
reuse:
/*
@@ -2134,20 +2110,20 @@ reuse:
pte_unmap_unlock(page_table, ptl);
ret |= VM_FAULT_WRITE;
- if (!dirty_page)
- return ret;
-
- if (!page_mkwrite) {
+ if (dirty_shared) {
struct address_space *mapping;
int dirtied;
- lock_page(dirty_page);
- dirtied = set_page_dirty(dirty_page);
- VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
- mapping = dirty_page->mapping;
- unlock_page(dirty_page);
+ if (!page_mkwrite)
+ lock_page(old_page);
+
+ dirtied = set_page_dirty(old_page);
+ VM_BUG_ON_PAGE(PageAnon(old_page), old_page);
+ mapping = old_page->mapping;
+ unlock_page(old_page);
+ page_cache_release(old_page);
- if (dirtied && mapping) {
+ if ((dirtied || page_mkwrite) && mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
@@ -2155,25 +2131,9 @@ reuse:
balance_dirty_pages_ratelimited(mapping);
}
- /* file_update_time outside page_lock */
- if (vma->vm_file)
+ if (!page_mkwrite)
file_update_time(vma->vm_file);
}
- put_page(dirty_page);
- if (page_mkwrite) {
- struct address_space *mapping = dirty_page->mapping;
-
- set_page_dirty(dirty_page);
- unlock_page(dirty_page);
- page_cache_release(dirty_page);
- if (mapping) {
- /*
- * Some device drivers do not set page.mapping
- * but still dirty their pages
- */
- balance_dirty_pages_ratelimited(mapping);
- }
- }
return ret;
}
@@ -2331,25 +2291,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
}
}
-static inline void unmap_mapping_range_list(struct list_head *head,
- struct zap_details *details)
-{
- struct vm_area_struct *vma;
-
- /*
- * In nonlinear VMAs there is no correspondence between virtual address
- * offset and file offset. So we must perform an exhaustive search
- * across *all* the pages in each nonlinear VMA, not just the pages
- * whose virtual address lies outside the file truncation point.
- */
- list_for_each_entry(vma, head, shared.nonlinear) {
- details->nonlinear_vma = vma;
- unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
- }
-}
-
/**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified
+ * address_space corresponding to the specified page range in the underlying
+ * file.
+ *
* @mapping: the address space containing mmaps to be unmapped.
* @holebegin: byte in first page to unmap, relative to the start of
* the underlying file. This will be rounded down to a PAGE_SIZE
@@ -2378,18 +2324,16 @@ void unmap_mapping_range(struct address_space *mapping,
}
details.check_mapping = even_cows? NULL: mapping;
- details.nonlinear_vma = NULL;
details.first_index = hba;
details.last_index = hba + hlen - 1;
if (details.last_index < details.first_index)
details.last_index = ULONG_MAX;
+ /* DAX uses i_mmap_lock to serialise file truncate vs page fault */
i_mmap_lock_write(mapping);
if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
unmap_mapping_range_tree(&mapping->i_mmap, &details);
- if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
- unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
i_mmap_unlock_write(mapping);
}
EXPORT_SYMBOL(unmap_mapping_range);
@@ -2696,7 +2640,8 @@ oom:
* See filemap_fault() and __lock_page_or_retry().
*/
static int __do_fault(struct vm_area_struct *vma, unsigned long address,
- pgoff_t pgoff, unsigned int flags, struct page **page)
+ pgoff_t pgoff, unsigned int flags,
+ struct page *cow_page, struct page **page)
{
struct vm_fault vmf;
int ret;
@@ -2705,10 +2650,13 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
vmf.pgoff = pgoff;
vmf.flags = flags;
vmf.page = NULL;
+ vmf.cow_page = cow_page;
ret = vma->vm_ops->fault(vma, &vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
+ if (!vmf.page)
+ goto out;
if (unlikely(PageHWPoison(vmf.page))) {
if (ret & VM_FAULT_LOCKED)
@@ -2722,6 +2670,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
else
VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
+ out:
*page = vmf.page;
return ret;
}
@@ -2750,8 +2699,6 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address,
entry = mk_pte(page, vma->vm_page_prot);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
- entry = pte_mksoft_dirty(entry);
if (anon) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, address);
@@ -2886,8 +2833,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* if page by the offset is not ready to be mapped (cold cache or
* something).
*/
- if (vma->vm_ops->map_pages && !(flags & FAULT_FLAG_NONLINEAR) &&
- fault_around_bytes >> PAGE_SHIFT > 1) {
+ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) {
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
do_fault_around(vma, address, pte, pgoff, flags);
if (!pte_same(*pte, orig_pte))
@@ -2895,7 +2841,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(pte, ptl);
}
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -2935,26 +2881,43 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return VM_FAULT_OOM;
}
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
goto uncharge_out;
- copy_user_highpage(new_page, fault_page, address, vma);
+ if (fault_page)
+ copy_user_highpage(new_page, fault_page, address, vma);
__SetPageUptodate(new_page);
pte = pte_offset_map_lock(mm, pmd, address, &ptl);
if (unlikely(!pte_same(*pte, orig_pte))) {
pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
+ if (fault_page) {
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ } else {
+ /*
+ * The fault handler has no page to lock, so it holds
+ * i_mmap_lock for read to protect against truncate.
+ */
+ i_mmap_unlock_read(vma->vm_file->f_mapping);
+ }
goto uncharge_out;
}
do_set_pte(vma, address, new_page, pte, true, true);
mem_cgroup_commit_charge(new_page, memcg, false);
lru_cache_add_active_or_unevictable(new_page, vma);
pte_unmap_unlock(pte, ptl);
- unlock_page(fault_page);
- page_cache_release(fault_page);
+ if (fault_page) {
+ unlock_page(fault_page);
+ page_cache_release(fault_page);
+ } else {
+ /*
+ * The fault handler has no page to lock, so it holds
+ * i_mmap_lock for read to protect against truncate.
+ */
+ i_mmap_unlock_read(vma->vm_file->f_mapping);
+ }
return ret;
uncharge_out:
mem_cgroup_cancel_charge(new_page, memcg);
@@ -2973,7 +2936,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
int dirtied = 0;
int ret, tmp;
- ret = __do_fault(vma, address, pgoff, flags, &fault_page);
+ ret = __do_fault(vma, address, pgoff, flags, NULL, &fault_page);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
return ret;
@@ -3019,8 +2982,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
balance_dirty_pages_ratelimited(mapping);
}
- /* file_update_time outside page_lock */
- if (vma->vm_file && !vma->vm_ops->page_mkwrite)
+ if (!vma->vm_ops->page_mkwrite)
file_update_time(vma->vm_file);
return ret;
@@ -3032,7 +2994,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
* The mmap_sem may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
-static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *page_table, pmd_t *pmd,
unsigned int flags, pte_t orig_pte)
{
@@ -3049,46 +3011,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
}
-/*
- * Fault of a previously existing named mapping. Repopulate the pte
- * from the encoded file_pte if possible. This enables swappable
- * nonlinear vmas.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with pte unmapped and unlocked.
- * The mmap_sem may have been released depending on flags and our
- * return value. See filemap_fault() and __lock_page_or_retry().
- */
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
- unsigned long address, pte_t *page_table, pmd_t *pmd,
- unsigned int flags, pte_t orig_pte)
-{
- pgoff_t pgoff;
-
- flags |= FAULT_FLAG_NONLINEAR;
-
- if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
- return 0;
-
- if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
- /*
- * Page table corrupted: show pte and kill process.
- */
- print_bad_pte(vma, address, orig_pte, NULL);
- return VM_FAULT_SIGBUS;
- }
-
- pgoff = pte_to_pgoff(orig_pte);
- if (!(flags & FAULT_FLAG_WRITE))
- return do_read_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- if (!(vma->vm_flags & VM_SHARED))
- return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
- orig_pte);
- return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
-}
-
static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
unsigned long addr, int page_nid,
int *flags)
@@ -3115,14 +3037,17 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
bool migrated = false;
int flags = 0;
+ /* A PROT_NONE fault should not end up here */
+ BUG_ON(!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)));
+
/*
* The "pte" at this point cannot be used safely without
* validation through pte_unmap_same(). It's of NUMA type but
* the pfn may be screwed if the read is non atomic.
*
- * ptep_modify_prot_start is not called as this is clearing
- * the _PAGE_NUMA bit and it is not really expected that there
- * would be concurrent hardware modifications to the PTE.
+ * We can safely just do a "set_pte_at()", because the old
+ * page table entry is not accessible, so there would be no
+ * concurrent hardware modifications to the PTE.
*/
ptl = pte_lockptr(mm, pmd);
spin_lock(ptl);
@@ -3131,7 +3056,9 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
goto out;
}
- pte = pte_mknonnuma(pte);
+ /* Make it present again */
+ pte = pte_modify(pte, vma->vm_page_prot);
+ pte = pte_mkyoung(pte);
set_pte_at(mm, addr, ptep, pte);
update_mmu_cache(vma, addr, ptep);
@@ -3140,7 +3067,6 @@ static int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
pte_unmap_unlock(ptep, ptl);
return 0;
}
- BUG_ON(is_zero_pfn(page_to_pfn(page)));
/*
* Avoid grouping on DSO/COW pages in specific and RO pages
@@ -3216,20 +3142,17 @@ static int handle_pte_fault(struct mm_struct *mm,
if (pte_none(entry)) {
if (vma->vm_ops) {
if (likely(vma->vm_ops->fault))
- return do_linear_fault(mm, vma, address,
- pte, pmd, flags, entry);
+ return do_fault(mm, vma, address, pte,
+ pmd, flags, entry);
}
return do_anonymous_page(mm, vma, address,
pte, pmd, flags);
}
- if (pte_file(entry))
- return do_nonlinear_fault(mm, vma, address,
- pte, pmd, flags, entry);
return do_swap_page(mm, vma, address,
pte, pmd, flags, entry);
}
- if (pte_numa(entry))
+ if (pte_protnone(entry))
return do_numa_page(mm, vma, address, entry, pte, pmd);
ptl = pte_lockptr(mm, pmd);
@@ -3307,7 +3230,7 @@ static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
if (pmd_trans_splitting(orig_pmd))
return 0;
- if (pmd_numa(orig_pmd))
+ if (pmd_protnone(orig_pmd))
return do_huge_pmd_numa_page(mm, vma, address,
orig_pmd, pmd);
@@ -3428,15 +3351,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
spin_lock(&mm->page_table_lock);
#ifndef __ARCH_HAS_4LEVEL_HACK
- if (pud_present(*pud)) /* Another has populated it */
- pmd_free(mm, new);
- else
+ if (!pud_present(*pud)) {
+ mm_inc_nr_pmds(mm);
pud_populate(mm, pud, new);
-#else
- if (pgd_present(*pud)) /* Another has populated it */
+ } else /* Another has populated it */
pmd_free(mm, new);
- else
+#else
+ if (!pgd_present(*pud)) {
+ mm_inc_nr_pmds(mm);
pgd_populate(mm, pud, new);
+ } else /* Another has populated it */
+ pmd_free(mm, new);
#endif /* __ARCH_HAS_4LEVEL_HACK */
spin_unlock(&mm->page_table_lock);
return 0;
@@ -3561,7 +3486,7 @@ int generic_access_phys(struct vm_area_struct *vma, unsigned long addr,
if (follow_phys(vma, addr, write, &prot, &phys_addr))
return -EINVAL;
- maddr = ioremap_prot(phys_addr, PAGE_SIZE, prot);
+ maddr = ioremap_prot(phys_addr, PAGE_ALIGN(len + offset), prot);
if (write)
memcpy_toio(maddr + offset, buf, len);
else
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 0e0961b..4721046 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -471,24 +471,34 @@ static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
+struct queue_pages { /* state shared by the queue_pages_* page-walk callbacks, passed through mm_walk.private */
+ struct list_head *pagelist; /* list that matching pages are added to (migrate_page_add / isolate_huge_page) */
+ unsigned long flags; /* MPOL_MF_* flags steering the scan */
+ nodemask_t *nmask; /* node set each page's nid is tested against */
+ struct vm_area_struct *prev; /* vma recorded by the previous test_walk call, used for the contiguity check */
+};
+
/*
* Scan through pages checking if pages follow certain conditions,
* and move them to the pagelist if they do.
*/
-static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
- pte_t *orig_pte;
+ struct vm_area_struct *vma = walk->vma;
+ struct page *page;
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
+ int nid;
pte_t *pte;
spinlock_t *ptl;
- orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- do {
- struct page *page;
- int nid;
+ split_huge_page_pmd(vma, addr, pmd);
+ if (pmd_trans_unstable(pmd))
+ return 0;
+ pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; pte++, addr += PAGE_SIZE) {
if (!pte_present(*pte))
continue;
page = vm_normal_page(vma, addr, *pte);
@@ -501,114 +511,46 @@ static int queue_pages_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
if (PageReserved(page))
continue;
nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
continue;
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(page, private, flags);
- else
- break;
- } while (pte++, addr += PAGE_SIZE, addr != end);
- pte_unmap_unlock(orig_pte, ptl);
- return addr != end;
+ migrate_page_add(page, qp->pagelist, flags);
+ }
+ pte_unmap_unlock(pte - 1, ptl);
+ cond_resched();
+ return 0;
}
-static void queue_pages_hugetlb_pmd_range(struct vm_area_struct *vma,
- pmd_t *pmd, const nodemask_t *nodes, unsigned long flags,
- void *private)
+static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
+ unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
+ struct queue_pages *qp = walk->private;
+ unsigned long flags = qp->flags;
int nid;
struct page *page;
spinlock_t *ptl;
pte_t entry;
- ptl = huge_pte_lock(hstate_vma(vma), vma->vm_mm, (pte_t *)pmd);
- entry = huge_ptep_get((pte_t *)pmd);
+ ptl = huge_pte_lock(hstate_vma(walk->vma), walk->mm, pte);
+ entry = huge_ptep_get(pte);
if (!pte_present(entry))
goto unlock;
page = pte_page(entry);
nid = page_to_nid(page);
- if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
+ if (node_isset(nid, *qp->nmask) == !!(flags & MPOL_MF_INVERT))
goto unlock;
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1))
- isolate_huge_page(page, private);
+ isolate_huge_page(page, qp->pagelist);
unlock:
spin_unlock(ptl);
#else
BUG();
#endif
-}
-
-static inline int queue_pages_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pmd_t *pmd;
- unsigned long next;
-
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (!pmd_present(*pmd))
- continue;
- if (pmd_huge(*pmd) && is_vm_hugetlb_page(vma)) {
- queue_pages_hugetlb_pmd_range(vma, pmd, nodes,
- flags, private);
- continue;
- }
- split_huge_page_pmd(vma, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- continue;
- if (queue_pages_pte_range(vma, pmd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pmd++, addr = next, addr != end);
- return 0;
-}
-
-static inline int queue_pages_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pud_t *pud;
- unsigned long next;
-
- pud = pud_offset(pgd, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_huge(*pud) && is_vm_hugetlb_page(vma))
- continue;
- if (pud_none_or_clear_bad(pud))
- continue;
- if (queue_pages_pmd_range(vma, pud, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pud++, addr = next, addr != end);
- return 0;
-}
-
-static inline int queue_pages_pgd_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- const nodemask_t *nodes, unsigned long flags,
- void *private)
-{
- pgd_t *pgd;
- unsigned long next;
-
- pgd = pgd_offset(vma->vm_mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- continue;
- if (queue_pages_pud_range(vma, pgd, addr, next, nodes,
- flags, private))
- return -EIO;
- } while (pgd++, addr = next, addr != end);
return 0;
}
@@ -627,7 +569,7 @@ unsigned long change_prot_numa(struct vm_area_struct *vma,
{
int nr_updated;
- nr_updated = change_protection(vma, addr, end, vma->vm_page_prot, 0, 1);
+ nr_updated = change_protection(vma, addr, end, PAGE_NONE, 0, 1);
if (nr_updated)
count_vm_numa_events(NUMA_PTE_UPDATES, nr_updated);
@@ -641,6 +583,49 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
}
#endif /* CONFIG_NUMA_BALANCING */
+static int queue_pages_test_walk(unsigned long start, unsigned long end, /* .test_walk hook of queue_pages_range(): per-vma decision whether pages get queued */
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+ struct queue_pages *qp = walk->private;
+ unsigned long endvma = vma->vm_end;
+ unsigned long flags = qp->flags;
+
+ if (vma->vm_flags & VM_PFNMAP) /* returns before the contiguity check and before qp->prev is updated */
+ return 1; /* presumably means "skip this vma" to the page walker -- TODO confirm in mm/pagewalk.c */
+
+ if (endvma > end) /* clamp [start, endvma) to the part of the request that lies inside this vma */
+ endvma = end;
+ if (vma->vm_start > start)
+ start = vma->vm_start;
+
+ if (!(flags & MPOL_MF_DISCONTIG_OK)) { /* holes not allowed: fail if the last vma ends early or there is a gap to the previous vma */
+ if (!vma->vm_next && vma->vm_end < end)
+ return -EFAULT;
+ if (qp->prev && qp->prev->vm_end < vma->vm_start)
+ return -EFAULT;
+ }
+
+ qp->prev = vma;
+
+ if (vma->vm_flags & VM_PFNMAP) /* NOTE(review): unreachable -- the identical VM_PFNMAP test at the top has already returned */
+ return 1;
+
+ if (flags & MPOL_MF_LAZY) { /* lazy mode never queues pages; it only calls change_prot_numa() on the clamped range */
+ /* Similar to task_numa_work, skip inaccessible VMAs */
+ if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+ change_prot_numa(vma, start, endvma);
+ return 1;
+ }
+
+ if ((flags & MPOL_MF_STRICT) || /* STRICT always scans; MOVE/MOVE_ALL scan only vmas that vma_migratable() accepts */
+ ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
+ vma_migratable(vma)))
+ /* queue pages from current vma */
+ return 0;
+ return 1;
+}
+
/*
* Walk through page tables and collect pages to be migrated.
*
@@ -650,50 +635,24 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
*/
static int
queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
- const nodemask_t *nodes, unsigned long flags, void *private)
-{
- int err = 0;
- struct vm_area_struct *vma, *prev;
-
- vma = find_vma(mm, start);
- if (!vma)
- return -EFAULT;
- prev = NULL;
- for (; vma && vma->vm_start < end; vma = vma->vm_next) {
- unsigned long endvma = vma->vm_end;
-
- if (endvma > end)
- endvma = end;
- if (vma->vm_start > start)
- start = vma->vm_start;
-
- if (!(flags & MPOL_MF_DISCONTIG_OK)) {
- if (!vma->vm_next && vma->vm_end < end)
- return -EFAULT;
- if (prev && prev->vm_end < vma->vm_start)
- return -EFAULT;
- }
-
- if (flags & MPOL_MF_LAZY) {
- /* Similar to task_numa_work, skip inaccessible VMAs */
- if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
- change_prot_numa(vma, start, endvma);
- goto next;
- }
-
- if ((flags & MPOL_MF_STRICT) ||
- ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
- vma_migratable(vma))) {
-
- err = queue_pages_pgd_range(vma, start, endvma, nodes,
- flags, private);
- if (err)
- break;
- }
-next:
- prev = vma;
- }
- return err;
+ nodemask_t *nodes, unsigned long flags,
+ struct list_head *pagelist)
+{
+ struct queue_pages qp = {
+ .pagelist = pagelist,
+ .flags = flags,
+ .nmask = nodes,
+ .prev = NULL,
+ };
+ struct mm_walk queue_pages_walk = {
+ .hugetlb_entry = queue_pages_hugetlb,
+ .pmd_entry = queue_pages_pte_range,
+ .test_walk = queue_pages_test_walk,
+ .mm = mm,
+ .private = &qp,
+ };
+
+ return walk_page_range(start, end, &queue_pages_walk);
}
/*
@@ -1988,43 +1947,63 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
* @order:Order of the GFP allocation.
* @vma: Pointer to VMA or NULL if not available.
* @addr: Virtual Address of the allocation. Must be inside the VMA.
+ * @node: Which node to prefer for allocation (modulo policy).
+ * @hugepage: for hugepages try only the preferred node if possible
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
* When VMA is not NULL caller must hold down_read on the mmap_sem of the
* mm_struct of the VMA to prevent it from going away. Should be used for
- * all allocations for pages that will be mapped into
- * user space. Returns NULL when no page can be allocated.
- *
- * Should be called with the mm_sem of the vma hold.
+ * all allocations for pages that will be mapped into user space. Returns
+ * NULL when no page can be allocated.
*/
struct page *
alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
- unsigned long addr, int node)
+ unsigned long addr, int node, bool hugepage)
{
struct mempolicy *pol;
struct page *page;
unsigned int cpuset_mems_cookie;
+ struct zonelist *zl;
+ nodemask_t *nmask;
retry_cpuset:
pol = get_vma_policy(vma, addr);
cpuset_mems_cookie = read_mems_allowed_begin();
- if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
+ if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
+ pol->mode != MPOL_INTERLEAVE)) {
+ /*
+ * For hugepage allocation and non-interleave policy which
+ * allows the current node, we only try to allocate from the
+ * current node and don't fall back to other nodes, as the
+ * cost of remote accesses would likely offset THP benefits.
+ *
+ * If the policy is interleave, or does not allow the current
+ * node in its nodemask, we allocate the standard way.
+ */
+ nmask = policy_nodemask(gfp, pol);
+ if (!nmask || node_isset(node, *nmask)) {
+ mpol_cond_put(pol);
+ page = alloc_pages_exact_node(node, gfp, order);
+ goto out;
+ }
+ }
+
+ if (pol->mode == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
mpol_cond_put(pol);
page = alloc_page_interleave(gfp, order, nid);
- if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
- goto retry_cpuset;
-
- return page;
+ goto out;
}
- page = __alloc_pages_nodemask(gfp, order,
- policy_zonelist(gfp, pol, node),
- policy_nodemask(gfp, pol));
+
+ nmask = policy_nodemask(gfp, pol);
+ zl = policy_zonelist(gfp, pol, node);
mpol_cond_put(pol);
+ page = __alloc_pages_nodemask(gfp, order, zl, nmask);
+out:
if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
goto retry_cpuset;
return page;
@@ -2838,8 +2817,7 @@ void mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
p += snprintf(p, buffer + maxlen - p, "relative");
}
- if (!nodes_empty(nodes)) {
- p += snprintf(p, buffer + maxlen - p, ":");
- p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
- }
+ if (!nodes_empty(nodes))
+ p += scnprintf(p, buffer + maxlen - p, ":%*pbl",
+ nodemask_pr_args(&nodes));
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 344cdf6..85e0426 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -179,37 +179,6 @@ out:
}
/*
- * Congratulations to trinity for discovering this bug.
- * mm/fremap.c's remap_file_pages() accepts any range within a single vma to
- * convert that vma to VM_NONLINEAR; and generic_file_remap_pages() will then
- * replace the specified range by file ptes throughout (maybe populated after).
- * If page migration finds a page within that range, while it's still located
- * by vma_interval_tree rather than lost to i_mmap_nonlinear list, no problem:
- * zap_pte() clears the temporary migration entry before mmap_sem is dropped.
- * But if the migrating page is in a part of the vma outside the range to be
- * remapped, then it will not be cleared, and remove_migration_ptes() needs to
- * deal with it. Fortunately, this part of the vma is of course still linear,
- * so we just need to use linear location on the nonlinear list.
- */
-static int remove_linear_migration_ptes_from_nonlinear(struct page *page,
- struct address_space *mapping, void *arg)
-{
- struct vm_area_struct *vma;
- /* hugetlbfs does not support remap_pages, so no huge pgoff worries */
- pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- unsigned long addr;
-
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
- addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
- if (addr >= vma->vm_start && addr < vma->vm_end)
- remove_migration_pte(page, vma, addr, arg);
- }
- return SWAP_AGAIN;
-}
-
-/*
* Get rid of all migration entries and replace them by
* references to the indicated page.
*/
@@ -218,7 +187,6 @@ static void remove_migration_ptes(struct page *old, struct page *new)
struct rmap_walk_control rwc = {
.rmap_one = remove_migration_pte,
.arg = old,
- .file_nonlinear = remove_linear_migration_ptes_from_nonlinear,
};
rmap_walk(new, &rwc);
@@ -229,7 +197,7 @@ static void remove_migration_ptes(struct page *old, struct page *new)
* get to the page and wait until migration is finished.
* When we return from this function the fault will be retried.
*/
-static void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
+void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep,
spinlock_t *ptl)
{
pte_t pte;
@@ -1268,7 +1236,8 @@ static int do_move_page_to_node_array(struct mm_struct *mm,
goto put_and_set;
if (PageHuge(page)) {
- isolate_huge_page(page, &pagelist);
+ if (PageHead(page))
+ isolate_huge_page(page, &pagelist);
goto put_and_set;
}
@@ -1685,12 +1654,6 @@ bool pmd_trans_migrating(pmd_t pmd)
return PageLocked(page);
}
-void wait_migrate_huge_page(struct anon_vma *anon_vma, pmd_t *pmd)
-{
- struct page *page = pmd_page(*pmd);
- wait_on_page_locked(page);
-}
-
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -1884,7 +1847,7 @@ out_fail:
out_dropref:
ptl = pmd_lock(mm, pmd);
if (pmd_same(*pmd, entry)) {
- entry = pmd_mknonnuma(entry);
+ entry = pmd_modify(entry, vma->vm_page_prot);
set_pmd_at(mm, mmun_start, pmd, entry);
update_mmu_cache_pmd(vma, address, &entry);
}
diff --git a/mm/mincore.c b/mm/mincore.c
index c8c528b..be25efd 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -19,38 +19,25 @@
#include <asm/uaccess.h>
#include <asm/pgtable.h>
-static void mincore_hugetlb_page_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
+ unsigned long end, struct mm_walk *walk)
{
#ifdef CONFIG_HUGETLB_PAGE
- struct hstate *h;
+ unsigned char present;
+ unsigned char *vec = walk->private;
- h = hstate_vma(vma);
- while (1) {
- unsigned char present;
- pte_t *ptep;
- /*
- * Huge pages are always in RAM for now, but
- * theoretically it needs to be checked.
- */
- ptep = huge_pte_offset(current->mm,
- addr & huge_page_mask(h));
- present = ptep && !huge_pte_none(huge_ptep_get(ptep));
- while (1) {
- *vec = present;
- vec++;
- addr += PAGE_SIZE;
- if (addr == end)
- return;
- /* check hugepage border */
- if (!(addr & ~huge_page_mask(h)))
- break;
- }
- }
+ /*
+ * Hugepages under user process are always in RAM and never
+ * swapped out, but theoretically it needs to be checked.
+ */
+ present = pte && !huge_pte_none(huge_ptep_get(pte));
+ for (; addr != end; vec++, addr += PAGE_SIZE)
+ *vec = present;
+ walk->private = vec;
#else
BUG();
#endif
+ return 0;
}
/*
@@ -94,9 +81,8 @@ static unsigned char mincore_page(struct address_space *mapping, pgoff_t pgoff)
return present;
}
-static void mincore_unmapped_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int __mincore_unmapped_range(unsigned long addr, unsigned long end,
+ struct vm_area_struct *vma, unsigned char *vec)
{
unsigned long nr = (end - addr) >> PAGE_SHIFT;
int i;
@@ -111,30 +97,47 @@ static void mincore_unmapped_range(struct vm_area_struct *vma,
for (i = 0; i < nr; i++)
vec[i] = 0;
}
+ return nr;
+}
+
+static int mincore_unmapped_range(unsigned long addr, unsigned long end, /* .pte_hole callback of the mincore walk; thin wrapper around __mincore_unmapped_range() */
+ struct mm_walk *walk)
+{
+ walk->private += __mincore_unmapped_range(addr, end, /* walk->private is the output cursor (set to vec in do_mincore); advance it by the page count the helper returns */
+ walk->vma, walk->private);
+ return 0;
+}
-static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
+static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
{
- unsigned long next;
spinlock_t *ptl;
+ struct vm_area_struct *vma = walk->vma;
pte_t *ptep;
+ unsigned char *vec = walk->private;
+ int nr = (end - addr) >> PAGE_SHIFT;
+
+ if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) {
+ memset(vec, 1, nr);
+ spin_unlock(ptl);
+ goto out;
+ }
+
+ if (pmd_trans_unstable(pmd)) {
+ __mincore_unmapped_range(addr, end, vma, vec);
+ goto out;
+ }
- ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
- do {
+ ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+ for (; addr != end; ptep++, addr += PAGE_SIZE) {
pte_t pte = *ptep;
- pgoff_t pgoff;
- next = addr + PAGE_SIZE;
if (pte_none(pte))
- mincore_unmapped_range(vma, addr, next, vec);
+ __mincore_unmapped_range(addr, addr + PAGE_SIZE,
+ vma, vec);
else if (pte_present(pte))
*vec = 1;
- else if (pte_file(pte)) {
- pgoff = pte_to_pgoff(pte);
- *vec = mincore_page(vma->vm_file->f_mapping, pgoff);
- } else { /* pte is a swap entry */
+ else { /* pte is a swap entry */
swp_entry_t entry = pte_to_swp_entry(pte);
if (non_swap_entry(entry)) {
@@ -145,9 +148,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
*vec = 1;
} else {
#ifdef CONFIG_SWAP
- pgoff = entry.val;
*vec = mincore_page(swap_address_space(entry),
- pgoff);
+ entry.val);
#else
WARN_ON(1);
*vec = 1;
@@ -155,69 +157,12 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
}
}
vec++;
- } while (ptep++, addr = next, addr != end);
+ }
pte_unmap_unlock(ptep - 1, ptl);
-}
-
-static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pmd_t *pmd;
-
- pmd = pmd_offset(pud, addr);
- do {
- next = pmd_addr_end(addr, end);
- if (pmd_trans_huge(*pmd)) {
- if (mincore_huge_pmd(vma, pmd, addr, next, vec)) {
- vec += (next - addr) >> PAGE_SHIFT;
- continue;
- }
- /* fall through */
- }
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pte_range(vma, pmd, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pmd++, addr = next, addr != end);
-}
-
-static void mincore_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pud_t *pud;
-
- pud = pud_offset(pgd, addr);
- do {
- next = pud_addr_end(addr, end);
- if (pud_none_or_clear_bad(pud))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pmd_range(vma, pud, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pud++, addr = next, addr != end);
-}
-
-static void mincore_page_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
- unsigned char *vec)
-{
- unsigned long next;
- pgd_t *pgd;
-
- pgd = pgd_offset(vma->vm_mm, addr);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
- mincore_unmapped_range(vma, addr, next, vec);
- else
- mincore_pud_range(vma, pgd, addr, next, vec);
- vec += (next - addr) >> PAGE_SHIFT;
- } while (pgd++, addr = next, addr != end);
+out:
+ walk->private += nr;
+ cond_resched();
+ return 0;
}
/*
@@ -229,18 +174,22 @@ static long do_mincore(unsigned long addr, unsigned long pages, unsigned char *v
{
struct vm_area_struct *vma;
unsigned long end;
+ int err;
+ struct mm_walk mincore_walk = {
+ .pmd_entry = mincore_pte_range,
+ .pte_hole = mincore_unmapped_range,
+ .hugetlb_entry = mincore_hugetlb,
+ .private = vec,
+ };
vma = find_vma(current->mm, addr);
if (!vma || addr < vma->vm_start)
return -ENOMEM;
-
+ mincore_walk.mm = vma->vm_mm;
end = min(vma->vm_end, addr + (pages << PAGE_SHIFT));
-
- if (is_vm_hugetlb_page(vma))
- mincore_hugetlb_page_range(vma, addr, end, vec);
- else
- mincore_page_range(vma, addr, end, vec);
-
+ err = walk_page_range(addr, end, &mincore_walk);
+ if (err < 0)
+ return err;
return (end - addr) >> PAGE_SHIFT;
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 4074caf..5f420f7 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -14,14 +14,14 @@
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
-int mminit_loglevel;
+int __meminitdata mminit_loglevel;
#ifndef SECTIONS_SHIFT
#define SECTIONS_SHIFT 0
#endif
/* The zonelists are simply reported, validation is manual. */
-void mminit_verify_zonelist(void)
+void __init mminit_verify_zonelist(void)
{
int nid;
diff --git a/mm/mmap.c b/mm/mmap.c
index 7f684d5..da9990a 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -152,7 +152,7 @@ EXPORT_SYMBOL_GPL(vm_memory_committed);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed, reserve;
+ long free, allowed, reserve;
VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
-(s64)vm_committed_as_batch * num_online_cpus(),
@@ -220,7 +220,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min(mm->total_vm / 32, reserve);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
@@ -243,10 +243,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
mapping_unmap_writable(mapping);
flush_dcache_mmap_lock(mapping);
- if (unlikely(vma->vm_flags & VM_NONLINEAR))
- list_del_init(&vma->shared.nonlinear);
- else
- vma_interval_tree_remove(vma, &mapping->i_mmap);
+ vma_interval_tree_remove(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
@@ -649,10 +646,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
atomic_inc(&mapping->i_mmap_writable);
flush_dcache_mmap_lock(mapping);
- if (unlikely(vma->vm_flags & VM_NONLINEAR))
- vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
- else
- vma_interval_tree_insert(vma, &mapping->i_mmap);
+ vma_interval_tree_insert(vma, &mapping->i_mmap);
flush_dcache_mmap_unlock(mapping);
}
}
@@ -789,14 +783,11 @@ again: remove_next = 1 + (end > next->vm_end);
if (file) {
mapping = file->f_mapping;
- if (!(vma->vm_flags & VM_NONLINEAR)) {
- root = &mapping->i_mmap;
- uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+ root = &mapping->i_mmap;
+ uprobe_munmap(vma, vma->vm_start, vma->vm_end);
- if (adjust_next)
- uprobe_munmap(next, next->vm_start,
- next->vm_end);
- }
+ if (adjust_next)
+ uprobe_munmap(next, next->vm_start, next->vm_end);
i_mmap_lock_write(mapping);
if (insert) {
@@ -2634,6 +2625,75 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
return vm_munmap(addr, len);
}
+
+/*
+ * Emulation of deprecated remap_file_pages() syscall: the range is mapped again at the requested pgoff with a MAP_FIXED do_mmap_pgoff().
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+ unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+
+ struct mm_struct *mm = current->mm;
+ struct vm_area_struct *vma;
+ unsigned long populate = 0;
+ unsigned long ret = -EINVAL; /* result for every rejection path below */
+ struct file *file;
+
+ pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
+ "See Documentation/vm/remap_file_pages.txt.\n",
+ current->comm, current->pid);
+
+ if (prot) /* a non-zero prot argument is rejected with -EINVAL */
+ return ret;
+ start = start & PAGE_MASK; /* both values are masked with PAGE_MASK, dropping sub-page bits */
+ size = size & PAGE_MASK;
+
+ if (start + size <= start) /* empty range, or start + size wraps around */
+ return ret;
+
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return ret;
+
+ down_write(&mm->mmap_sem);
+ vma = find_vma(mm, start);
+
+ if (!vma || !(vma->vm_flags & VM_SHARED)) /* only VM_SHARED mappings are accepted */
+ goto out;
+
+ if (start < vma->vm_start || start + size > vma->vm_end) /* the whole range must sit inside this one vma */
+ goto out;
+
+ if (pgoff == linear_page_index(vma, start)) { /* range is already mapped at the requested offset: succeed without remapping */
+ ret = 0;
+ goto out;
+ }
+
+ prot |= vma->vm_flags & VM_READ ? PROT_READ : 0; /* prot is 0 at this point; rebuild it from the vma's access flags */
+ prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+ prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+ flags &= MAP_NONBLOCK; /* MAP_NONBLOCK is the only caller-supplied flag that survives */
+ flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+ if (vma->vm_flags & VM_LOCKED) {
+ flags |= MAP_LOCKED;
+ /* drop PG_Mlocked flag for over-mapped range */
+ munlock_vma_pages_range(vma, start, start + size);
+ }
+
+ file = get_file(vma->vm_file); /* hold a file reference across do_mmap_pgoff(); released by fput() below */
+ ret = do_mmap_pgoff(vma->vm_file, start, size,
+ prot, flags, pgoff, &populate);
+ fput(file);
+out:
+ up_write(&mm->mmap_sem);
+ if (populate) /* populate is only set through do_mmap_pgoff(); mm_populate() runs after mmap_sem is released */
+ mm_populate(ret, populate);
+ if (!IS_ERR_VALUE(ret)) /* any non-error value is reported as 0, not as the mapped address */
+ ret = 0;
+ return ret;
+}
+
static inline void verify_mm_writelocked(struct mm_struct *mm)
{
#ifdef CONFIG_DEBUG_VM
@@ -2791,9 +2851,6 @@ void exit_mmap(struct mm_struct *mm)
vma = remove_vma(vma);
}
vm_unacct_memory(nr_accounted);
-
- WARN_ON(atomic_long_read(&mm->nr_ptes) >
- (FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
}
/* Insert vm structure into process list sorted by address
@@ -3108,8 +3165,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*
* mmap_sem in write mode is required in order to block all operations
* that could modify pagetables and free pages without need of
- * altering the vma layout (for example populate_range() with
- * nonlinear vmas). It's also needed in write mode to avoid new
+ * altering the vma layout. It's also needed in write mode to avoid new
* anon_vmas to be associated with existing vmas.
*
* A single task can't take more than one mm_take_all_locks() in a row
diff --git a/mm/mmzone.c b/mm/mmzone.c
index bf34fb8..7d87ebb 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -54,8 +54,7 @@ static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes)
/* Returns the next zone at or below highest_zoneidx in a zonelist */
struct zoneref *next_zones_zonelist(struct zoneref *z,
enum zone_type highest_zoneidx,
- nodemask_t *nodes,
- struct zone **zone)
+ nodemask_t *nodes)
{
/*
* Find the next suitable zone to use for the allocation.
@@ -69,7 +68,6 @@ struct zoneref *next_zones_zonelist(struct zoneref *z,
(z->zone && !zref_in_nodemask(z, nodes)))
z++;
- *zone = zonelist_zone(z);
return z;
}
diff --git a/mm/mprotect.c b/mm/mprotect.c
index ace9345..4472781 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -75,37 +75,35 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
oldpte = *pte;
if (pte_present(oldpte)) {
pte_t ptent;
- bool updated = false;
- if (!prot_numa) {
- ptent = ptep_modify_prot_start(mm, addr, pte);
- if (pte_numa(ptent))
- ptent = pte_mknonnuma(ptent);
- ptent = pte_modify(ptent, newprot);
- /*
- * Avoid taking write faults for pages we
- * know to be dirty.
- */
- if (dirty_accountable && pte_dirty(ptent) &&
- (pte_soft_dirty(ptent) ||
- !(vma->vm_flags & VM_SOFTDIRTY)))
- ptent = pte_mkwrite(ptent);
- ptep_modify_prot_commit(mm, addr, pte, ptent);
- updated = true;
- } else {
+ /*
+ * Avoid trapping faults against the zero or KSM
+ * pages. See similar comment in change_huge_pmd.
+ */
+ if (prot_numa) {
struct page *page;
page = vm_normal_page(vma, addr, oldpte);
- if (page && !PageKsm(page)) {
- if (!pte_numa(oldpte)) {
- ptep_set_numa(mm, addr, pte);
- updated = true;
- }
- }
+ if (!page || PageKsm(page))
+ continue;
+
+ /* Avoid TLB flush if possible */
+ if (pte_protnone(oldpte))
+ continue;
}
- if (updated)
- pages++;
- } else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
+
+ ptent = ptep_modify_prot_start(mm, addr, pte);
+ ptent = pte_modify(ptent, newprot);
+
+ /* Avoid taking write faults for known dirty pages */
+ if (dirty_accountable && pte_dirty(ptent) &&
+ (pte_soft_dirty(ptent) ||
+ !(vma->vm_flags & VM_SOFTDIRTY))) {
+ ptent = pte_mkwrite(ptent);
+ }
+ ptep_modify_prot_commit(mm, addr, pte, ptent);
+ pages++;
+ } else if (IS_ENABLED(CONFIG_MIGRATION)) {
swp_entry_t entry = pte_to_swp_entry(oldpte);
if (is_write_migration_entry(entry)) {
diff --git a/mm/mremap.c b/mm/mremap.c
index 17fa018..57dadc0 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -81,8 +81,6 @@ static pte_t move_soft_dirty_pte(pte_t pte)
pte = pte_mksoft_dirty(pte);
else if (is_swap_pte(pte))
pte = pte_swp_mksoft_dirty(pte);
- else if (pte_file(pte))
- pte = pte_file_mksoft_dirty(pte);
#endif
return pte;
}
diff --git a/mm/msync.c b/mm/msync.c
index 992a167..bb04d53 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -86,10 +86,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
(vma->vm_flags & VM_SHARED)) {
get_file(file);
up_read(&mm->mmap_sem);
- if (vma->vm_flags & VM_NONLINEAR)
- error = vfs_fsync(file, 1);
- else
- error = vfs_fsync_range(file, fstart, fend, 1);
+ error = vfs_fsync_range(file, fstart, fend, 1);
fput(file);
if (error || start >= end)
goto out;
diff --git a/mm/nommu.c b/mm/nommu.c
index 28bd8c4..7296360 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -214,6 +214,39 @@ long get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
}
EXPORT_SYMBOL(get_user_pages);
+long get_user_pages_locked(struct task_struct *tsk, struct mm_struct *mm, /* nommu version: forwards directly to get_user_pages() */
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ int *locked) /* 'locked' is neither read nor written here */
+{
+ return get_user_pages(tsk, mm, start, nr_pages, write, force,
+ pages, NULL); /* NULL fills the argument after 'pages' -- presumably the vmas array, confirm against get_user_pages() */
+}
+EXPORT_SYMBOL(get_user_pages_locked);
+
+long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, /* nommu version: get_user_pages() wrapped in mmap_sem taken for read */
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages,
+ unsigned int gup_flags) /* gup_flags is not used in this implementation */
+{
+ long ret;
+ down_read(&mm->mmap_sem); /* the caller does not hold mmap_sem; it is taken and released here */
+ ret = get_user_pages(tsk, mm, start, nr_pages, write, force,
+ pages, NULL);
+ up_read(&mm->mmap_sem);
+ return ret; /* get_user_pages() result is passed through unchanged */
+}
+EXPORT_SYMBOL(__get_user_pages_unlocked);
+
+long get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, /* nommu version: __get_user_pages_unlocked() with gup_flags fixed to 0 */
+ unsigned long start, unsigned long nr_pages,
+ int write, int force, struct page **pages)
+{
+ return __get_user_pages_unlocked(tsk, mm, start, nr_pages, write,
+ force, pages, 0); /* trailing 0 is the gup_flags argument */
+}
+EXPORT_SYMBOL(get_user_pages_unlocked);
+
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -947,9 +980,6 @@ static int validate_mmap_request(struct file *file,
return -EOVERFLOW;
if (file) {
- /* validate file mapping requests */
- struct address_space *mapping;
-
/* files must support mmap */
if (!file->f_op->mmap)
return -ENODEV;
@@ -958,28 +988,22 @@ static int validate_mmap_request(struct file *file,
* - we support chardevs that provide their own "memory"
* - we support files/blockdevs that are memory backed
*/
- mapping = file->f_mapping;
- if (!mapping)
- mapping = file_inode(file)->i_mapping;
-
- capabilities = 0;
- if (mapping && mapping->backing_dev_info)
- capabilities = mapping->backing_dev_info->capabilities;
-
- if (!capabilities) {
+ if (file->f_op->mmap_capabilities) {
+ capabilities = file->f_op->mmap_capabilities(file);
+ } else {
/* no explicit capabilities set, so assume some
* defaults */
switch (file_inode(file)->i_mode & S_IFMT) {
case S_IFREG:
case S_IFBLK:
- capabilities = BDI_CAP_MAP_COPY;
+ capabilities = NOMMU_MAP_COPY;
break;
case S_IFCHR:
capabilities =
- BDI_CAP_MAP_DIRECT |
- BDI_CAP_READ_MAP |
- BDI_CAP_WRITE_MAP;
+ NOMMU_MAP_DIRECT |
+ NOMMU_MAP_READ |
+ NOMMU_MAP_WRITE;
break;
default:
@@ -990,9 +1014,9 @@ static int validate_mmap_request(struct file *file,
/* eliminate any capabilities that we can't support on this
* device */
if (!file->f_op->get_unmapped_area)
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
if (!file->f_op->read)
- capabilities &= ~BDI_CAP_MAP_COPY;
+ capabilities &= ~NOMMU_MAP_COPY;
/* The file shall have been opened with read permission. */
if (!(file->f_mode & FMODE_READ))
@@ -1011,29 +1035,29 @@ static int validate_mmap_request(struct file *file,
if (locks_verify_locked(file))
return -EAGAIN;
- if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ if (!(capabilities & NOMMU_MAP_DIRECT))
return -ENODEV;
/* we mustn't privatise shared mappings */
- capabilities &= ~BDI_CAP_MAP_COPY;
+ capabilities &= ~NOMMU_MAP_COPY;
} else {
/* we're going to read the file into private memory we
* allocate */
- if (!(capabilities & BDI_CAP_MAP_COPY))
+ if (!(capabilities & NOMMU_MAP_COPY))
return -ENODEV;
/* we don't permit a private writable mapping to be
* shared with the backing device */
if (prot & PROT_WRITE)
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
}
- if (capabilities & BDI_CAP_MAP_DIRECT) {
- if (((prot & PROT_READ) && !(capabilities & BDI_CAP_READ_MAP)) ||
- ((prot & PROT_WRITE) && !(capabilities & BDI_CAP_WRITE_MAP)) ||
- ((prot & PROT_EXEC) && !(capabilities & BDI_CAP_EXEC_MAP))
+ if (capabilities & NOMMU_MAP_DIRECT) {
+ if (((prot & PROT_READ) && !(capabilities & NOMMU_MAP_READ)) ||
+ ((prot & PROT_WRITE) && !(capabilities & NOMMU_MAP_WRITE)) ||
+ ((prot & PROT_EXEC) && !(capabilities & NOMMU_MAP_EXEC))
) {
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
if (flags & MAP_SHARED) {
printk(KERN_WARNING
"MAP_SHARED not completely supported on !MMU\n");
@@ -1050,21 +1074,21 @@ static int validate_mmap_request(struct file *file,
} else if ((prot & PROT_READ) && !(prot & PROT_EXEC)) {
/* handle implication of PROT_EXEC by PROT_READ */
if (current->personality & READ_IMPLIES_EXEC) {
- if (capabilities & BDI_CAP_EXEC_MAP)
+ if (capabilities & NOMMU_MAP_EXEC)
prot |= PROT_EXEC;
}
} else if ((prot & PROT_READ) &&
(prot & PROT_EXEC) &&
- !(capabilities & BDI_CAP_EXEC_MAP)
+ !(capabilities & NOMMU_MAP_EXEC)
) {
/* backing file is not executable, try to copy */
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
}
} else {
/* anonymous mappings are always memory backed and can be
* privately mapped
*/
- capabilities = BDI_CAP_MAP_COPY;
+ capabilities = NOMMU_MAP_COPY;
/* handle PROT_EXEC implication by PROT_READ */
if ((prot & PROT_READ) &&
@@ -1096,7 +1120,7 @@ static unsigned long determine_vm_flags(struct file *file,
vm_flags = calc_vm_prot_bits(prot) | calc_vm_flag_bits(flags);
/* vm_flags |= mm->def_flags; */
- if (!(capabilities & BDI_CAP_MAP_DIRECT)) {
+ if (!(capabilities & NOMMU_MAP_DIRECT)) {
/* attempt to share read-only copies of mapped file chunks */
vm_flags |= VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (file && !(prot & PROT_WRITE))
@@ -1105,7 +1129,7 @@ static unsigned long determine_vm_flags(struct file *file,
/* overlay a shareable mapping on the backing device or inode
* if possible - used for chardevs, ramfs/tmpfs/shmfs and
* romfs/cramfs */
- vm_flags |= VM_MAYSHARE | (capabilities & BDI_CAP_VMFLAGS);
+ vm_flags |= VM_MAYSHARE | (capabilities & NOMMU_VMFLAGS);
if (flags & MAP_SHARED)
vm_flags |= VM_SHARED;
}
@@ -1158,7 +1182,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
* shared mappings on devices or memory
* - VM_MAYSHARE will be set if it may attempt to share
*/
- if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (capabilities & NOMMU_MAP_DIRECT) {
ret = vma->vm_file->f_op->mmap(vma->vm_file, vma);
if (ret == 0) {
/* shouldn't return success if we're not sharing */
@@ -1347,7 +1371,7 @@ unsigned long do_mmap_pgoff(struct file *file,
if ((pregion->vm_pgoff != pgoff || rpglen != pglen) &&
!(pgoff >= pregion->vm_pgoff && pgend <= rpgend)) {
/* new mapping is not a subset of the region */
- if (!(capabilities & BDI_CAP_MAP_DIRECT))
+ if (!(capabilities & NOMMU_MAP_DIRECT))
goto sharing_violation;
continue;
}
@@ -1386,7 +1410,7 @@ unsigned long do_mmap_pgoff(struct file *file,
* - this is the hook for quasi-memory character devices to
* tell us the location of a shared mapping
*/
- if (capabilities & BDI_CAP_MAP_DIRECT) {
+ if (capabilities & NOMMU_MAP_DIRECT) {
addr = file->f_op->get_unmapped_area(file, addr, len,
pgoff, flags);
if (IS_ERR_VALUE(addr)) {
@@ -1398,10 +1422,10 @@ unsigned long do_mmap_pgoff(struct file *file,
* the mapping so we'll have to attempt to copy
* it */
ret = -ENODEV;
- if (!(capabilities & BDI_CAP_MAP_COPY))
+ if (!(capabilities & NOMMU_MAP_COPY))
goto error_just_free;
- capabilities &= ~BDI_CAP_MAP_DIRECT;
+ capabilities &= ~NOMMU_MAP_DIRECT;
} else {
vma->vm_start = region->vm_start = addr;
vma->vm_end = region->vm_end = addr + len;
@@ -1412,7 +1436,7 @@ unsigned long do_mmap_pgoff(struct file *file,
vma->vm_region = region;
/* set up the mapping
- * - the region is filled in if BDI_CAP_MAP_DIRECT is still set
+ * - the region is filled in if NOMMU_MAP_DIRECT is still set
*/
if (file && vma->vm_flags & VM_SHARED)
ret = do_mmap_shared_file(vma);
@@ -1895,7 +1919,7 @@ EXPORT_SYMBOL(unmap_mapping_range);
*/
int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
- unsigned long free, allowed, reserve;
+ long free, allowed, reserve;
vm_acct_memory(pages);
@@ -1959,7 +1983,7 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
*/
if (mm) {
reserve = sysctl_user_reserve_kbytes >> (PAGE_SHIFT - 10);
- allowed -= min(mm->total_vm / 32, reserve);
+ allowed -= min_t(long, mm->total_vm / 32, reserve);
}
if (percpu_counter_read_positive(&vm_committed_as) < allowed)
@@ -1984,14 +2008,6 @@ void filemap_map_pages(struct vm_area_struct *vma, struct vm_fault *vmf)
}
EXPORT_SYMBOL(filemap_map_pages);
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
- unsigned long size, pgoff_t pgoff)
-{
- BUG();
- return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d503e9c..642f38c 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
* The baseline for the badness score is the proportion of RAM that each
* task's rss, pagetable and swap space use.
*/
- points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
- get_mm_counter(p->mm, MM_SWAPENTS);
+ points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+ atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
task_unlock(p);
/*
@@ -266,8 +266,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
* Don't allow any other task to have access to the reserves.
*/
if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
- if (unlikely(frozen(task)))
- __thaw_task(task);
if (!force_kill)
return OOM_SCAN_ABORT;
}
@@ -353,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
struct task_struct *p;
struct task_struct *task;
- pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n");
+ pr_info("[ pid ] uid tgid total_vm rss nr_ptes nr_pmds swapents oom_score_adj name\n");
rcu_read_lock();
for_each_process(p) {
if (oom_unkillable_task(p, memcg, nodemask))
@@ -369,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
continue;
}
- pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu %5hd %s\n",
+ pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu %5hd %s\n",
task->pid, from_kuid(&init_user_ns, task_uid(task)),
task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
atomic_long_read(&task->mm->nr_ptes),
+ mm_nr_pmds(task->mm),
get_mm_counter(task->mm, MM_SWAPENTS),
task->signal->oom_score_adj, task->comm);
task_unlock(task);
@@ -400,20 +399,98 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
}
/*
- * Number of OOM killer invocations (including memcg OOM killer).
- * Primarily used by PM freezer to check for potential races with
- * OOM killed frozen task.
+ * Number of OOM victims in flight
*/
-static atomic_t oom_kills = ATOMIC_INIT(0);
+static atomic_t oom_victims = ATOMIC_INIT(0);
+static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait);
-int oom_kills_count(void)
+bool oom_killer_disabled __read_mostly;
+static DECLARE_RWSEM(oom_sem);
+
+/**
+ * mark_tsk_oom_victim - marks the given task as OOM victim.
+ * @tsk: task to mark
+ *
+ * Has to be called with oom_sem taken for read and never after
+ * oom has been disabled already.
+ */
+void mark_tsk_oom_victim(struct task_struct *tsk)
{
- return atomic_read(&oom_kills);
+ WARN_ON(oom_killer_disabled);
+ /* OOM killer might race with memcg OOM */
+ if (test_and_set_tsk_thread_flag(tsk, TIF_MEMDIE))
+ return;
+ /*
+ * Make sure that the task is woken up from uninterruptible sleep
+ * if it is frozen because OOM killer wouldn't be able to free
+ * any memory and livelock. freezing_slow_path will tell the freezer
+ * that TIF_MEMDIE tasks should be ignored.
+ */
+ __thaw_task(tsk);
+ atomic_inc(&oom_victims);
+}
+
+/**
+ * unmark_oom_victim - unmarks the current task as OOM victim.
+ *
+ * Wakes up all waiters in oom_killer_disable()
+ */
+void unmark_oom_victim(void)
+{
+ if (!test_and_clear_thread_flag(TIF_MEMDIE))
+ return;
+
+ down_read(&oom_sem);
+ /*
+ * There is no need to signal the last oom_victim if there
+ * is nobody who cares.
+ */
+ if (!atomic_dec_return(&oom_victims) && oom_killer_disabled)
+ wake_up_all(&oom_victims_wait);
+ up_read(&oom_sem);
+}
+
+/**
+ * oom_killer_disable - disable OOM killer
+ *
+ * Forces all page allocations to fail rather than trigger OOM killer.
+ * Will block and wait until all OOM victims are killed.
+ *
+ * The function cannot be called when there are runnable user tasks because
+ * the userspace would see unexpected allocation failures as a result. Any
+ * new usage of this function should be consulted with MM people.
+ *
+ * Returns true if successful and false if the OOM killer cannot be
+ * disabled.
+ */
+bool oom_killer_disable(void)
+{
+ /*
+ * Make sure to not race with an ongoing OOM killer
+ * and that the current is not the victim.
+ */
+ down_write(&oom_sem);
+ if (test_thread_flag(TIF_MEMDIE)) {
+ up_write(&oom_sem);
+ return false;
+ }
+
+ oom_killer_disabled = true;
+ up_write(&oom_sem);
+
+ wait_event(oom_victims_wait, !atomic_read(&oom_victims));
+
+ return true;
}
-void note_oom_kill(void)
+/**
+ * oom_killer_enable - enable OOM killer
+ */
+void oom_killer_enable(void)
{
- atomic_inc(&oom_kills);
+ down_write(&oom_sem);
+ oom_killer_disabled = false;
+ up_write(&oom_sem);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
@@ -438,11 +515,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* If the task is already exiting, don't alarm the sysadmin or kill
* its children or threads, just set TIF_MEMDIE so it can die quickly
*/
- if (task_will_free_mem(p)) {
- set_tsk_thread_flag(p, TIF_MEMDIE);
+ task_lock(p);
+ if (p->mm && task_will_free_mem(p)) {
+ mark_tsk_oom_victim(p);
+ task_unlock(p);
put_task_struct(p);
return;
}
+ task_unlock(p);
if (__ratelimit(&oom_rs))
dump_header(p, gfp_mask, order, memcg, nodemask);
@@ -492,6 +572,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
/* mm cannot safely be dereferenced after task_unlock(victim) */
mm = victim->mm;
+ mark_tsk_oom_victim(victim);
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
K(get_mm_counter(victim->mm, MM_ANONPAGES)),
@@ -522,7 +603,6 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
}
rcu_read_unlock();
- set_tsk_thread_flag(victim, TIF_MEMDIE);
do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
put_task_struct(victim);
}
@@ -611,7 +691,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
}
/**
- * out_of_memory - kill the "best" process when we run out of memory
+ * __out_of_memory - kill the "best" process when we run out of memory
* @zonelist: zonelist pointer
* @gfp_mask: memory allocation flags
* @order: amount of memory being requested as a power of 2
@@ -623,7 +703,7 @@ void oom_zonelist_unlock(struct zonelist *zonelist, gfp_t gfp_mask)
* OR try to be smart about which process to kill. Note that we
* don't have to be perfect here, we just have to be good.
*/
-void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+static void __out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask, bool force_kill)
{
const nodemask_t *mpol_mask;
@@ -643,9 +723,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
* If current has a pending SIGKILL or is exiting, then automatically
* select it. The goal is to allow it to allocate so that it may
* quickly exit and free its memory.
+ *
+ * But don't select if current has already released its mm and cleared
+ * TIF_MEMDIE flag at exit_mm(), otherwise an OOM livelock may occur.
*/
- if (fatal_signal_pending(current) || task_will_free_mem(current)) {
- set_thread_flag(TIF_MEMDIE);
+ if (current->mm &&
+ (fatal_signal_pending(current) || task_will_free_mem(current))) {
+ mark_tsk_oom_victim(current);
return;
}
@@ -688,6 +772,32 @@ out:
schedule_timeout_killable(1);
}
+/**
+ * out_of_memory - tries to invoke OOM killer.
+ * @zonelist: zonelist pointer
+ * @gfp_mask: memory allocation flags
+ * @order: amount of memory being requested as a power of 2
+ * @nodemask: nodemask passed to page allocator
+ * @force_kill: true if a task must be killed, even if others are exiting
+ *
+ * Invokes __out_of_memory() and returns true, unless the OOM killer has
+ * been disabled by oom_killer_disable(), in which case it returns false.
+ */
+bool out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
+ int order, nodemask_t *nodemask, bool force_kill)
+{
+ bool ret = false;
+
+ down_read(&oom_sem);
+ if (!oom_killer_disabled) {
+ __out_of_memory(zonelist, gfp_mask, order, nodemask, force_kill);
+ ret = true;
+ }
+ up_read(&oom_sem);
+
+ return ret;
+}
+
/*
* The pagefault handler calls here because it is out of memory, so kill a
* memory-hogging task. If any populated zone has ZONE_OOM_LOCKED set, a
@@ -697,12 +807,25 @@ void pagefault_out_of_memory(void)
{
struct zonelist *zonelist;
+ down_read(&oom_sem);
if (mem_cgroup_oom_synchronize(true))
- return;
+ goto unlock;
zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
if (oom_zonelist_trylock(zonelist, GFP_KERNEL)) {
- out_of_memory(NULL, 0, 0, NULL, false);
+ if (!oom_killer_disabled)
+ __out_of_memory(NULL, 0, 0, NULL, false);
+ else
+ /*
+ * There shouldn't be any user tasks runnable while the
+ * OOM killer is disabled so the current task has to
+ * be a racing OOM victim that oom_killer_disable()
+ * is waiting for.
+ */
+ WARN_ON(test_thread_flag(TIF_MEMDIE));
+
oom_zonelist_unlock(zonelist, GFP_KERNEL);
}
+unlock:
+ up_read(&oom_sem);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6f43352..45e187b 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1351,7 +1351,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long task_ratelimit;
unsigned long dirty_ratelimit;
unsigned long pos_ratio;
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
bool strictlimit = bdi->capabilities & BDI_CAP_STRICTLIMIT;
unsigned long start_time = jiffies;
@@ -1574,7 +1574,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0;
*/
void balance_dirty_pages_ratelimited(struct address_space *mapping)
{
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
int ratelimit;
int *p;
@@ -1929,7 +1929,7 @@ continue_unlock:
if (!clear_page_dirty_for_io(page))
goto continue_unlock;
- trace_wbc_writepage(wbc, mapping->backing_dev_info);
+ trace_wbc_writepage(wbc, inode_to_bdi(mapping->host));
ret = (*writepage)(page, wbc, data);
if (unlikely(ret)) {
if (ret == AOP_WRITEPAGE_ACTIVATE) {
@@ -2094,10 +2094,12 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
trace_writeback_dirty_page(page, mapping);
if (mapping_cap_account_dirty(mapping)) {
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
+
__inc_zone_page_state(page, NR_FILE_DIRTY);
__inc_zone_page_state(page, NR_DIRTIED);
- __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
- __inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ __inc_bdi_stat(bdi, BDI_RECLAIMABLE);
+ __inc_bdi_stat(bdi, BDI_DIRTIED);
task_io_account_write(PAGE_CACHE_SIZE);
current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits);
@@ -2156,7 +2158,7 @@ void account_page_redirty(struct page *page)
if (mapping && mapping_cap_account_dirty(mapping)) {
current->nr_dirtied--;
dec_zone_page_state(page, NR_DIRTIED);
- dec_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
+ dec_bdi_stat(inode_to_bdi(mapping->host), BDI_DIRTIED);
}
}
EXPORT_SYMBOL(account_page_redirty);
@@ -2168,9 +2170,12 @@ EXPORT_SYMBOL(account_page_redirty);
*/
int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
{
+ int ret;
+
wbc->pages_skipped++;
+ ret = __set_page_dirty_nobuffers(page);
account_page_redirty(page);
- return __set_page_dirty_nobuffers(page);
+ return ret;
}
EXPORT_SYMBOL(redirty_page_for_writepage);
@@ -2295,7 +2300,7 @@ int clear_page_dirty_for_io(struct page *page)
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(mapping->host),
BDI_RECLAIMABLE);
return 1;
}
@@ -2308,14 +2313,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
int test_clear_page_writeback(struct page *page)
{
struct address_space *mapping = page_mapping(page);
- unsigned long memcg_flags;
struct mem_cgroup *memcg;
- bool locked;
int ret;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2338,21 +2341,19 @@ int test_clear_page_writeback(struct page *page)
dec_zone_page_state(page, NR_WRITEBACK);
inc_zone_page_state(page, NR_WRITTEN);
}
- mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg);
return ret;
}
int __test_set_page_writeback(struct page *page, bool keep_write)
{
struct address_space *mapping = page_mapping(page);
- unsigned long memcg_flags;
struct mem_cgroup *memcg;
- bool locked;
int ret;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
+ memcg = mem_cgroup_begin_page_stat(page);
if (mapping) {
- struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long flags;
spin_lock_irqsave(&mapping->tree_lock, flags);
@@ -2380,7 +2381,7 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
inc_zone_page_state(page, NR_WRITEBACK);
}
- mem_cgroup_end_page_stat(memcg, &locked, &memcg_flags);
+ mem_cgroup_end_page_stat(memcg);
return ret;
}
@@ -2406,12 +2407,7 @@ EXPORT_SYMBOL(mapping_tagged);
*/
void wait_for_stable_page(struct page *page)
{
- struct address_space *mapping = page_mapping(page);
- struct backing_dev_info *bdi = mapping->backing_dev_info;
-
- if (!bdi_cap_stable_pages_required(bdi))
- return;
-
- wait_on_page_writeback(page);
+ if (bdi_cap_stable_pages_required(inode_to_bdi(page->mapping->host)))
+ wait_on_page_writeback(page);
}
EXPORT_SYMBOL_GPL(wait_for_stable_page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 8e20f9c..a47f0b2 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
+#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
@@ -172,7 +173,7 @@ static void __free_pages_ok(struct page *page, unsigned int order);
* 1G machine -> (16M dma, 784M normal, 224M high)
* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
- * HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
+ * HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
*
* TBD: should special case ZONE_DMA32 machines here - in those we normally
* don't need any ZONE_NORMAL reservation
@@ -244,8 +245,6 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
PB_migrate, PB_migrate_end);
}
-bool oom_killer_disabled __read_mostly;
-
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
@@ -381,36 +380,6 @@ void prep_compound_page(struct page *page, unsigned long order)
}
}
-/* update __split_huge_page_refcount if you change this function */
-static int destroy_compound_page(struct page *page, unsigned long order)
-{
- int i;
- int nr_pages = 1 << order;
- int bad = 0;
-
- if (unlikely(compound_order(page) != order)) {
- bad_page(page, "wrong compound order", 0);
- bad++;
- }
-
- __ClearPageHead(page);
-
- for (i = 1; i < nr_pages; i++) {
- struct page *p = page + i;
-
- if (unlikely(!PageTail(p))) {
- bad_page(page, "PageTail not set", 0);
- bad++;
- } else if (unlikely(p->first_page != page)) {
- bad_page(page, "first_page not consistent", 0);
- bad++;
- }
- __ClearPageTail(p);
- }
-
- return bad;
-}
-
static inline void prep_zero_page(struct page *page, unsigned int order,
gfp_t gfp_flags)
{
@@ -552,17 +521,15 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
return 0;
if (page_is_guard(buddy) && page_order(buddy) == order) {
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
return 1;
}
if (PageBuddy(buddy) && page_order(buddy) == order) {
- VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
-
/*
* zone check is done late to avoid uselessly
* calculating zone/node ids for pages that could
@@ -571,6 +538,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy,
if (page_zone_id(page) != page_zone_id(buddy))
return 0;
+ VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy);
+
return 1;
}
return 0;
@@ -613,10 +582,7 @@ static inline void __free_one_page(struct page *page,
int max_order = MAX_ORDER;
VM_BUG_ON(!zone_is_initialized(zone));
-
- if (unlikely(PageCompound(page)))
- if (unlikely(destroy_compound_page(page, order)))
- return;
+ VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
VM_BUG_ON(migratetype == -1);
if (is_migrate_isolate(migratetype)) {
@@ -797,21 +763,41 @@ static void free_one_page(struct zone *zone,
spin_unlock(&zone->lock);
}
+static int free_tail_pages_check(struct page *head_page, struct page *page)
+{
+ if (!IS_ENABLED(CONFIG_DEBUG_VM))
+ return 0;
+ if (unlikely(!PageTail(page))) {
+ bad_page(page, "PageTail not set", 0);
+ return 1;
+ }
+ if (unlikely(page->first_page != head_page)) {
+ bad_page(page, "first_page not consistent", 0);
+ return 1;
+ }
+ return 0;
+}
+
static bool free_pages_prepare(struct page *page, unsigned int order)
{
- int i;
- int bad = 0;
+ bool compound = PageCompound(page);
+ int i, bad = 0;
VM_BUG_ON_PAGE(PageTail(page), page);
- VM_BUG_ON_PAGE(PageHead(page) && compound_order(page) != order, page);
+ VM_BUG_ON_PAGE(compound && compound_order(page) != order, page);
trace_mm_page_free(page, order);
kmemcheck_free_shadow(page, order);
+ kasan_free_pages(page, order);
if (PageAnon(page))
page->mapping = NULL;
- for (i = 0; i < (1 << order); i++)
+ bad += free_pages_check(page);
+ for (i = 1; i < (1 << order); i++) {
+ if (compound)
+ bad += free_tail_pages_check(page, page + i);
bad += free_pages_check(page + i);
+ }
if (bad)
return false;
@@ -970,7 +956,8 @@ static inline int check_new_page(struct page *page)
return 0;
}
-static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
+static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
+ int alloc_flags)
{
int i;
@@ -985,6 +972,7 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
+ kasan_alloc_pages(page, order);
if (gfp_flags & __GFP_ZERO)
prep_zero_page(page, order, gfp_flags);
@@ -994,6 +982,14 @@ static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags)
set_page_owner(page, order, gfp_flags);
+ /*
+ * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was necessary to
+ * allocate the page. The expectation is that the caller is taking
+ * steps that will free more memory. The caller should avoid the page
+ * being used for !PFMEMALLOC purposes.
+ */
+ page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
+
return 0;
}
@@ -1130,39 +1126,34 @@ static void change_pageblock_range(struct page *pageblock_page,
}
/*
- * If breaking a large block of pages, move all free pages to the preferred
- * allocation list. If falling back for a reclaimable kernel allocation, be
- * more aggressive about taking ownership of free pages.
+ * When we are falling back to another migratetype during allocation, try to
+ * steal extra free pages from the same pageblocks to satisfy further
+ * allocations, instead of polluting multiple pageblocks.
*
- * On the other hand, never change migration type of MIGRATE_CMA pageblocks
- * nor move CMA pages to different free lists. We don't want unmovable pages
- * to be allocated from MIGRATE_CMA areas.
+ * If we are stealing a relatively large buddy page, it is likely there will
+ * be more free pages in the pageblock, so try to steal them all. For
+ * reclaimable and unmovable allocations, we steal regardless of page size,
+ * as fragmentation caused by those allocations polluting movable pageblocks
+ * is worse than movable allocations stealing from unmovable and reclaimable
+ * pageblocks.
*
- * Returns the new migratetype of the pageblock (or the same old migratetype
- * if it was unchanged).
+ * If we claim more than half of the pageblock, change pageblock's migratetype
+ * as well.
*/
-static int try_to_steal_freepages(struct zone *zone, struct page *page,
+static void try_to_steal_freepages(struct zone *zone, struct page *page,
int start_type, int fallback_type)
{
int current_order = page_order(page);
- /*
- * When borrowing from MIGRATE_CMA, we need to release the excess
- * buddy pages to CMA itself. We also ensure the freepage_migratetype
- * is set to CMA so it is returned to the correct freelist in case
- * the page ends up being not actually allocated from the pcp lists.
- */
- if (is_migrate_cma(fallback_type))
- return fallback_type;
-
/* Take ownership for orders >= pageblock_order */
if (current_order >= pageblock_order) {
change_pageblock_range(page, current_order, start_type);
- return start_type;
+ return;
}
if (current_order >= pageblock_order / 2 ||
start_type == MIGRATE_RECLAIMABLE ||
+ start_type == MIGRATE_UNMOVABLE ||
page_group_by_mobility_disabled) {
int pages;
@@ -1170,15 +1161,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page,
/* Claim the whole block if over half of it is free */
if (pages >= (1 << (pageblock_order-1)) ||
- page_group_by_mobility_disabled) {
-
+ page_group_by_mobility_disabled)
set_pageblock_migratetype(page, start_type);
- return start_type;
- }
-
}
-
- return fallback_type;
}
/* Remove an element from the buddy allocator from the fallback list */
@@ -1188,14 +1173,15 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct free_area *area;
unsigned int current_order;
struct page *page;
- int migratetype, new_type, i;
/* Find the largest possible block of pages in the other list */
for (current_order = MAX_ORDER-1;
current_order >= order && current_order <= MAX_ORDER-1;
--current_order) {
+ int i;
for (i = 0;; i++) {
- migratetype = fallbacks[start_migratetype][i];
+ int migratetype = fallbacks[start_migratetype][i];
+ int buddy_type = start_migratetype;
/* MIGRATE_RESERVE handled later if necessary */
if (migratetype == MIGRATE_RESERVE)
@@ -1209,25 +1195,39 @@ __rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype)
struct page, lru);
area->nr_free--;
- new_type = try_to_steal_freepages(zone, page,
- start_migratetype,
- migratetype);
+ if (!is_migrate_cma(migratetype)) {
+ try_to_steal_freepages(zone, page,
+ start_migratetype,
+ migratetype);
+ } else {
+ /*
+ * When borrowing from MIGRATE_CMA, we need to
+ * release the excess buddy pages to CMA
+ * itself, and we do not try to steal extra
+ * free pages.
+ */
+ buddy_type = migratetype;
+ }
/* Remove the page from the freelists */
list_del(&page->lru);
rmv_page_order(page);
expand(zone, page, order, current_order, area,
- new_type);
- /* The freepage_migratetype may differ from pageblock's
+ buddy_type);
+
+ /*
+ * The freepage_migratetype may differ from pageblock's
* migratetype depending on the decisions in
- * try_to_steal_freepages. This is OK as long as it does
- * not differ for MIGRATE_CMA type.
+ * try_to_steal_freepages(). This is OK as long as it
+ * does not differ for MIGRATE_CMA pageblocks. For CMA
+ * we need to make sure unallocated pages flushed from
+ * pcp lists are returned to the correct freelist.
*/
- set_freepage_migratetype(page, new_type);
+ set_freepage_migratetype(page, buddy_type);
trace_mm_page_alloc_extfrag(page, order, current_order,
- start_migratetype, migratetype, new_type);
+ start_migratetype, migratetype);
return page;
}
@@ -1642,9 +1642,7 @@ int split_free_page(struct page *page)
}
/*
- * Really, prep_compound_page() should be called from __rmqueue_bulk(). But
- * we cheat by calling it from here, in the order > 0 path. Saves a branch
- * or two.
+ * Allocate a page from the given zone. Use pcplists for order-0 allocations.
*/
static inline
struct page *buffered_rmqueue(struct zone *preferred_zone,
@@ -1655,7 +1653,6 @@ struct page *buffered_rmqueue(struct zone *preferred_zone,
struct page *page;
bool cold = ((gfp_flags & __GFP_COLD) != 0);
-again:
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
struct list_head *list;
@@ -1711,8 +1708,6 @@ again:
local_irq_restore(flags);
VM_BUG_ON_PAGE(bad_range(zone, page), page);
- if (prep_new_page(page, order, gfp_flags))
- goto again;
return page;
failed:
@@ -2033,10 +2028,10 @@ static void reset_alloc_batches(struct zone *preferred_zone)
* a page.
*/
static struct page *
-get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order,
- struct zonelist *zonelist, int high_zoneidx, int alloc_flags,
- struct zone *preferred_zone, int classzone_idx, int migratetype)
+get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
+ const struct alloc_context *ac)
{
+ struct zonelist *zonelist = ac->zonelist;
struct zoneref *z;
struct page *page = NULL;
struct zone *zone;
@@ -2055,8 +2050,8 @@ zonelist_scan:
* Scan zonelist, looking for a zone with enough free.
* See also __cpuset_node_allowed() comment in kernel/cpuset.c.
*/
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ ac->nodemask) {
unsigned long mark;
if (IS_ENABLED(CONFIG_NUMA) && zlc_active &&
@@ -2073,7 +2068,7 @@ zonelist_scan:
* time the page has in memory before being reclaimed.
*/
if (alloc_flags & ALLOC_FAIR) {
- if (!zone_local(preferred_zone, zone))
+ if (!zone_local(ac->preferred_zone, zone))
break;
if (test_bit(ZONE_FAIR_DEPLETED, &zone->flags)) {
nr_fair_skipped++;
@@ -2111,7 +2106,7 @@ zonelist_scan:
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags)) {
+ ac->classzone_idx, alloc_flags)) {
int ret;
/* Checked here to keep the fast path fast */
@@ -2132,7 +2127,7 @@ zonelist_scan:
}
if (zone_reclaim_mode == 0 ||
- !zone_allows_reclaim(preferred_zone, zone))
+ !zone_allows_reclaim(ac->preferred_zone, zone))
goto this_zone_full;
/*
@@ -2154,7 +2149,7 @@ zonelist_scan:
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags))
+ ac->classzone_idx, alloc_flags))
goto try_this_zone;
/*
@@ -2175,27 +2170,18 @@ zonelist_scan:
}
try_this_zone:
- page = buffered_rmqueue(preferred_zone, zone, order,
- gfp_mask, migratetype);
- if (page)
- break;
+ page = buffered_rmqueue(ac->preferred_zone, zone, order,
+ gfp_mask, ac->migratetype);
+ if (page) {
+ if (prep_new_page(page, order, gfp_mask, alloc_flags))
+ goto try_this_zone;
+ return page;
+ }
this_zone_full:
if (IS_ENABLED(CONFIG_NUMA) && zlc_active)
zlc_mark_zone_full(zonelist, z);
}
- if (page) {
- /*
- * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was
- * necessary to allocate the page. The expectation is
- * that the caller is taking steps that will free more
- * memory. The caller should avoid the page being used
- * for !PFMEMALLOC purposes.
- */
- page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS);
- return page;
- }
-
/*
* The first pass makes sure allocations are spread fairly within the
* local node. However, the local node might have free pages left
@@ -2208,7 +2194,7 @@ this_zone_full:
alloc_flags &= ~ALLOC_FAIR;
if (nr_fair_skipped) {
zonelist_rescan = true;
- reset_alloc_batches(preferred_zone);
+ reset_alloc_batches(ac->preferred_zone);
}
if (nr_online_nodes > 1)
zonelist_rescan = true;
@@ -2330,44 +2316,29 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order,
static inline struct page *
__alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype, unsigned long *did_some_progress)
+ const struct alloc_context *ac, unsigned long *did_some_progress)
{
struct page *page;
*did_some_progress = 0;
- if (oom_killer_disabled)
- return NULL;
-
/*
* Acquire the per-zone oom lock for each zone. If that
* fails, somebody else is making progress for us.
*/
- if (!oom_zonelist_trylock(zonelist, gfp_mask)) {
+ if (!oom_zonelist_trylock(ac->zonelist, gfp_mask)) {
*did_some_progress = 1;
schedule_timeout_uninterruptible(1);
return NULL;
}
/*
- * PM-freezer should be notified that there might be an OOM killer on
- * its way to kill and wake somebody up. This is too early and we might
- * end up not killing anything but false positives are acceptable.
- * See freeze_processes.
- */
- note_oom_kill();
-
- /*
* Go through the zonelist yet one more time, keep very high watermark
* here, this is only to catch a parallel oom killing, we must fail if
* we're still under heavy pressure.
*/
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask,
- order, zonelist, high_zoneidx,
- ALLOC_WMARK_HIGH|ALLOC_CPUSET,
- preferred_zone, classzone_idx, migratetype);
+ page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
+ ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
if (page)
goto out;
@@ -2379,7 +2350,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (order > PAGE_ALLOC_COSTLY_ORDER)
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
- if (high_zoneidx < ZONE_NORMAL)
+ if (ac->high_zoneidx < ZONE_NORMAL)
goto out;
/* The OOM killer does not compensate for light reclaim */
if (!(gfp_mask & __GFP_FS))
@@ -2395,10 +2366,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
goto out;
}
/* Exhausted what can be done so it's blamo time */
- out_of_memory(zonelist, gfp_mask, order, nodemask, false);
- *did_some_progress = 1;
+ if (out_of_memory(ac->zonelist, gfp_mask, order, ac->nodemask, false))
+ *did_some_progress = 1;
out:
- oom_zonelist_unlock(zonelist, gfp_mask);
+ oom_zonelist_unlock(ac->zonelist, gfp_mask);
return page;
}
@@ -2406,10 +2377,9 @@ out:
/* Try memory compaction for high-order allocations before reclaim */
static struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype, enum migrate_mode mode,
- int *contended_compaction, bool *deferred_compaction)
+ int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended_compaction,
+ bool *deferred_compaction)
{
unsigned long compact_result;
struct page *page;
@@ -2418,10 +2388,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
return NULL;
current->flags |= PF_MEMALLOC;
- compact_result = try_to_compact_pages(zonelist, order, gfp_mask,
- nodemask, mode,
- contended_compaction,
- alloc_flags, classzone_idx);
+ compact_result = try_to_compact_pages(gfp_mask, order, alloc_flags, ac,
+ mode, contended_compaction);
current->flags &= ~PF_MEMALLOC;
switch (compact_result) {
@@ -2440,10 +2408,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
*/
count_vm_event(COMPACTSTALL);
- page = get_page_from_freelist(gfp_mask, nodemask,
- order, zonelist, high_zoneidx,
- alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
if (page) {
struct zone *zone = page_zone(page);
@@ -2467,10 +2433,9 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
#else
static inline struct page *
__alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype, enum migrate_mode mode,
- int *contended_compaction, bool *deferred_compaction)
+ int alloc_flags, const struct alloc_context *ac,
+ enum migrate_mode mode, int *contended_compaction,
+ bool *deferred_compaction)
{
return NULL;
}
@@ -2478,8 +2443,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
/* Perform direct synchronous page reclaim */
static int
-__perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
- nodemask_t *nodemask)
+__perform_reclaim(gfp_t gfp_mask, unsigned int order,
+ const struct alloc_context *ac)
{
struct reclaim_state reclaim_state;
int progress;
@@ -2493,7 +2458,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
reclaim_state.reclaimed_slab = 0;
current->reclaim_state = &reclaim_state;
- progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask);
+ progress = try_to_free_pages(ac->zonelist, order, gfp_mask,
+ ac->nodemask);
current->reclaim_state = NULL;
lockdep_clear_current_reclaim_state();
@@ -2507,28 +2473,23 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist,
/* The really slow allocator path where we enter direct reclaim */
static inline struct page *
__alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone,
- int classzone_idx, int migratetype, unsigned long *did_some_progress)
+ int alloc_flags, const struct alloc_context *ac,
+ unsigned long *did_some_progress)
{
struct page *page = NULL;
bool drained = false;
- *did_some_progress = __perform_reclaim(gfp_mask, order, zonelist,
- nodemask);
+ *did_some_progress = __perform_reclaim(gfp_mask, order, ac);
if (unlikely(!(*did_some_progress)))
return NULL;
/* After successful reclaim, reconsider all zones for allocation */
if (IS_ENABLED(CONFIG_NUMA))
- zlc_clear_zones_full(zonelist);
+ zlc_clear_zones_full(ac->zonelist);
retry:
- page = get_page_from_freelist(gfp_mask, nodemask, order,
- zonelist, high_zoneidx,
- alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx,
- migratetype);
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
/*
* If an allocation failed after direct reclaim, it could be because
@@ -2549,36 +2510,30 @@ retry:
*/
static inline struct page *
__alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ const struct alloc_context *ac)
{
struct page *page;
do {
- page = get_page_from_freelist(gfp_mask, nodemask, order,
- zonelist, high_zoneidx, ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ page = get_page_from_freelist(gfp_mask, order,
+ ALLOC_NO_WATERMARKS, ac);
if (!page && gfp_mask & __GFP_NOFAIL)
- wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC,
+ HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
}
-static void wake_all_kswapds(unsigned int order,
- struct zonelist *zonelist,
- enum zone_type high_zoneidx,
- struct zone *preferred_zone,
- nodemask_t *nodemask)
+static void wake_all_kswapds(unsigned int order, const struct alloc_context *ac)
{
struct zoneref *z;
struct zone *zone;
- for_each_zone_zonelist_nodemask(zone, z, zonelist,
- high_zoneidx, nodemask)
- wakeup_kswapd(zone, order, zone_idx(preferred_zone));
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->high_zoneidx, ac->nodemask)
+ wakeup_kswapd(zone, order, zone_idx(ac->preferred_zone));
}
static inline int
@@ -2637,9 +2592,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
static inline struct page *
__alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
- struct zonelist *zonelist, enum zone_type high_zoneidx,
- nodemask_t *nodemask, struct zone *preferred_zone,
- int classzone_idx, int migratetype)
+ struct alloc_context *ac)
{
const gfp_t wait = gfp_mask & __GFP_WAIT;
struct page *page = NULL;
@@ -2675,8 +2628,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
retry:
if (!(gfp_mask & __GFP_NO_KSWAPD))
- wake_all_kswapds(order, zonelist, high_zoneidx,
- preferred_zone, nodemask);
+ wake_all_kswapds(order, ac);
/*
* OK, we're below the kswapd watermark and have kicked background
@@ -2689,17 +2641,16 @@ retry:
* Find the true preferred zone if the allocation is unconstrained by
* cpusets.
*/
- if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) {
+ if (!(alloc_flags & ALLOC_CPUSET) && !ac->nodemask) {
struct zoneref *preferred_zoneref;
- preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
- NULL, &preferred_zone);
- classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ preferred_zoneref = first_zones_zonelist(ac->zonelist,
+ ac->high_zoneidx, NULL, &ac->preferred_zone);
+ ac->classzone_idx = zonelist_zone_idx(preferred_zoneref);
}
/* This is the last chance, in general, before the goto nopage. */
- page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
- high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
- preferred_zone, classzone_idx, migratetype);
+ page = get_page_from_freelist(gfp_mask, order,
+ alloc_flags & ~ALLOC_NO_WATERMARKS, ac);
if (page)
goto got_pg;
@@ -2710,11 +2661,10 @@ retry:
* the allocation is high priority and these type of
* allocations are system rather than user orientated
*/
- zonelist = node_zonelist(numa_node_id(), gfp_mask);
+ ac->zonelist = node_zonelist(numa_node_id(), gfp_mask);
+
+ page = __alloc_pages_high_priority(gfp_mask, order, ac);
- page = __alloc_pages_high_priority(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, classzone_idx, migratetype);
if (page) {
goto got_pg;
}
@@ -2743,11 +2693,9 @@ retry:
* Try direct compaction. The first pass is asynchronous. Subsequent
* attempts after direct reclaim are synchronous
*/
- page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
- high_zoneidx, nodemask, alloc_flags,
- preferred_zone,
- classzone_idx, migratetype,
- migration_mode, &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
+ migration_mode,
+ &contended_compaction,
&deferred_compaction);
if (page)
goto got_pg;
@@ -2793,12 +2741,8 @@ retry:
migration_mode = MIGRATE_SYNC_LIGHT;
/* Try direct reclaim and then allocating */
- page = __alloc_pages_direct_reclaim(gfp_mask, order,
- zonelist, high_zoneidx,
- nodemask,
- alloc_flags, preferred_zone,
- classzone_idx, migratetype,
- &did_some_progress);
+ page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
+ &did_some_progress);
if (page)
goto got_pg;
@@ -2812,17 +2756,15 @@ retry:
* start OOM killing tasks.
*/
if (!did_some_progress) {
- page = __alloc_pages_may_oom(gfp_mask, order, zonelist,
- high_zoneidx, nodemask,
- preferred_zone, classzone_idx,
- migratetype,&did_some_progress);
+ page = __alloc_pages_may_oom(gfp_mask, order, ac,
+ &did_some_progress);
if (page)
goto got_pg;
if (!did_some_progress)
goto nopage;
}
/* Wait for some write requests to complete then retry */
- wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50);
+ wait_iff_congested(ac->preferred_zone, BLK_RW_ASYNC, HZ/50);
goto retry;
} else {
/*
@@ -2830,11 +2772,9 @@ retry:
* direct reclaim and reclaim/compaction depends on compaction
* being called after reclaim so call directly if necessary
*/
- page = __alloc_pages_direct_compact(gfp_mask, order, zonelist,
- high_zoneidx, nodemask, alloc_flags,
- preferred_zone,
- classzone_idx, migratetype,
- migration_mode, &contended_compaction,
+ page = __alloc_pages_direct_compact(gfp_mask, order,
+ alloc_flags, ac, migration_mode,
+ &contended_compaction,
&deferred_compaction);
if (page)
goto got_pg;
@@ -2842,11 +2782,7 @@ retry:
nopage:
warn_alloc_failed(gfp_mask, order, NULL);
- return page;
got_pg:
- if (kmemcheck_enabled)
- kmemcheck_pagealloc_alloc(page, order, gfp_mask);
-
return page;
}
@@ -2857,14 +2793,16 @@ struct page *
__alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, nodemask_t *nodemask)
{
- enum zone_type high_zoneidx = gfp_zone(gfp_mask);
- struct zone *preferred_zone;
struct zoneref *preferred_zoneref;
struct page *page = NULL;
- int migratetype = gfpflags_to_migratetype(gfp_mask);
unsigned int cpuset_mems_cookie;
int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
- int classzone_idx;
+ gfp_t alloc_mask; /* The gfp_t that was actually used for allocation */
+ struct alloc_context ac = {
+ .high_zoneidx = gfp_zone(gfp_mask),
+ .nodemask = nodemask,
+ .migratetype = gfpflags_to_migratetype(gfp_mask),
+ };
gfp_mask &= gfp_allowed_mask;
@@ -2883,37 +2821,40 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
if (unlikely(!zonelist->_zonerefs->zone))
return NULL;
- if (IS_ENABLED(CONFIG_CMA) && migratetype == MIGRATE_MOVABLE)
+ if (IS_ENABLED(CONFIG_CMA) && ac.migratetype == MIGRATE_MOVABLE)
alloc_flags |= ALLOC_CMA;
retry_cpuset:
cpuset_mems_cookie = read_mems_allowed_begin();
+ /* We set it here, as __alloc_pages_slowpath might have changed it */
+ ac.zonelist = zonelist;
/* The preferred zone is used for statistics later */
- preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx,
- nodemask ? : &cpuset_current_mems_allowed,
- &preferred_zone);
- if (!preferred_zone)
+ preferred_zoneref = first_zones_zonelist(ac.zonelist, ac.high_zoneidx,
+ ac.nodemask ? : &cpuset_current_mems_allowed,
+ &ac.preferred_zone);
+ if (!ac.preferred_zone)
goto out;
- classzone_idx = zonelist_zone_idx(preferred_zoneref);
+ ac.classzone_idx = zonelist_zone_idx(preferred_zoneref);
/* First allocation attempt */
- page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
- zonelist, high_zoneidx, alloc_flags,
- preferred_zone, classzone_idx, migratetype);
+ alloc_mask = gfp_mask|__GFP_HARDWALL;
+ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
if (unlikely(!page)) {
/*
* Runtime PM, block IO and its error handling path
* can deadlock because I/O on the device might not
* complete.
*/
- gfp_mask = memalloc_noio_flags(gfp_mask);
- page = __alloc_pages_slowpath(gfp_mask, order,
- zonelist, high_zoneidx, nodemask,
- preferred_zone, classzone_idx, migratetype);
+ alloc_mask = memalloc_noio_flags(gfp_mask);
+
+ page = __alloc_pages_slowpath(alloc_mask, order, &ac);
}
- trace_mm_page_alloc(page, order, gfp_mask, migratetype);
+ if (kmemcheck_enabled && page)
+ kmemcheck_pagealloc_alloc(page, order, gfp_mask);
+
+ trace_mm_page_alloc(page, order, alloc_mask, ac.migratetype);
out:
/*
@@ -3933,18 +3874,29 @@ static int __build_all_zonelists(void *data)
return 0;
}
+static noinline void __init
+build_all_zonelists_init(void)
+{
+ __build_all_zonelists(NULL);
+ mminit_verify_zonelist();
+ cpuset_init_current_mems_allowed();
+}
+
/*
* Called with zonelists_mutex held always
* unless system_state == SYSTEM_BOOTING.
+ *
+ * __ref due to (1) call of __meminit annotated setup_zone_pageset
+ * [we're only called with non-NULL zone through __meminit paths] and
+ * (2) call of __init annotated helper build_all_zonelists_init
+ * [protected by SYSTEM_BOOTING].
*/
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
{
set_zonelist_order();
if (system_state == SYSTEM_BOOTING) {
- __build_all_zonelists(NULL);
- mminit_verify_zonelist();
- cpuset_init_current_mems_allowed();
+ build_all_zonelists_init();
} else {
#ifdef CONFIG_MEMORY_HOTPLUG
if (zone)
@@ -5047,8 +4999,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_start_pfn = node_start_pfn;
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
- printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", nid,
- (u64) start_pfn << PAGE_SHIFT, (u64) (end_pfn << PAGE_SHIFT) - 1);
+ pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT, ((u64)end_pfn << PAGE_SHIFT) - 1);
#endif
calculate_node_totalpages(pgdat, start_pfn, end_pfn,
zones_size, zholes_size);
@@ -5420,9 +5372,10 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
arch_zone_highest_possible_pfn[i])
pr_cont("empty\n");
else
- pr_cont("[mem %0#10lx-%0#10lx]\n",
- arch_zone_lowest_possible_pfn[i] << PAGE_SHIFT,
- (arch_zone_highest_possible_pfn[i]
+ pr_cont("[mem %#018Lx-%#018Lx]\n",
+ (u64)arch_zone_lowest_possible_pfn[i]
+ << PAGE_SHIFT,
+ ((u64)arch_zone_highest_possible_pfn[i]
<< PAGE_SHIFT) - 1);
}
@@ -5430,15 +5383,16 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
pr_info("Movable zone start for each node\n");
for (i = 0; i < MAX_NUMNODES; i++) {
if (zone_movable_pfn[i])
- pr_info(" Node %d: %#010lx\n", i,
- zone_movable_pfn[i] << PAGE_SHIFT);
+ pr_info(" Node %d: %#018Lx\n", i,
+ (u64)zone_movable_pfn[i] << PAGE_SHIFT);
}
/* Print out the early node map */
pr_info("Early memory node ranges\n");
for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, &nid)
- pr_info(" node %3d: [mem %#010lx-%#010lx]\n", nid,
- start_pfn << PAGE_SHIFT, (end_pfn << PAGE_SHIFT) - 1);
+ pr_info(" node %3d: [mem %#018Lx-%#018Lx]\n", nid,
+ (u64)start_pfn << PAGE_SHIFT,
+ ((u64)end_pfn << PAGE_SHIFT) - 1);
/* Initialise every node */
mminit_verify_pageflags_layout();
diff --git a/mm/page_counter.c b/mm/page_counter.c
index a009574..11b4bed 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -166,18 +166,19 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit)
/**
* page_counter_memparse - memparse() for page counter limits
* @buf: string to parse
+ * @max: string meaning maximum possible value
* @nr_pages: returns the result in number of pages
*
* Returns -EINVAL, or 0 and @nr_pages on success. @nr_pages will be
* limited to %PAGE_COUNTER_MAX.
*/
-int page_counter_memparse(const char *buf, unsigned long *nr_pages)
+int page_counter_memparse(const char *buf, const char *max,
+ unsigned long *nr_pages)
{
- char unlimited[] = "-1";
char *end;
u64 bytes;
- if (!strncmp(buf, unlimited, sizeof(unlimited))) {
+ if (!strcmp(buf, max)) {
*nr_pages = PAGE_COUNTER_MAX;
return 0;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 955db8b..e604580 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -269,14 +269,9 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
.bv_len = PAGE_SIZE,
.bv_offset = 0
};
- struct iov_iter from = {
- .type = ITER_BVEC | WRITE,
- .count = PAGE_SIZE,
- .iov_offset = 0,
- .nr_segs = 1,
- };
- from.bvec = &bv; /* older gcc versions are broken */
+ struct iov_iter from;
+ iov_iter_bvec(&from, ITER_BVEC | WRITE, &bv, 1, PAGE_SIZE);
init_sync_kiocb(&kiocb, swap_file);
kiocb.ki_pos = page_file_offset(page);
kiocb.ki_nbytes = PAGE_SIZE;
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 9ab4a9b..0993f5f 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -59,20 +59,19 @@ void __reset_page_owner(struct page *page, unsigned int order)
void __set_page_owner(struct page *page, unsigned int order, gfp_t gfp_mask)
{
- struct page_ext *page_ext;
- struct stack_trace *trace;
-
- page_ext = lookup_page_ext(page);
+ struct page_ext *page_ext = lookup_page_ext(page);
+ struct stack_trace trace = {
+ .nr_entries = 0,
+ .max_entries = ARRAY_SIZE(page_ext->trace_entries),
+ .entries = &page_ext->trace_entries[0],
+ .skip = 3,
+ };
- trace = &page_ext->trace;
- trace->nr_entries = 0;
- trace->max_entries = ARRAY_SIZE(page_ext->trace_entries);
- trace->entries = &page_ext->trace_entries[0];
- trace->skip = 3;
- save_stack_trace(&page_ext->trace);
+ save_stack_trace(&trace);
page_ext->order = order;
page_ext->gfp_mask = gfp_mask;
+ page_ext->nr_entries = trace.nr_entries;
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
}
@@ -84,6 +83,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
int ret;
int pageblock_mt, page_mt;
char *kbuf;
+ struct stack_trace trace = {
+ .nr_entries = page_ext->nr_entries,
+ .entries = &page_ext->trace_entries[0],
+ };
kbuf = kmalloc(count, GFP_KERNEL);
if (!kbuf)
@@ -121,8 +124,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
if (ret >= count)
goto err;
- ret += snprint_stack_trace(kbuf + ret, count - ret,
- &page_ext->trace, 0);
+ ret += snprint_stack_trace(kbuf + ret, count - ret, &trace, 0);
if (ret >= count)
goto err;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index b264bda..75c1f28 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
do {
again:
next = pmd_addr_end(addr, end);
- if (pmd_none(*pmd)) {
+ if (pmd_none(*pmd) || !walk->vma) {
if (walk->pte_hole)
err = walk->pte_hole(addr, next, walk);
if (err)
@@ -59,7 +59,7 @@ again:
continue;
split_huge_page_pmd_mm(walk->mm, addr, pmd);
- if (pmd_none_or_trans_huge_or_clear_bad(pmd))
+ if (pmd_trans_unstable(pmd))
goto again;
err = walk_pte_range(pmd, addr, next, walk);
if (err)
@@ -86,9 +86,7 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
break;
continue;
}
- if (walk->pud_entry)
- err = walk->pud_entry(pud, addr, next, walk);
- if (!err && (walk->pmd_entry || walk->pte_entry))
+ if (walk->pmd_entry || walk->pte_entry)
err = walk_pmd_range(pud, addr, next, walk);
if (err)
break;
@@ -97,6 +95,32 @@ static int walk_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end,
return err;
}
+static int walk_pgd_range(unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ pgd_t *pgd;
+ unsigned long next;
+ int err = 0;
+
+ pgd = pgd_offset(walk->mm, addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd)) {
+ if (walk->pte_hole)
+ err = walk->pte_hole(addr, next, walk);
+ if (err)
+ break;
+ continue;
+ }
+ if (walk->pmd_entry || walk->pte_entry)
+ err = walk_pud_range(pgd, addr, next, walk);
+ if (err)
+ break;
+ } while (pgd++, addr = next, addr != end);
+
+ return err;
+}
+
#ifdef CONFIG_HUGETLB_PAGE
static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
unsigned long end)
@@ -105,10 +129,10 @@ static unsigned long hugetlb_entry_end(struct hstate *h, unsigned long addr,
return boundary < end ? boundary : end;
}
-static int walk_hugetlb_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
+ struct vm_area_struct *vma = walk->vma;
struct hstate *h = hstate_vma(vma);
unsigned long next;
unsigned long hmask = huge_page_mask(h);
@@ -121,15 +145,14 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
if (pte && walk->hugetlb_entry)
err = walk->hugetlb_entry(pte, hmask, addr, next, walk);
if (err)
- return err;
+ break;
} while (addr = next, addr != end);
- return 0;
+ return err;
}
#else /* CONFIG_HUGETLB_PAGE */
-static int walk_hugetlb_range(struct vm_area_struct *vma,
- unsigned long addr, unsigned long end,
+static int walk_hugetlb_range(unsigned long addr, unsigned long end,
struct mm_walk *walk)
{
return 0;
@@ -137,115 +160,138 @@ static int walk_hugetlb_range(struct vm_area_struct *vma,
#endif /* CONFIG_HUGETLB_PAGE */
+/*
+ * Decide whether we really walk over the current vma on [@start, @end)
+ * or skip it via the returned value. Return 0 if we do walk over the
+ * current vma, and return 1 if we skip the vma. A negative value means
+ * an error, in which case we abort the current walk.
+ */
+static int walk_page_test(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct vm_area_struct *vma = walk->vma;
+
+ if (walk->test_walk)
+ return walk->test_walk(start, end, walk);
+
+ /*
+ * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
+ * range, so we don't walk over it as we do for normal vmas. However,
+ * some callers are interested in handling hole ranges and they don't
+ * want to just ignore any single address range. Such users certainly
+ * define their ->pte_hole() callbacks, so let's delegate handling of
+ * vma(VM_PFNMAP) to them.
+ */
+ if (vma->vm_flags & VM_PFNMAP) {
+ int err = 1;
+ if (walk->pte_hole)
+ err = walk->pte_hole(start, end, walk);
+ return err ? err : 1;
+ }
+ return 0;
+}
+
+static int __walk_page_range(unsigned long start, unsigned long end,
+ struct mm_walk *walk)
+{
+ int err = 0;
+ struct vm_area_struct *vma = walk->vma;
+
+ if (vma && is_vm_hugetlb_page(vma)) {
+ if (walk->hugetlb_entry)
+ err = walk_hugetlb_range(start, end, walk);
+ } else
+ err = walk_pgd_range(start, end, walk);
+ return err;
+}
/**
- * walk_page_range - walk a memory map's page tables with a callback
- * @addr: starting address
- * @end: ending address
- * @walk: set of callbacks to invoke for each level of the tree
+ * walk_page_range - walk page table with caller specific callbacks
*
- * Recursively walk the page table for the memory area in a VMA,
- * calling supplied callbacks. Callbacks are called in-order (first
- * PGD, first PUD, first PMD, first PTE, second PTE... second PMD,
- * etc.). If lower-level callbacks are omitted, walking depth is reduced.
+ * Recursively walk the page table tree of the process represented by @walk->mm
+ * within the virtual address range [@start, @end). During the walk, we can do
+ * some caller-specific work for each entry, by setting up pmd_entry(),
+ * pte_entry(), and/or hugetlb_entry(). If you don't set up some of these
+ * callbacks, the associated entries/pages are just ignored.
+ * The return values of these callbacks are commonly defined as follows:
+ * - 0 : succeeded in handling the current entry, and if the end address
+ * has not been reached yet, continue to walk.
+ * - >0 : succeeded in handling the current entry, and return to the caller
+ * with a caller-specific value.
+ * - <0 : failed to handle the current entry, and return to the caller
+ * with an error code.
*
- * Each callback receives an entry pointer and the start and end of the
- * associated range, and a copy of the original mm_walk for access to
- * the ->private or ->mm fields.
+ * Before starting to walk the page table, some callers want to check whether
+ * they really want to walk over the current vma, typically by checking
+ * its vm_flags. walk_page_test() and @walk->test_walk() are used for this
+ * purpose.
*
- * Usually no locks are taken, but splitting transparent huge page may
- * take page table lock. And the bottom level iterator will map PTE
- * directories from highmem if necessary.
+ * struct mm_walk keeps current values of some common data like vma and pmd,
+ * which are useful to access from callbacks. If you want to pass some
+ * caller-specific data to callbacks, @walk->private should be helpful.
*
- * If any callback returns a non-zero value, the walk is aborted and
- * the return value is propagated back to the caller. Otherwise 0 is returned.
- *
- * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry
- * is !NULL.
+ * Locking:
+ * Callers of walk_page_range() and walk_page_vma() should hold
+ * @walk->mm->mmap_sem, because these functions traverse the vma list and/or
+ * access the vma's data.
*/
-int walk_page_range(unsigned long addr, unsigned long end,
+int walk_page_range(unsigned long start, unsigned long end,
struct mm_walk *walk)
{
- pgd_t *pgd;
- unsigned long next;
int err = 0;
+ unsigned long next;
+ struct vm_area_struct *vma;
- if (addr >= end)
- return err;
+ if (start >= end)
+ return -EINVAL;
if (!walk->mm)
return -EINVAL;
VM_BUG_ON_MM(!rwsem_is_locked(&walk->mm->mmap_sem), walk->mm);
- pgd = pgd_offset(walk->mm, addr);
+ vma = find_vma(walk->mm, start);
do {
- struct vm_area_struct *vma = NULL;
-
- next = pgd_addr_end(addr, end);
+ if (!vma) { /* after the last vma */
+ walk->vma = NULL;
+ next = end;
+ } else if (start < vma->vm_start) { /* outside vma */
+ walk->vma = NULL;
+ next = min(end, vma->vm_start);
+ } else { /* inside vma */
+ walk->vma = vma;
+ next = min(end, vma->vm_end);
+ vma = vma->vm_next;
- /*
- * This function was not intended to be vma based.
- * But there are vma special cases to be handled:
- * - hugetlb vma's
- * - VM_PFNMAP vma's
- */
- vma = find_vma(walk->mm, addr);
- if (vma) {
- /*
- * There are no page structures backing a VM_PFNMAP
- * range, so do not allow split_huge_page_pmd().
- */
- if ((vma->vm_start <= addr) &&
- (vma->vm_flags & VM_PFNMAP)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
- continue;
- }
- /*
- * Handle hugetlb vma individually because pagetable
- * walk for the hugetlb page is dependent on the
- * architecture and we can't handled it in the same
- * manner as non-huge pages.
- */
- if (walk->hugetlb_entry && (vma->vm_start <= addr) &&
- is_vm_hugetlb_page(vma)) {
- if (vma->vm_end < next)
- next = vma->vm_end;
- /*
- * Hugepage is very tightly coupled with vma,
- * so walk through hugetlb entries within a
- * given vma.
- */
- err = walk_hugetlb_range(vma, addr, next, walk);
- if (err)
- break;
- pgd = pgd_offset(walk->mm, next);
+ err = walk_page_test(start, next, walk);
+ if (err > 0)
continue;
- }
- }
-
- if (pgd_none_or_clear_bad(pgd)) {
- if (walk->pte_hole)
- err = walk->pte_hole(addr, next, walk);
- if (err)
+ if (err < 0)
break;
- pgd++;
- continue;
}
- if (walk->pgd_entry)
- err = walk->pgd_entry(pgd, addr, next, walk);
- if (!err &&
- (walk->pud_entry || walk->pmd_entry || walk->pte_entry))
- err = walk_pud_range(pgd, addr, next, walk);
+ if (walk->vma || walk->pte_hole)
+ err = __walk_page_range(start, next, walk);
if (err)
break;
- pgd++;
- } while (addr = next, addr < end);
-
+ } while (start = next, start < end);
return err;
}
+
+int walk_page_vma(struct vm_area_struct *vma, struct mm_walk *walk)
+{
+ int err;
+
+ if (!walk->mm)
+ return -EINVAL;
+
+ VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem));
+ VM_BUG_ON(!vma);
+ walk->vma = vma;
+ err = walk_page_test(vma->vm_start, vma->vm_end, walk);
+ if (err > 0)
+ return 0;
+ if (err < 0)
+ return err;
+ return __walk_page_range(vma->vm_start, vma->vm_end, walk);
+}
diff --git a/mm/percpu.c b/mm/percpu.c
index d39e2f4..73c97a5 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1528,7 +1528,6 @@ static void pcpu_dump_alloc_info(const char *lvl,
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
- static char cpus_buf[4096] __initdata;
static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
size_t dyn_size = ai->dyn_size;
@@ -1541,12 +1540,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int *unit_map;
int group, unit, i;
- cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask);
-
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
pr_emerg("PERCPU: failed to initialize, %s", #cond); \
- pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \
+ pr_emerg("PERCPU: cpu_possible_mask=%*pb\n", \
+ cpumask_pr_args(cpu_possible_mask)); \
pcpu_dump_alloc_info(KERN_EMERG, ai); \
BUG(); \
} \
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index dfb79e0..c25f94b 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -193,8 +193,6 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
pmd_t entry = *pmdp;
- if (pmd_numa(entry))
- entry = pmd_mknonnuma(entry);
set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry));
flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 5077afc..b159769 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -99,11 +99,8 @@ static int process_vm_rw_single_vec(unsigned long addr,
size_t bytes;
/* Get the pages we're interested in */
- down_read(&mm->mmap_sem);
- pages = get_user_pages(task, mm, pa, pages,
- vm_write, 0, process_pages, NULL);
- up_read(&mm->mmap_sem);
-
+ pages = get_user_pages_unlocked(task, mm, pa, pages,
+ vm_write, 0, process_pages);
if (pages <= 0)
return -EFAULT;
diff --git a/mm/readahead.c b/mm/readahead.c
index 17b9172..9356758 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -27,7 +27,7 @@
void
file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping)
{
- ra->ra_pages = mapping->backing_dev_info->ra_pages;
+ ra->ra_pages = inode_to_bdi(mapping->host)->ra_pages;
ra->prev_pos = -1;
}
EXPORT_SYMBOL_GPL(file_ra_state_init);
@@ -541,7 +541,7 @@ page_cache_async_readahead(struct address_space *mapping,
/*
* Defer asynchronous read-ahead on IO congestion.
*/
- if (bdi_read_congested(mapping->backing_dev_info))
+ if (bdi_read_congested(inode_to_bdi(mapping->host)))
return;
/* do read-ahead */
diff --git a/mm/rmap.c b/mm/rmap.c
index 71cd5bd..5e3e090 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -590,9 +590,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
if (!vma->anon_vma || !page__anon_vma ||
vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
- } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
- if (!vma->vm_file ||
- vma->vm_file->f_mapping != page->mapping)
+ } else if (page->mapping) {
+ if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
return -EFAULT;
} else
return -EFAULT;
@@ -1086,24 +1085,20 @@ void page_add_new_anon_rmap(struct page *page,
void page_add_file_rmap(struct page *page)
{
struct mem_cgroup *memcg;
- unsigned long flags;
- bool locked;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ memcg = mem_cgroup_begin_page_stat(page);
if (atomic_inc_and_test(&page->_mapcount)) {
__inc_zone_page_state(page, NR_FILE_MAPPED);
mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
}
- mem_cgroup_end_page_stat(memcg, &locked, &flags);
+ mem_cgroup_end_page_stat(memcg);
}
static void page_remove_file_rmap(struct page *page)
{
struct mem_cgroup *memcg;
- unsigned long flags;
- bool locked;
- memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ memcg = mem_cgroup_begin_page_stat(page);
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
@@ -1124,7 +1119,7 @@ static void page_remove_file_rmap(struct page *page)
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
out:
- mem_cgroup_end_page_stat(memcg, &locked, &flags);
+ mem_cgroup_end_page_stat(memcg);
}
/**
@@ -1274,7 +1269,6 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (pte_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
set_pte_at(mm, address, pte, swp_pte);
- BUG_ON(pte_file(*pte));
} else if (IS_ENABLED(CONFIG_MIGRATION) &&
(flags & TTU_MIGRATION)) {
/* Establish migration entry for a file page */
@@ -1316,211 +1310,6 @@ out_mlock:
return ret;
}
-/*
- * objrmap doesn't work for nonlinear VMAs because the assumption that
- * offset-into-file correlates with offset-into-virtual-addresses does not hold.
- * Consequently, given a particular page and its ->index, we cannot locate the
- * ptes which are mapping that page without an exhaustive linear search.
- *
- * So what this code does is a mini "virtual scan" of each nonlinear VMA which
- * maps the file to which the target page belongs. The ->vm_private_data field
- * holds the current cursor into that scan. Successive searches will circulate
- * around the vma's virtual address space.
- *
- * So as more replacement pressure is applied to the pages in a nonlinear VMA,
- * more scanning pressure is placed against them as well. Eventually pages
- * will become fully unmapped and are eligible for eviction.
- *
- * For very sparsely populated VMAs this is a little inefficient - chances are
- * there there won't be many ptes located within the scan cluster. In this case
- * maybe we could scan further - to the end of the pte page, perhaps.
- *
- * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can
- * acquire it without blocking. If vma locked, mlock the pages in the cluster,
- * rather than unmapping them. If we encounter the "check_page" that vmscan is
- * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
- */
-#define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE)
-#define CLUSTER_MASK (~(CLUSTER_SIZE - 1))
-
-static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
- struct vm_area_struct *vma, struct page *check_page)
-{
- struct mm_struct *mm = vma->vm_mm;
- pmd_t *pmd;
- pte_t *pte;
- pte_t pteval;
- spinlock_t *ptl;
- struct page *page;
- unsigned long address;
- unsigned long mmun_start; /* For mmu_notifiers */
- unsigned long mmun_end; /* For mmu_notifiers */
- unsigned long end;
- int ret = SWAP_AGAIN;
- int locked_vma = 0;
-
- address = (vma->vm_start + cursor) & CLUSTER_MASK;
- end = address + CLUSTER_SIZE;
- if (address < vma->vm_start)
- address = vma->vm_start;
- if (end > vma->vm_end)
- end = vma->vm_end;
-
- pmd = mm_find_pmd(mm, address);
- if (!pmd)
- return ret;
-
- mmun_start = address;
- mmun_end = end;
- mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
- /*
- * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
- * keep the sem while scanning the cluster for mlocking pages.
- */
- if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
- locked_vma = (vma->vm_flags & VM_LOCKED);
- if (!locked_vma)
- up_read(&vma->vm_mm->mmap_sem); /* don't need it */
- }
-
- pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
-
- for (; address < end; pte++, address += PAGE_SIZE) {
- if (!pte_present(*pte))
- continue;
- page = vm_normal_page(vma, address, *pte);
- BUG_ON(!page || PageAnon(page));
-
- if (locked_vma) {
- if (page == check_page) {
- /* we know we have check_page locked */
- mlock_vma_page(page);
- ret = SWAP_MLOCK;
- } else if (trylock_page(page)) {
- /*
- * If we can lock the page, perform mlock.
- * Otherwise leave the page alone, it will be
- * eventually encountered again later.
- */
- mlock_vma_page(page);
- unlock_page(page);
- }
- continue; /* don't unmap */
- }
-
- /*
- * No need for _notify because we're within an
- * mmu_notifier_invalidate_range_ {start|end} scope.
- */
- if (ptep_clear_flush_young(vma, address, pte))
- continue;
-
- /* Nuke the page table entry. */
- flush_cache_page(vma, address, pte_pfn(*pte));
- pteval = ptep_clear_flush_notify(vma, address, pte);
-
- /* If nonlinear, store the file page offset in the pte. */
- if (page->index != linear_page_index(vma, address)) {
- pte_t ptfile = pgoff_to_pte(page->index);
- if (pte_soft_dirty(pteval))
- ptfile = pte_file_mksoft_dirty(ptfile);
- set_pte_at(mm, address, pte, ptfile);
- }
-
- /* Move the dirty bit to the physical page now the pte is gone. */
- if (pte_dirty(pteval))
- set_page_dirty(page);
-
- page_remove_rmap(page);
- page_cache_release(page);
- dec_mm_counter(mm, MM_FILEPAGES);
- (*mapcount)--;
- }
- pte_unmap_unlock(pte - 1, ptl);
- mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
- if (locked_vma)
- up_read(&vma->vm_mm->mmap_sem);
- return ret;
-}
-
-static int try_to_unmap_nonlinear(struct page *page,
- struct address_space *mapping, void *arg)
-{
- struct vm_area_struct *vma;
- int ret = SWAP_AGAIN;
- unsigned long cursor;
- unsigned long max_nl_cursor = 0;
- unsigned long max_nl_size = 0;
- unsigned int mapcount;
-
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
- cursor = (unsigned long) vma->vm_private_data;
- if (cursor > max_nl_cursor)
- max_nl_cursor = cursor;
- cursor = vma->vm_end - vma->vm_start;
- if (cursor > max_nl_size)
- max_nl_size = cursor;
- }
-
- if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */
- return SWAP_FAIL;
- }
-
- /*
- * We don't try to search for this page in the nonlinear vmas,
- * and page_referenced wouldn't have found it anyway. Instead
- * just walk the nonlinear vmas trying to age and unmap some.
- * The mapcount of the page we came in with is irrelevant,
- * but even so use it as a guide to how hard we should try?
- */
- mapcount = page_mapcount(page);
- if (!mapcount)
- return ret;
-
- cond_resched();
-
- max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
- if (max_nl_cursor == 0)
- max_nl_cursor = CLUSTER_SIZE;
-
- do {
- list_for_each_entry(vma,
- &mapping->i_mmap_nonlinear, shared.nonlinear) {
-
- cursor = (unsigned long) vma->vm_private_data;
- while (cursor < max_nl_cursor &&
- cursor < vma->vm_end - vma->vm_start) {
- if (try_to_unmap_cluster(cursor, &mapcount,
- vma, page) == SWAP_MLOCK)
- ret = SWAP_MLOCK;
- cursor += CLUSTER_SIZE;
- vma->vm_private_data = (void *) cursor;
- if ((int)mapcount <= 0)
- return ret;
- }
- vma->vm_private_data = (void *) max_nl_cursor;
- }
- cond_resched();
- max_nl_cursor += CLUSTER_SIZE;
- } while (max_nl_cursor <= max_nl_size);
-
- /*
- * Don't loop forever (perhaps all the remaining pages are
- * in locked vmas). Reset cursor on all unreserved nonlinear
- * vmas, now forgetting on which ones it had fallen behind.
- */
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
- vma->vm_private_data = NULL;
-
- return ret;
-}
-
bool is_vma_temporary_stack(struct vm_area_struct *vma)
{
int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1566,7 +1355,6 @@ int try_to_unmap(struct page *page, enum ttu_flags flags)
.rmap_one = try_to_unmap_one,
.arg = (void *)flags,
.done = page_not_mapped,
- .file_nonlinear = try_to_unmap_nonlinear,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1612,12 +1400,6 @@ int try_to_munlock(struct page *page)
.rmap_one = try_to_unmap_one,
.arg = (void *)TTU_MUNLOCK,
.done = page_not_mapped,
- /*
- * We don't bother to try to find the munlocked page in
- * nonlinears. It's costly. Instead, later, page reclaim logic
- * may call try_to_unmap() and recover PG_mlocked lazily.
- */
- .file_nonlinear = NULL,
.anon_lock = page_lock_anon_vma_read,
};
@@ -1748,13 +1530,6 @@ static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
goto done;
}
- if (!rwc->file_nonlinear)
- goto done;
-
- if (list_empty(&mapping->i_mmap_nonlinear))
- goto done;
-
- ret = rwc->file_nonlinear(page, mapping, rwc->arg);
done:
i_mmap_unlock_read(mapping);
return ret;
diff --git a/mm/shmem.c b/mm/shmem.c
index 993e6ba..2f17cb5 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -191,11 +191,6 @@ static const struct inode_operations shmem_dir_inode_operations;
static const struct inode_operations shmem_special_inode_operations;
static const struct vm_operations_struct shmem_vm_ops;
-static struct backing_dev_info shmem_backing_dev_info __read_mostly = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-};
-
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
@@ -765,11 +760,11 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
goto redirty;
/*
- * shmem_backing_dev_info's capabilities prevent regular writeback or
- * sync from ever calling shmem_writepage; but a stacking filesystem
- * might use ->writepage of its underlying filesystem, in which case
- * tmpfs should write out to swap only in response to memory pressure,
- * and not for the writeback threads or sync.
+ * Our capabilities prevent regular writeback or sync from ever calling
+ * shmem_writepage; but a stacking filesystem might use ->writepage of
+ * its underlying filesystem, in which case tmpfs should write out to
+ * swap only in response to memory pressure, and not for the writeback
+ * threads or sync.
*/
if (!wbc->for_reclaim) {
WARN_ON_ONCE(1); /* Still happens? Tell us about it! */
@@ -1131,7 +1126,7 @@ repeat:
* truncated or holepunched since swap was confirmed.
* shmem_undo_range() will have done some of the
* unaccounting, now delete_from_swap_cache() will do
- * the rest (including mem_cgroup_uncharge_swapcache).
+ * the rest.
* Reset swap.val? No, leave it so "failed" goes back to
* "repeat": reading a hole and writing should succeed.
*/
@@ -1415,7 +1410,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
inode->i_ino = get_next_ino();
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
- inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
inode->i_generation = get_seconds();
info = SHMEM_I(inode);
@@ -1461,7 +1455,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
bool shmem_mapping(struct address_space *mapping)
{
- return mapping->backing_dev_info == &shmem_backing_dev_info;
+ return mapping->host->i_sb->s_op == &shmem_ops;
}
#ifdef CONFIG_TMPFS
@@ -2325,8 +2319,8 @@ static int shmem_rmdir(struct inode *dir, struct dentry *dentry)
static int shmem_exchange(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry)
{
- bool old_is_dir = S_ISDIR(old_dentry->d_inode->i_mode);
- bool new_is_dir = S_ISDIR(new_dentry->d_inode->i_mode);
+ bool old_is_dir = d_is_dir(old_dentry);
+ bool new_is_dir = d_is_dir(new_dentry);
if (old_dir != new_dir && old_is_dir != new_is_dir) {
if (old_is_dir) {
@@ -3201,7 +3195,6 @@ static const struct vm_operations_struct shmem_vm_ops = {
.set_policy = shmem_set_policy,
.get_policy = shmem_get_policy,
#endif
- .remap_pages = generic_file_remap_pages,
};
static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -3226,10 +3219,6 @@ int __init shmem_init(void)
if (shmem_inode_cachep)
return 0;
- error = bdi_init(&shmem_backing_dev_info);
- if (error)
- goto out4;
-
error = shmem_init_inodecache();
if (error)
goto out3;
@@ -3253,8 +3242,6 @@ out1:
out2:
shmem_destroy_inodecache();
out3:
- bdi_destroy(&shmem_backing_dev_info);
-out4:
shm_mnt = ERR_PTR(error);
return error;
}
diff --git a/mm/slab.c b/mm/slab.c
index 65b5dcb..c4b89ea 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2382,7 +2382,7 @@ out:
return nr_freed;
}
-int __kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
{
int ret = 0;
int node;
@@ -2404,7 +2404,7 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
int i;
struct kmem_cache_node *n;
- int rc = __kmem_cache_shrink(cachep);
+ int rc = __kmem_cache_shrink(cachep, false);
if (rc)
return rc;
@@ -3708,8 +3708,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
int batchcount, int shared, gfp_t gfp)
{
int ret;
- struct kmem_cache *c = NULL;
- int i = 0;
+ struct kmem_cache *c;
ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
@@ -3719,12 +3718,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
if ((ret < 0) || !is_root_cache(cachep))
return ret;
- VM_BUG_ON(!mutex_is_locked(&slab_mutex));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(cachep, i);
- if (c)
- /* return value determined by the parent cache only */
- __do_tune_cpucache(c, limit, batchcount, shared, gfp);
+ lockdep_assert_held(&slab_mutex);
+ for_each_memcg_cache(c, cachep) {
+ /* return value determined by the root cache only */
+ __do_tune_cpucache(c, limit, batchcount, shared, gfp);
}
return ret;
diff --git a/mm/slab.h b/mm/slab.h
index 1cf40054..4c3ac12 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -86,8 +86,6 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
extern void create_boot_cache(struct kmem_cache *, const char *name,
size_t size, unsigned long flags);
-struct mem_cgroup;
-
int slab_unmergeable(struct kmem_cache *s);
struct kmem_cache *find_mergeable(size_t size, size_t align,
unsigned long flags, const char *name, void (*ctor)(void *));
@@ -140,7 +138,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
#define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
int __kmem_cache_shutdown(struct kmem_cache *);
-int __kmem_cache_shrink(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *, bool);
void slab_kmem_cache_release(struct kmem_cache *);
struct seq_file;
@@ -165,16 +163,27 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos);
#ifdef CONFIG_MEMCG_KMEM
+/*
+ * Iterate over all memcg caches of the given root cache. The caller must hold
+ * slab_mutex.
+ */
+#define for_each_memcg_cache(iter, root) \
+ list_for_each_entry(iter, &(root)->memcg_params.list, \
+ memcg_params.list)
+
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+ list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
+ memcg_params.list)
+
static inline bool is_root_cache(struct kmem_cache *s)
{
- return !s->memcg_params || s->memcg_params->is_root_cache;
+ return s->memcg_params.is_root_cache;
}
static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
+ struct kmem_cache *p)
{
- return (p == s) ||
- (s->memcg_params && (p == s->memcg_params->root_cache));
+ return p == s || p == s->memcg_params.root_cache;
}
/*
@@ -185,37 +194,30 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
static inline const char *cache_name(struct kmem_cache *s)
{
if (!is_root_cache(s))
- return s->memcg_params->root_cache->name;
+ s = s->memcg_params.root_cache;
return s->name;
}
/*
* Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
- * That said the caller must assure the memcg's cache won't go away. Since once
- * created a memcg's cache is destroyed only along with the root cache, it is
- * true if we are going to allocate from the cache or hold a reference to the
- * root cache by other means. Otherwise, we should hold either the slab_mutex
- * or the memcg's slab_caches_mutex while calling this function and accessing
- * the returned value.
+ * That said the caller must assure the memcg's cache won't go away by either
+ * taking a css reference to the owner cgroup, or holding the slab_mutex.
*/
static inline struct kmem_cache *
cache_from_memcg_idx(struct kmem_cache *s, int idx)
{
struct kmem_cache *cachep;
- struct memcg_cache_params *params;
-
- if (!s->memcg_params)
- return NULL;
+ struct memcg_cache_array *arr;
rcu_read_lock();
- params = rcu_dereference(s->memcg_params);
+ arr = rcu_dereference(s->memcg_params.memcg_caches);
/*
* Make sure we will access the up-to-date value. The code updating
* memcg_caches issues a write barrier to match this (see
- * memcg_register_cache()).
+ * memcg_create_kmem_cache()).
*/
- cachep = lockless_dereference(params->memcg_caches[idx]);
+ cachep = lockless_dereference(arr->entries[idx]);
rcu_read_unlock();
return cachep;
@@ -225,7 +227,7 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
{
if (is_root_cache(s))
return s;
- return s->memcg_params->root_cache;
+ return s->memcg_params.root_cache;
}
static __always_inline int memcg_charge_slab(struct kmem_cache *s,
@@ -235,7 +237,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s,
return 0;
if (is_root_cache(s))
return 0;
- return __memcg_charge_slab(s, gfp, order);
+ return memcg_charge_kmem(s->memcg_params.memcg, gfp, 1 << order);
}
static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
@@ -244,9 +246,18 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
return;
if (is_root_cache(s))
return;
- __memcg_uncharge_slab(s, order);
+ memcg_uncharge_kmem(s->memcg_params.memcg, 1 << order);
}
-#else
+
+extern void slab_init_memcg_params(struct kmem_cache *);
+
+#else /* !CONFIG_MEMCG_KMEM */
+
+#define for_each_memcg_cache(iter, root) \
+ for ((void)(iter), (void)(root); 0; )
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+ for ((void)(iter), (void)(tmp), (void)(root); 0; )
+
static inline bool is_root_cache(struct kmem_cache *s)
{
return true;
@@ -282,7 +293,11 @@ static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
{
}
-#endif
+
+static inline void slab_init_memcg_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
diff --git a/mm/slab_common.c b/mm/slab_common.c
index e03dd6f..999bb34 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -106,62 +106,67 @@ static inline int kmem_cache_sanity_check(const char *name, size_t size)
#endif
#ifdef CONFIG_MEMCG_KMEM
-static int memcg_alloc_cache_params(struct mem_cgroup *memcg,
- struct kmem_cache *s, struct kmem_cache *root_cache)
+void slab_init_memcg_params(struct kmem_cache *s)
{
- size_t size;
+ s->memcg_params.is_root_cache = true;
+ INIT_LIST_HEAD(&s->memcg_params.list);
+ RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+}
+
+static int init_memcg_params(struct kmem_cache *s,
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+ struct memcg_cache_array *arr;
- if (!memcg_kmem_enabled())
+ if (memcg) {
+ s->memcg_params.is_root_cache = false;
+ s->memcg_params.memcg = memcg;
+ s->memcg_params.root_cache = root_cache;
return 0;
+ }
- if (!memcg) {
- size = offsetof(struct memcg_cache_params, memcg_caches);
- size += memcg_limited_groups_array_size * sizeof(void *);
- } else
- size = sizeof(struct memcg_cache_params);
+ slab_init_memcg_params(s);
- s->memcg_params = kzalloc(size, GFP_KERNEL);
- if (!s->memcg_params)
- return -ENOMEM;
+ if (!memcg_nr_cache_ids)
+ return 0;
- if (memcg) {
- s->memcg_params->memcg = memcg;
- s->memcg_params->root_cache = root_cache;
- } else
- s->memcg_params->is_root_cache = true;
+ arr = kzalloc(sizeof(struct memcg_cache_array) +
+ memcg_nr_cache_ids * sizeof(void *),
+ GFP_KERNEL);
+ if (!arr)
+ return -ENOMEM;
+ RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
return 0;
}
-static void memcg_free_cache_params(struct kmem_cache *s)
+static void destroy_memcg_params(struct kmem_cache *s)
{
- kfree(s->memcg_params);
+ if (is_root_cache(s))
+ kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
}
-static int memcg_update_cache_params(struct kmem_cache *s, int num_memcgs)
+static int update_memcg_params(struct kmem_cache *s, int new_array_size)
{
- int size;
- struct memcg_cache_params *new_params, *cur_params;
-
- BUG_ON(!is_root_cache(s));
+ struct memcg_cache_array *old, *new;
- size = offsetof(struct memcg_cache_params, memcg_caches);
- size += num_memcgs * sizeof(void *);
+ if (!is_root_cache(s))
+ return 0;
- new_params = kzalloc(size, GFP_KERNEL);
- if (!new_params)
+ new = kzalloc(sizeof(struct memcg_cache_array) +
+ new_array_size * sizeof(void *), GFP_KERNEL);
+ if (!new)
return -ENOMEM;
- cur_params = s->memcg_params;
- memcpy(new_params->memcg_caches, cur_params->memcg_caches,
- memcg_limited_groups_array_size * sizeof(void *));
-
- new_params->is_root_cache = true;
-
- rcu_assign_pointer(s->memcg_params, new_params);
- if (cur_params)
- kfree_rcu(cur_params, rcu_head);
+ old = rcu_dereference_protected(s->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+ if (old)
+ memcpy(new->entries, old->entries,
+ memcg_nr_cache_ids * sizeof(void *));
+ rcu_assign_pointer(s->memcg_params.memcg_caches, new);
+ if (old)
+ kfree_rcu(old, rcu);
return 0;
}
@@ -169,34 +174,28 @@ int memcg_update_all_caches(int num_memcgs)
{
struct kmem_cache *s;
int ret = 0;
- mutex_lock(&slab_mutex);
+ mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list) {
- if (!is_root_cache(s))
- continue;
-
- ret = memcg_update_cache_params(s, num_memcgs);
+ ret = update_memcg_params(s, num_memcgs);
/*
* Instead of freeing the memory, we'll just leave the caches
* up to this point in an updated state.
*/
if (ret)
- goto out;
+ break;
}
-
- memcg_update_array_size(num_memcgs);
-out:
mutex_unlock(&slab_mutex);
return ret;
}
#else
-static inline int memcg_alloc_cache_params(struct mem_cgroup *memcg,
- struct kmem_cache *s, struct kmem_cache *root_cache)
+static inline int init_memcg_params(struct kmem_cache *s,
+ struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
return 0;
}
-static inline void memcg_free_cache_params(struct kmem_cache *s)
+static inline void destroy_memcg_params(struct kmem_cache *s)
{
}
#endif /* CONFIG_MEMCG_KMEM */
@@ -296,8 +295,8 @@ unsigned long calculate_alignment(unsigned long flags,
}
static struct kmem_cache *
-do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
- unsigned long flags, void (*ctor)(void *),
+do_kmem_cache_create(const char *name, size_t object_size, size_t size,
+ size_t align, unsigned long flags, void (*ctor)(void *),
struct mem_cgroup *memcg, struct kmem_cache *root_cache)
{
struct kmem_cache *s;
@@ -314,7 +313,7 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
s->align = align;
s->ctor = ctor;
- err = memcg_alloc_cache_params(memcg, s, root_cache);
+ err = init_memcg_params(s, memcg, root_cache);
if (err)
goto out_free_cache;
@@ -330,8 +329,8 @@ out:
return s;
out_free_cache:
- memcg_free_cache_params(s);
- kfree(s);
+ destroy_memcg_params(s);
+ kmem_cache_free(kmem_cache, s);
goto out;
}
@@ -364,11 +363,12 @@ kmem_cache_create(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
- char *cache_name;
+ const char *cache_name;
int err;
get_online_cpus();
get_online_mems();
+ memcg_get_cache_ids();
mutex_lock(&slab_mutex);
@@ -390,7 +390,7 @@ kmem_cache_create(const char *name, size_t size, size_t align,
if (s)
goto out_unlock;
- cache_name = kstrdup(name, GFP_KERNEL);
+ cache_name = kstrdup_const(name, GFP_KERNEL);
if (!cache_name) {
err = -ENOMEM;
goto out_unlock;
@@ -401,12 +401,13 @@ kmem_cache_create(const char *name, size_t size, size_t align,
flags, ctor, NULL, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
- kfree(cache_name);
+ kfree_const(cache_name);
}
out_unlock:
mutex_unlock(&slab_mutex);
+ memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
@@ -425,31 +426,91 @@ out_unlock:
}
EXPORT_SYMBOL(kmem_cache_create);
+static int do_kmem_cache_shutdown(struct kmem_cache *s,
+ struct list_head *release, bool *need_rcu_barrier)
+{
+ if (__kmem_cache_shutdown(s) != 0) {
+ printk(KERN_ERR "kmem_cache_destroy %s: "
+ "Slab cache still has objects\n", s->name);
+ dump_stack();
+ return -EBUSY;
+ }
+
+ if (s->flags & SLAB_DESTROY_BY_RCU)
+ *need_rcu_barrier = true;
+
+#ifdef CONFIG_MEMCG_KMEM
+ if (!is_root_cache(s))
+ list_del(&s->memcg_params.list);
+#endif
+ list_move(&s->list, release);
+ return 0;
+}
+
+static void do_kmem_cache_release(struct list_head *release,
+ bool need_rcu_barrier)
+{
+ struct kmem_cache *s, *s2;
+
+ if (need_rcu_barrier)
+ rcu_barrier();
+
+ list_for_each_entry_safe(s, s2, release, list) {
+#ifdef SLAB_SUPPORTS_SYSFS
+ sysfs_slab_remove(s);
+#else
+ slab_kmem_cache_release(s);
+#endif
+ }
+}
+
#ifdef CONFIG_MEMCG_KMEM
/*
* memcg_create_kmem_cache - Create a cache for a memory cgroup.
* @memcg: The memory cgroup the new cache is for.
* @root_cache: The parent of the new cache.
- * @memcg_name: The name of the memory cgroup (used for naming the new cache).
*
* This function attempts to create a kmem cache that will serve allocation
* requests going from @memcg to @root_cache. The new cache inherits properties
* from its parent.
*/
-struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache,
- const char *memcg_name)
+void memcg_create_kmem_cache(struct mem_cgroup *memcg,
+ struct kmem_cache *root_cache)
{
+ static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
+ struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+ struct memcg_cache_array *arr;
struct kmem_cache *s = NULL;
char *cache_name;
+ int idx;
get_online_cpus();
get_online_mems();
mutex_lock(&slab_mutex);
+ /*
+ * The memory cgroup could have been deactivated while the cache
+ * creation work was pending.
+ */
+ if (!memcg_kmem_is_active(memcg))
+ goto out_unlock;
+
+ idx = memcg_cache_id(memcg);
+ arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+
+ /*
+ * Since per-memcg caches are created asynchronously on first
+ * allocation (see memcg_kmem_get_cache()), several threads can try to
+ * create the same cache, but only one of them may succeed.
+ */
+ if (arr->entries[idx])
+ goto out_unlock;
+
+ cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
- memcg_cache_id(memcg), memcg_name);
+ css->id, memcg_name_buf);
if (!cache_name)
goto out_unlock;
@@ -457,49 +518,108 @@ struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
root_cache->size, root_cache->align,
root_cache->flags, root_cache->ctor,
memcg, root_cache);
+ /*
+ * If we could not create a memcg cache, do not complain, because
+ * that's not critical at all as we can always proceed with the root
+ * cache.
+ */
if (IS_ERR(s)) {
kfree(cache_name);
- s = NULL;
+ goto out_unlock;
}
+ list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
+
+ /*
+ * Since readers won't lock (see cache_from_memcg_idx()), we need a
+ * barrier here to ensure nobody will see the kmem_cache partially
+ * initialized.
+ */
+ smp_wmb();
+ arr->entries[idx] = s;
+
out_unlock:
mutex_unlock(&slab_mutex);
put_online_mems();
put_online_cpus();
-
- return s;
}
-static int memcg_cleanup_cache_params(struct kmem_cache *s)
+void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
{
- int rc;
+ int idx;
+ struct memcg_cache_array *arr;
+ struct kmem_cache *s, *c;
- if (!s->memcg_params ||
- !s->memcg_params->is_root_cache)
- return 0;
+ idx = memcg_cache_id(memcg);
+
+ get_online_cpus();
+ get_online_mems();
- mutex_unlock(&slab_mutex);
- rc = __memcg_cleanup_cache_params(s);
mutex_lock(&slab_mutex);
+ list_for_each_entry(s, &slab_caches, list) {
+ if (!is_root_cache(s))
+ continue;
+
+ arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+ lockdep_is_held(&slab_mutex));
+ c = arr->entries[idx];
+ if (!c)
+ continue;
+
+ __kmem_cache_shrink(c, true);
+ arr->entries[idx] = NULL;
+ }
+ mutex_unlock(&slab_mutex);
- return rc;
+ put_online_mems();
+ put_online_cpus();
}
-#else
-static int memcg_cleanup_cache_params(struct kmem_cache *s)
+
+void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
{
- return 0;
+ LIST_HEAD(release);
+ bool need_rcu_barrier = false;
+ struct kmem_cache *s, *s2;
+
+ get_online_cpus();
+ get_online_mems();
+
+ mutex_lock(&slab_mutex);
+ list_for_each_entry_safe(s, s2, &slab_caches, list) {
+ if (is_root_cache(s) || s->memcg_params.memcg != memcg)
+ continue;
+ /*
+ * The cgroup is about to be freed and therefore has no charges
+ * left. Hence, all its caches must be empty by now.
+ */
+ BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
+ }
+ mutex_unlock(&slab_mutex);
+
+ put_online_mems();
+ put_online_cpus();
+
+ do_kmem_cache_release(&release, need_rcu_barrier);
}
#endif /* CONFIG_MEMCG_KMEM */
void slab_kmem_cache_release(struct kmem_cache *s)
{
- kfree(s->name);
+ destroy_memcg_params(s);
+ kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
}
void kmem_cache_destroy(struct kmem_cache *s)
{
+ struct kmem_cache *c, *c2;
+ LIST_HEAD(release);
+ bool need_rcu_barrier = false;
+ bool busy = false;
+
+ BUG_ON(!is_root_cache(s));
+
get_online_cpus();
get_online_mems();
@@ -509,35 +629,21 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
- if (memcg_cleanup_cache_params(s) != 0)
- goto out_unlock;
-
- if (__kmem_cache_shutdown(s) != 0) {
- printk(KERN_ERR "kmem_cache_destroy %s: "
- "Slab cache still has objects\n", s->name);
- dump_stack();
- goto out_unlock;
+ for_each_memcg_cache_safe(c, c2, s) {
+ if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
+ busy = true;
}
- list_del(&s->list);
-
- mutex_unlock(&slab_mutex);
- if (s->flags & SLAB_DESTROY_BY_RCU)
- rcu_barrier();
-
- memcg_free_cache_params(s);
-#ifdef SLAB_SUPPORTS_SYSFS
- sysfs_slab_remove(s);
-#else
- slab_kmem_cache_release(s);
-#endif
- goto out;
+ if (!busy)
+ do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
out_unlock:
mutex_unlock(&slab_mutex);
-out:
+
put_online_mems();
put_online_cpus();
+
+ do_kmem_cache_release(&release, need_rcu_barrier);
}
EXPORT_SYMBOL(kmem_cache_destroy);
@@ -554,7 +660,7 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
get_online_cpus();
get_online_mems();
- ret = __kmem_cache_shrink(cachep);
+ ret = __kmem_cache_shrink(cachep, false);
put_online_mems();
put_online_cpus();
return ret;
@@ -576,6 +682,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
s->name = name;
s->size = s->object_size = size;
s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+ slab_init_memcg_params(s);
+
err = __kmem_cache_create(s, flags);
if (err)
@@ -789,6 +898,7 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
page = alloc_kmem_pages(flags, order);
ret = page ? page_address(page) : NULL;
kmemleak_alloc(ret, size, 1, flags);
+ kasan_kmalloc_large(ret, size);
return ret;
}
EXPORT_SYMBOL(kmalloc_order);
@@ -855,16 +965,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
{
struct kmem_cache *c;
struct slabinfo sinfo;
- int i;
if (!is_root_cache(s))
return;
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
-
+ for_each_memcg_cache(c, s) {
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(c, &sinfo);
@@ -916,7 +1021,7 @@ int memcg_slab_show(struct seq_file *m, void *p)
if (p == slab_caches.next)
print_slabinfo_header(m);
- if (!is_root_cache(s) && s->memcg_params->memcg == memcg)
+ if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
cache_show(s, m);
return 0;
}
@@ -973,8 +1078,10 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
if (p)
ks = ksize(p);
- if (ks >= new_size)
+ if (ks >= new_size) {
+ kasan_krealloc((void *)p, new_size);
return (void *)p;
+ }
ret = kmalloc_track_caller(new_size, flags);
if (ret && p)
diff --git a/mm/slob.c b/mm/slob.c
index 96a8620..94a7fed 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -618,7 +618,7 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
return 0;
}
-int __kmem_cache_shrink(struct kmem_cache *d)
+int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
{
return 0;
}
diff --git a/mm/slub.c b/mm/slub.c
index fe376fe..6832c4e 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
#include <linux/proc_fs.h>
#include <linux/notifier.h>
#include <linux/seq_file.h>
+#include <linux/kasan.h>
#include <linux/kmemcheck.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
@@ -468,12 +469,30 @@ static char *slub_debug_slabs;
static int disable_higher_order_debug;
/*
+ * slub is about to manipulate internal object metadata. This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error. metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+ kasan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+ kasan_enable_current();
+}
+
+/*
* Object debugging
*/
static void print_section(char *text, u8 *addr, unsigned int length)
{
+ metadata_access_enable();
print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
length, 1);
+ metadata_access_disable();
}
static struct track *get_track(struct kmem_cache *s, void *object,
@@ -503,7 +522,9 @@ static void set_track(struct kmem_cache *s, void *object,
trace.max_entries = TRACK_ADDRS_COUNT;
trace.entries = p->addrs;
trace.skip = 3;
+ metadata_access_enable();
save_stack_trace(&trace);
+ metadata_access_disable();
/* See rant in lockdep.c */
if (trace.nr_entries != 0 &&
@@ -629,7 +650,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
dump_stack();
}
-static void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
u8 *object, char *reason)
{
slab_bug(s, "%s", reason);
@@ -677,7 +698,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
u8 *fault;
u8 *end;
+ metadata_access_enable();
fault = memchr_inv(start, value, bytes);
+ metadata_access_disable();
if (!fault)
return 1;
@@ -770,7 +793,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
if (!remainder)
return 1;
+ metadata_access_enable();
fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+ metadata_access_disable();
if (!fault)
return 1;
while (end > fault && end[-1] == POISON_INUSE)
@@ -1226,11 +1251,13 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node,
static inline void kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags)
{
kmemleak_alloc(ptr, size, 1, flags);
+ kasan_kmalloc_large(ptr, size);
}
static inline void kfree_hook(const void *x)
{
kmemleak_free(x);
+ kasan_kfree_large(x);
}
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
@@ -1253,6 +1280,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
kmemleak_alloc_recursive(object, s->object_size, 1, s->flags, flags);
memcg_kmem_put_cache(s);
+ kasan_slab_alloc(s, object);
}
static inline void slab_free_hook(struct kmem_cache *s, void *x)
@@ -1276,6 +1304,8 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x)
#endif
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
+
+ kasan_slab_free(s, x);
}
/*
@@ -1370,8 +1400,11 @@ static void setup_object(struct kmem_cache *s, struct page *page,
void *object)
{
setup_object_debug(s, page, object);
- if (unlikely(s->ctor))
+ if (unlikely(s->ctor)) {
+ kasan_unpoison_object_data(s, object);
s->ctor(object);
+ kasan_poison_object_data(s, object);
+ }
}
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1404,6 +1437,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
if (unlikely(s->flags & SLAB_POISON))
memset(start, POISON_INUSE, PAGE_SIZE << order);
+ kasan_poison_slab(page);
+
for_each_object_idx(p, idx, s, start, page->objects) {
setup_object(s, page, p);
if (likely(idx < page->objects))
@@ -2007,6 +2042,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
int pages;
int pobjects;
+ preempt_disable();
do {
pages = 0;
pobjects = 0;
@@ -2040,6 +2076,14 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page)
!= oldpage);
+ if (unlikely(!s->cpu_partial)) {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+ local_irq_restore(flags);
+ }
+ preempt_enable();
#endif
}
@@ -2398,13 +2442,24 @@ redo:
* reading from one cpu area. That does not matter as long
* as we end up on the original cpu again when doing the cmpxchg.
*
- * Preemption is disabled for the retrieval of the tid because that
- * must occur from the current processor. We cannot allow rescheduling
- * on a different processor between the determination of the pointer
- * and the retrieval of the tid.
+ * We must guarantee that tid and the cpu_slab pointer are retrieved
+ * on the same cpu. They could come from different cpus if CONFIG_PREEMPT
+ * is enabled, so we need to check whether they match.
*/
- preempt_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ do {
+ tid = this_cpu_read(s->cpu_slab->tid);
+ c = raw_cpu_ptr(s->cpu_slab);
+ } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
+
+ /*
+ * The irqless object alloc/free algorithm used here depends on the order
+ * in which cpu_slab's data is fetched. tid must be fetched before anything
+ * on c to guarantee that an object and page associated with a previous tid
+ * are not used with the current tid. If we fetch tid first, the object and
+ * page could be ones associated with the next tid, and our alloc/free
+ * request will fail. In that case we simply retry, so there is no problem.
+ */
+ barrier();
/*
* The transaction ids are globally unique per cpu and per operation on
@@ -2412,8 +2467,6 @@ redo:
* occurs on the right processor and that there was no operation on the
* linked list in between.
*/
- tid = c->tid;
- preempt_enable();
object = c->freelist;
page = c->page;
@@ -2479,6 +2532,7 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
{
void *ret = slab_alloc(s, gfpflags, _RET_IP_);
trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+ kasan_kmalloc(s, ret, size);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_trace);
@@ -2505,6 +2559,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
trace_kmalloc_node(_RET_IP_, ret,
size, s->size, gfpflags, node);
+
+ kasan_kmalloc(s, ret, size);
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2512,7 +2568,7 @@ EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
#endif
/*
- * Slow patch handling. This may still be called frequently since objects
+ * Slow path handling. This may still be called frequently since objects
* have a longer lifetime than the cpu slabs in most processing loads.
*
* So we still attempt to reduce cache line usage. Just take the slab
@@ -2659,11 +2715,13 @@ redo:
* data is retrieved via this pointer. If we are on the same cpu
* during the cmpxchg then the free will succedd.
*/
- preempt_disable();
- c = this_cpu_ptr(s->cpu_slab);
+ do {
+ tid = this_cpu_read(s->cpu_slab->tid);
+ c = raw_cpu_ptr(s->cpu_slab);
+ } while (IS_ENABLED(CONFIG_PREEMPT) && unlikely(tid != c->tid));
- tid = c->tid;
- preempt_enable();
+ /* Same as the comment on barrier() in slab_alloc_node() */
+ barrier();
if (likely(page == c->page)) {
set_freepointer(s, object, c->freelist);
@@ -2888,6 +2946,7 @@ static void early_kmem_cache_node_alloc(int node)
init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
init_tracking(kmem_cache_node, n);
#endif
+ kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
init_kmem_cache_node(n);
inc_slabs_node(kmem_cache_node, node, page->objects);
@@ -3260,6 +3319,8 @@ void *__kmalloc(size_t size, gfp_t flags)
trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
+ kasan_kmalloc(s, ret, size);
+
return ret;
}
EXPORT_SYMBOL(__kmalloc);
@@ -3303,12 +3364,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
+ kasan_kmalloc(s, ret, size);
+
return ret;
}
EXPORT_SYMBOL(__kmalloc_node);
#endif
-size_t ksize(const void *object)
+static size_t __ksize(const void *object)
{
struct page *page;
@@ -3324,6 +3387,15 @@ size_t ksize(const void *object)
return slab_ksize(page->slab_cache);
}
+
+size_t ksize(const void *object)
+{
+ size_t size = __ksize(object);
+ /* We assume that ksize callers could use whole allocated area,
+ so we need to unpoison this area. */
+ kasan_krealloc(object, size);
+ return size;
+}
EXPORT_SYMBOL(ksize);
void kfree(const void *x)
@@ -3347,69 +3419,92 @@ void kfree(const void *x)
}
EXPORT_SYMBOL(kfree);
+#define SHRINK_PROMOTE_MAX 32
+
/*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
*
* The slabs with the least items are placed last. This results in them
* being allocated from last increasing the chance that the last objects
* are freed in them.
*/
-int __kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
{
int node;
int i;
struct kmem_cache_node *n;
struct page *page;
struct page *t;
- int objects = oo_objects(s->max);
- struct list_head *slabs_by_inuse =
- kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+ struct list_head discard;
+ struct list_head promote[SHRINK_PROMOTE_MAX];
unsigned long flags;
+ int ret = 0;
- if (!slabs_by_inuse)
- return -ENOMEM;
+ if (deactivate) {
+ /*
+ * Disable empty slabs caching. Used to avoid pinning offline
+ * memory cgroups by kmem pages that can be freed.
+ */
+ s->cpu_partial = 0;
+ s->min_partial = 0;
+
+ /*
+ * s->cpu_partial is checked locklessly (see put_cpu_partial),
+ * so we have to make sure the change is visible.
+ */
+ kick_all_cpus_sync();
+ }
flush_all(s);
for_each_kmem_cache_node(s, node, n) {
- if (!n->nr_partial)
- continue;
-
- for (i = 0; i < objects; i++)
- INIT_LIST_HEAD(slabs_by_inuse + i);
+ INIT_LIST_HEAD(&discard);
+ for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+ INIT_LIST_HEAD(promote + i);
spin_lock_irqsave(&n->list_lock, flags);
/*
- * Build lists indexed by the items in use in each slab.
+ * Build lists of slabs to discard or promote.
*
* Note that concurrent frees may occur while we hold the
* list_lock. page->inuse here is the upper limit.
*/
list_for_each_entry_safe(page, t, &n->partial, lru) {
- list_move(&page->lru, slabs_by_inuse + page->inuse);
- if (!page->inuse)
+ int free = page->objects - page->inuse;
+
+ /* Do not reread page->inuse */
+ barrier();
+
+ /* We do not keep full slabs on the list */
+ BUG_ON(free <= 0);
+
+ if (free == page->objects) {
+ list_move(&page->lru, &discard);
n->nr_partial--;
+ } else if (free <= SHRINK_PROMOTE_MAX)
+ list_move(&page->lru, promote + free - 1);
}
/*
- * Rebuild the partial list with the slabs filled up most
- * first and the least used slabs at the end.
+ * Promote the slabs filled up most to the head of the
+ * partial list.
*/
- for (i = objects - 1; i > 0; i--)
- list_splice(slabs_by_inuse + i, n->partial.prev);
+ for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+ list_splice(promote + i, &n->partial);
spin_unlock_irqrestore(&n->list_lock, flags);
/* Release empty slabs */
- list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+ list_for_each_entry_safe(page, t, &discard, lru)
discard_slab(s, page);
+
+ if (slabs_node(s, node))
+ ret = 1;
}
- kfree(slabs_by_inuse);
- return 0;
+ return ret;
}
static int slab_mem_going_offline_callback(void *arg)
@@ -3418,7 +3513,7 @@ static int slab_mem_going_offline_callback(void *arg)
mutex_lock(&slab_mutex);
list_for_each_entry(s, &slab_caches, list)
- __kmem_cache_shrink(s);
+ __kmem_cache_shrink(s, false);
mutex_unlock(&slab_mutex);
return 0;
@@ -3566,6 +3661,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
p->slab_cache = s;
#endif
}
+ slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
return s;
}
@@ -3624,13 +3720,10 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, size_t size, size_t align,
unsigned long flags, void (*ctor)(void *))
{
- struct kmem_cache *s;
+ struct kmem_cache *s, *c;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
- int i;
- struct kmem_cache *c;
-
s->refcount++;
/*
@@ -3640,10 +3733,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align,
s->object_size = max(s->object_size, (int)size);
s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache_index(i) {
- c = cache_from_memcg_idx(s, i);
- if (!c)
- continue;
+ for_each_memcg_cache(c, s) {
c->object_size = s->object_size;
c->inuse = max_t(int, c->inuse,
ALIGN(size, sizeof(void *)));
@@ -4070,20 +4160,16 @@ static int list_locations(struct kmem_cache *s, char *buf,
if (num_online_cpus() > 1 &&
!cpumask_empty(to_cpumask(l->cpus)) &&
- len < PAGE_SIZE - 60) {
- len += sprintf(buf + len, " cpus=");
- len += cpulist_scnprintf(buf + len,
- PAGE_SIZE - len - 50,
- to_cpumask(l->cpus));
- }
+ len < PAGE_SIZE - 60)
+ len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+ " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
- len < PAGE_SIZE - 60) {
- len += sprintf(buf + len, " nodes=");
- len += nodelist_scnprintf(buf + len,
- PAGE_SIZE - len - 50,
- l->nodes);
- }
+ len < PAGE_SIZE - 60)
+ len += scnprintf(buf + len, PAGE_SIZE - len - 50,
+ " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
len += sprintf(buf + len, "\n");
}
@@ -4680,12 +4766,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- if (buf[0] == '1') {
- int rc = kmem_cache_shrink(s);
-
- if (rc)
- return rc;
- } else
+ if (buf[0] == '1')
+ kmem_cache_shrink(s);
+ else
return -EINVAL;
return length;
}
@@ -4909,7 +4992,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
err = attribute->store(s, buf, len);
#ifdef CONFIG_MEMCG_KMEM
if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- int i;
+ struct kmem_cache *c;
mutex_lock(&slab_mutex);
if (s->max_attr_size < len)
@@ -4932,11 +5015,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
* directly either failed or succeeded, in which case we loop
* through the descendants with best-effort propagation.
*/
- for_each_memcg_cache_index(i) {
- struct kmem_cache *c = cache_from_memcg_idx(s, i);
- if (c)
- attribute->store(c, buf, len);
- }
+ for_each_memcg_cache(c, s)
+ attribute->store(c, buf, len);
mutex_unlock(&slab_mutex);
}
#endif
@@ -4953,7 +5033,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
if (is_root_cache(s))
return;
- root_cache = s->memcg_params->root_cache;
+ root_cache = s->memcg_params.root_cache;
/*
* This mean this cache had no attribute written. Therefore, no point
@@ -5033,7 +5113,7 @@ static inline struct kset *cache_kset(struct kmem_cache *s)
{
#ifdef CONFIG_MEMCG_KMEM
if (!is_root_cache(s))
- return s->memcg_params->root_cache->memcg_kset;
+ return s->memcg_params.root_cache->memcg_kset;
#endif
return slab_kset;
}
diff --git a/mm/swap.c b/mm/swap.c
index 8a12b33..cd3a5e6 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -1138,12 +1138,8 @@ void __init swap_setup(void)
#ifdef CONFIG_SWAP
int i;
- if (bdi_init(swapper_spaces[0].backing_dev_info))
- panic("Failed to init swap bdi");
- for (i = 0; i < MAX_SWAPFILES; i++) {
+ for (i = 0; i < MAX_SWAPFILES; i++)
spin_lock_init(&swapper_spaces[i].tree_lock);
- INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
- }
#endif
/* Use a smaller cluster for small-memory machines */
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 9711342..405923f 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -32,17 +32,11 @@ static const struct address_space_operations swap_aops = {
#endif
};
-static struct backing_dev_info swap_backing_dev_info = {
- .name = "swap",
- .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED,
-};
-
struct address_space swapper_spaces[MAX_SWAPFILES] = {
[0 ... MAX_SWAPFILES - 1] = {
.page_tree = RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
.i_mmap_writable = ATOMIC_INIT(0),
.a_ops = &swap_aops,
- .backing_dev_info = &swap_backing_dev_info,
}
};
diff --git a/mm/truncate.c b/mm/truncate.c
index f1e4d60..ddec5a5 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -112,7 +112,7 @@ void cancel_dirty_page(struct page *page, unsigned int account_size)
struct address_space *mapping = page->mapping;
if (mapping && mapping_cap_account_dirty(mapping)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
- dec_bdi_stat(mapping->backing_dev_info,
+ dec_bdi_stat(inode_to_bdi(mapping->host),
BDI_RECLAIMABLE);
if (account_size)
task_io_account_cancelled_write(account_size);
diff --git a/mm/util.c b/mm/util.c
index fec39d4..3981ae9 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -12,10 +12,30 @@
#include <linux/hugetlb.h>
#include <linux/vmalloc.h>
+#include <asm/sections.h>
#include <asm/uaccess.h>
#include "internal.h"
+static inline int is_kernel_rodata(unsigned long addr)
+{
+ return addr >= (unsigned long)__start_rodata &&
+ addr < (unsigned long)__end_rodata;
+}
+
+/**
+ * kfree_const - conditionally free memory
+ * @x: pointer to the memory
+ *
+ * Function calls kfree only if @x is not in .rodata section.
+ */
+void kfree_const(const void *x)
+{
+ if (!is_kernel_rodata((unsigned long)x))
+ kfree(x);
+}
+EXPORT_SYMBOL(kfree_const);
+
/**
* kstrdup - allocate space for and copy an existing string
* @s: the string to duplicate
@@ -38,6 +58,24 @@ char *kstrdup(const char *s, gfp_t gfp)
EXPORT_SYMBOL(kstrdup);
/**
+ * kstrdup_const - conditionally duplicate an existing const string
+ * @s: the string to duplicate
+ * @gfp: the GFP mask used in the kmalloc() call when allocating memory
+ *
+ * Function returns source string if it is in .rodata section; otherwise it
+ * falls back to kstrdup.
+ * Strings allocated by kstrdup_const should be freed by kfree_const.
+ */
+const char *kstrdup_const(const char *s, gfp_t gfp)
+{
+ if (is_kernel_rodata((unsigned long)s))
+ return s;
+
+ return kstrdup(s, gfp);
+}
+EXPORT_SYMBOL(kstrdup_const);
+
+/**
* kstrndup - allocate space for and copy an existing string
* @s: the string to duplicate
* @max: read at most @max chars from @s
@@ -240,14 +278,8 @@ int __weak get_user_pages_fast(unsigned long start,
int nr_pages, int write, struct page **pages)
{
struct mm_struct *mm = current->mm;
- int ret;
-
- down_read(&mm->mmap_sem);
- ret = get_user_pages(current, mm, start, nr_pages,
- write, 0, pages, NULL);
- up_read(&mm->mmap_sem);
-
- return ret;
+ return get_user_pages_unlocked(current, mm, start, nr_pages,
+ write, 0, pages);
}
EXPORT_SYMBOL_GPL(get_user_pages_fast);
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 39c3388..35b25e1 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1324,10 +1324,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
if (unlikely(!area))
return NULL;
- /*
- * We always allocate a guard page.
- */
- size += PAGE_SIZE;
+ if (!(flags & VM_NO_GUARD))
+ size += PAGE_SIZE;
va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
if (IS_ERR(va)) {
@@ -1621,6 +1619,7 @@ fail:
* @end: vm area range end
* @gfp_mask: flags for the page level allocator
* @prot: protection mask for the allocated pages
+ * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
@@ -1630,7 +1629,8 @@ fail:
*/
void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
- pgprot_t prot, int node, const void *caller)
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
{
struct vm_struct *area;
void *addr;
@@ -1640,8 +1640,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
goto fail;
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
- start, end, node, gfp_mask, caller);
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED |
+ vm_flags, start, end, node, gfp_mask, caller);
if (!area)
goto fail;
@@ -1690,7 +1690,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, node, caller);
+ gfp_mask, prot, 0, node, caller);
}
void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index dcd90c8..5e8eadd 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -91,6 +91,9 @@ struct scan_control {
/* Can pages be swapped as part of reclaim? */
unsigned int may_swap:1;
+ /* Can cgroups be reclaimed below their normal consumption range? */
+ unsigned int may_thrash:1;
+
unsigned int hibernation_mode:1;
/* One of the zones is ready for compaction */
@@ -229,10 +232,10 @@ EXPORT_SYMBOL(unregister_shrinker);
#define SHRINK_BATCH 128
-static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
- struct shrinker *shrinker,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+ struct shrinker *shrinker,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
unsigned long freed = 0;
unsigned long long delta;
@@ -341,9 +344,10 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
}
/**
- * shrink_node_slabs - shrink slab caches of a given node
+ * shrink_slab - shrink slab caches
* @gfp_mask: allocation context
* @nid: node whose slab caches to target
+ * @memcg: memory cgroup whose slab caches to target
* @nr_scanned: pressure numerator
* @nr_eligible: pressure denominator
*
@@ -352,6 +356,12 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
* @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
* unaware shrinkers will receive a node id of 0 instead.
*
+ * @memcg specifies the memory cgroup to target. If it is not NULL,
+ * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
+ * objects from the memory cgroup specified. Otherwise all shrinkers
+ * are called, and memcg aware shrinkers are supposed to scan the
+ * global list then.
+ *
* @nr_scanned and @nr_eligible form a ratio that indicate how much of
* the available objects should be scanned. Page reclaim for example
* passes the number of pages scanned and the number of pages on the
@@ -362,13 +372,17 @@ static unsigned long shrink_slabs(struct shrink_control *shrinkctl,
*
* Returns the number of reclaimed slab objects.
*/
-unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
- unsigned long nr_scanned,
- unsigned long nr_eligible)
+static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
+ struct mem_cgroup *memcg,
+ unsigned long nr_scanned,
+ unsigned long nr_eligible)
{
struct shrinker *shrinker;
unsigned long freed = 0;
+ if (memcg && !memcg_kmem_is_active(memcg))
+ return 0;
+
if (nr_scanned == 0)
nr_scanned = SWAP_CLUSTER_MAX;
@@ -387,12 +401,16 @@ unsigned long shrink_node_slabs(gfp_t gfp_mask, int nid,
struct shrink_control sc = {
.gfp_mask = gfp_mask,
.nid = nid,
+ .memcg = memcg,
};
+ if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+ continue;
+
if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
sc.nid = 0;
- freed += shrink_slabs(&sc, shrinker, nr_scanned, nr_eligible);
+ freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
}
up_read(&shrinker_rwsem);
@@ -401,6 +419,29 @@ out:
return freed;
}
+void drop_slab_node(int nid)
+{
+ unsigned long freed;
+
+ do {
+ struct mem_cgroup *memcg = NULL;
+
+ freed = 0;
+ do {
+ freed += shrink_slab(GFP_KERNEL, nid, memcg,
+ 1000, 1000);
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+ } while (freed > 10);
+}
+
+void drop_slab(void)
+{
+ int nid;
+
+ for_each_online_node(nid)
+ drop_slab_node(nid);
+}
+
static inline int is_page_cache_freeable(struct page *page)
{
/*
@@ -497,7 +538,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping,
}
if (mapping->a_ops->writepage == NULL)
return PAGE_ACTIVATE;
- if (!may_write_to_queue(mapping->backing_dev_info, sc))
+ if (!may_write_to_queue(inode_to_bdi(mapping->host), sc))
return PAGE_KEEP;
if (clear_page_dirty_for_io(page)) {
@@ -876,7 +917,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
mapping = page_mapping(page);
if (((dirty || writeback) && mapping &&
- bdi_write_congested(mapping->backing_dev_info)) ||
+ bdi_write_congested(inode_to_bdi(mapping->host))) ||
(writeback && PageReclaim(page)))
nr_congested++;
@@ -1903,8 +1944,12 @@ static void get_scan_count(struct lruvec *lruvec, int swappiness,
* latencies, so it's better to scan a minimum amount there as
* well.
*/
- if (current_is_kswapd() && !zone_reclaimable(zone))
- force_scan = true;
+ if (current_is_kswapd()) {
+ if (!zone_reclaimable(zone))
+ force_scan = true;
+ if (!mem_cgroup_lruvec_online(lruvec))
+ force_scan = true;
+ }
if (!global_reclaim(sc))
force_scan = true;
@@ -2269,6 +2314,7 @@ static inline bool should_continue_reclaim(struct zone *zone,
static bool shrink_zone(struct zone *zone, struct scan_control *sc,
bool is_classzone)
{
+ struct reclaim_state *reclaim_state = current->reclaim_state;
unsigned long nr_reclaimed, nr_scanned;
bool reclaimable = false;
@@ -2287,15 +2333,28 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
memcg = mem_cgroup_iter(root, NULL, &reclaim);
do {
unsigned long lru_pages;
+ unsigned long scanned;
struct lruvec *lruvec;
int swappiness;
+ if (mem_cgroup_low(root, memcg)) {
+ if (!sc->may_thrash)
+ continue;
+ mem_cgroup_events(memcg, MEMCG_LOW, 1);
+ }
+
lruvec = mem_cgroup_zone_lruvec(zone, memcg);
swappiness = mem_cgroup_swappiness(memcg);
+ scanned = sc->nr_scanned;
shrink_lruvec(lruvec, swappiness, sc, &lru_pages);
zone_lru_pages += lru_pages;
+ if (memcg && is_classzone)
+ shrink_slab(sc->gfp_mask, zone_to_nid(zone),
+ memcg, sc->nr_scanned - scanned,
+ lru_pages);
+
/*
* Direct reclaim and kswapd have to scan all memory
* cgroups to fulfill the overall scan target for the
@@ -2311,26 +2370,20 @@ static bool shrink_zone(struct zone *zone, struct scan_control *sc,
mem_cgroup_iter_break(root, memcg);
break;
}
- memcg = mem_cgroup_iter(root, memcg, &reclaim);
- } while (memcg);
+ } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
/*
* Shrink the slab caches in the same proportion that
* the eligible LRU pages were scanned.
*/
- if (global_reclaim(sc) && is_classzone) {
- struct reclaim_state *reclaim_state;
-
- shrink_node_slabs(sc->gfp_mask, zone_to_nid(zone),
- sc->nr_scanned - nr_scanned,
- zone_lru_pages);
-
- reclaim_state = current->reclaim_state;
- if (reclaim_state) {
- sc->nr_reclaimed +=
- reclaim_state->reclaimed_slab;
- reclaim_state->reclaimed_slab = 0;
- }
+ if (global_reclaim(sc) && is_classzone)
+ shrink_slab(sc->gfp_mask, zone_to_nid(zone), NULL,
+ sc->nr_scanned - nr_scanned,
+ zone_lru_pages);
+
+ if (reclaim_state) {
+ sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+ reclaim_state->reclaimed_slab = 0;
}
vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
@@ -2515,10 +2568,11 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
+ int initial_priority = sc->priority;
unsigned long total_scanned = 0;
unsigned long writeback_threshold;
bool zones_reclaimable;
-
+retry:
delayacct_freepages_start();
if (global_reclaim(sc))
@@ -2568,6 +2622,13 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
if (sc->compaction_ready)
return 1;
+ /* Untapped cgroup reserves? Don't OOM, retry. */
+ if (!sc->may_thrash) {
+ sc->priority = initial_priority;
+ sc->may_thrash = 1;
+ goto retry;
+ }
+
/* Any of the zones still reclaimable? Don't OOM. */
if (zones_reclaimable)
return 1;
@@ -3175,7 +3236,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait) &&
pfmemalloc_watermark_ok(pgdat))
- wake_up(&pgdat->pfmemalloc_wait);
+ wake_up_all(&pgdat->pfmemalloc_wait);
/*
* Fragmentation may mean that the system cannot be rebalanced
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 1284f89..4f5cd97 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -17,6 +17,9 @@
#include <linux/cpu.h>
#include <linux/cpumask.h>
#include <linux/vmstat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
#include <linux/sched.h>
#include <linux/math64.h>
#include <linux/writeback.h>
@@ -670,66 +673,6 @@ int fragmentation_index(struct zone *zone, unsigned int order)
}
#endif
-#if defined(CONFIG_PROC_FS) || defined(CONFIG_COMPACTION)
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-
-static char * const migratetype_names[MIGRATE_TYPES] = {
- "Unmovable",
- "Reclaimable",
- "Movable",
- "Reserve",
-#ifdef CONFIG_CMA
- "CMA",
-#endif
-#ifdef CONFIG_MEMORY_ISOLATION
- "Isolate",
-#endif
-};
-
-static void *frag_start(struct seq_file *m, loff_t *pos)
-{
- pg_data_t *pgdat;
- loff_t node = *pos;
- for (pgdat = first_online_pgdat();
- pgdat && node;
- pgdat = next_online_pgdat(pgdat))
- --node;
-
- return pgdat;
-}
-
-static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
-{
- pg_data_t *pgdat = (pg_data_t *)arg;
-
- (*pos)++;
- return next_online_pgdat(pgdat);
-}
-
-static void frag_stop(struct seq_file *m, void *arg)
-{
-}
-
-/* Walk all the zones in a node and print using a callback */
-static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
- void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
-{
- struct zone *zone;
- struct zone *node_zones = pgdat->node_zones;
- unsigned long flags;
-
- for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
- if (!populated_zone(zone))
- continue;
-
- spin_lock_irqsave(&zone->lock, flags);
- print(m, pgdat, zone);
- spin_unlock_irqrestore(&zone->lock, flags);
- }
-}
-#endif
-
#if defined(CONFIG_PROC_FS) || defined(CONFIG_SYSFS) || defined(CONFIG_NUMA)
#ifdef CONFIG_ZONE_DMA
#define TEXT_FOR_DMA(xx) xx "_dma",
@@ -907,7 +850,66 @@ const char * const vmstat_text[] = {
#endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */
+#if (defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)) || \
+ defined(CONFIG_PROC_FS)
+static void *frag_start(struct seq_file *m, loff_t *pos)
+{
+ pg_data_t *pgdat;
+ loff_t node = *pos;
+
+ for (pgdat = first_online_pgdat();
+ pgdat && node;
+ pgdat = next_online_pgdat(pgdat))
+ --node;
+
+ return pgdat;
+}
+
+static void *frag_next(struct seq_file *m, void *arg, loff_t *pos)
+{
+ pg_data_t *pgdat = (pg_data_t *)arg;
+
+ (*pos)++;
+ return next_online_pgdat(pgdat);
+}
+
+static void frag_stop(struct seq_file *m, void *arg)
+{
+}
+
+/* Walk all the zones in a node and print using a callback */
+static void walk_zones_in_node(struct seq_file *m, pg_data_t *pgdat,
+ void (*print)(struct seq_file *m, pg_data_t *, struct zone *))
+{
+ struct zone *zone;
+ struct zone *node_zones = pgdat->node_zones;
+ unsigned long flags;
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!populated_zone(zone))
+ continue;
+
+ spin_lock_irqsave(&zone->lock, flags);
+ print(m, pgdat, zone);
+ spin_unlock_irqrestore(&zone->lock, flags);
+ }
+}
+#endif
+
#ifdef CONFIG_PROC_FS
+static char * const migratetype_names[MIGRATE_TYPES] = {
+ "Unmovable",
+ "Reclaimable",
+ "Movable",
+ "Reserve",
+#ifdef CONFIG_CMA
+ "CMA",
+#endif
+#ifdef CONFIG_MEMORY_ISOLATION
+ "Isolate",
+#endif
+};
+
static void frag_show_print(struct seq_file *m, pg_data_t *pgdat,
struct zone *zone)
{
@@ -1435,8 +1437,8 @@ static void vmstat_shepherd(struct work_struct *w)
if (need_update(cpu) &&
cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
- schedule_delayed_work_on(cpu, &per_cpu(vmstat_work, cpu),
- __round_jiffies_relative(sysctl_stat_interval, cpu));
+ schedule_delayed_work_on(cpu,
+ &per_cpu(vmstat_work, cpu), 0);
put_online_cpus();
@@ -1450,7 +1452,7 @@ static void __init start_shepherd_timer(void)
int cpu;
for_each_possible_cpu(cpu)
- INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
+ INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
vmstat_update);
if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
@@ -1536,8 +1538,6 @@ static int __init setup_vmstat(void)
module_init(setup_vmstat)
#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_COMPACTION)
-#include <linux/debugfs.h>
-
/*
* Return an index indicating how much of the available free memory is
diff --git a/mm/workingset.c b/mm/workingset.c
index f7216fa..aa01713 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+ shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
local_irq_enable();
pages = node_present_pages(sc->nid);
@@ -302,6 +302,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
}
static enum lru_status shadow_lru_isolate(struct list_head *item,
+ struct list_lru_one *lru,
spinlock_t *lru_lock,
void *arg)
{
@@ -332,7 +333,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out;
}
- list_del_init(item);
+ list_lru_isolate(lru, item);
spin_unlock(lru_lock);
/*
@@ -376,8 +377,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
local_irq_disable();
- ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
- shadow_lru_isolate, NULL, &sc->nr_to_scan);
+ ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
+ shadow_lru_isolate, NULL);
local_irq_enable();
return ret;
}
diff --git a/mm/zbud.c b/mm/zbud.c
index 4e387be..2ee4e45 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -130,7 +130,8 @@ static struct zbud_ops zbud_zpool_ops = {
.evict = zbud_zpool_evict
};
-static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zbud_zpool_create(char *name, gfp_t gfp,
+ struct zpool_ops *zpool_ops)
{
return zbud_create_pool(gfp, zpool_ops ? &zbud_zpool_ops : NULL);
}
diff --git a/mm/zpool.c b/mm/zpool.c
index 739cdf0..bacdab6 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -129,6 +129,7 @@ static void zpool_put_driver(struct zpool_driver *driver)
/**
* zpool_create_pool() - Create a new zpool
* @type The type of the zpool to create (e.g. zbud, zsmalloc)
+ * @name The name of the zpool (e.g. zram0, zswap)
* @gfp The GFP flags to use when allocating the pool.
* @ops The optional ops callback.
*
@@ -140,7 +141,8 @@ static void zpool_put_driver(struct zpool_driver *driver)
*
* Returns: New zpool on success, NULL on failure.
*/
-struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
+struct zpool *zpool_create_pool(char *type, char *name, gfp_t gfp,
+ struct zpool_ops *ops)
{
struct zpool_driver *driver;
struct zpool *zpool;
@@ -168,7 +170,7 @@ struct zpool *zpool_create_pool(char *type, gfp_t gfp, struct zpool_ops *ops)
zpool->type = driver->type;
zpool->driver = driver;
- zpool->pool = driver->create(gfp, ops);
+ zpool->pool = driver->create(name, gfp, ops);
zpool->ops = ops;
if (!zpool->pool) {
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index b724039..0dec1fa 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -91,6 +91,7 @@
#include <linux/hardirq.h>
#include <linux/spinlock.h>
#include <linux/types.h>
+#include <linux/debugfs.h>
#include <linux/zsmalloc.h>
#include <linux/zpool.h>
@@ -168,6 +169,22 @@ enum fullness_group {
ZS_FULL
};
+enum zs_stat_type {
+ OBJ_ALLOCATED,
+ OBJ_USED,
+ NR_ZS_STAT_TYPE,
+};
+
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static struct dentry *zs_stat_root;
+
+struct zs_size_stat {
+ unsigned long objs[NR_ZS_STAT_TYPE];
+};
+
+#endif
+
/*
* number of size_classes
*/
@@ -200,6 +217,10 @@ struct size_class {
/* Number of PAGE_SIZE sized pages to combine to form a 'zspage' */
int pages_per_zspage;
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct zs_size_stat stats;
+#endif
+
spinlock_t lock;
struct page *fullness_list[_ZS_NR_FULLNESS_GROUPS];
@@ -217,10 +238,16 @@ struct link_free {
};
struct zs_pool {
+ char *name;
+
struct size_class **size_class;
gfp_t flags; /* allocation flags used when growing pool */
atomic_long_t pages_allocated;
+
+#ifdef CONFIG_ZSMALLOC_STAT
+ struct dentry *stat_dentry;
+#endif
};
/*
@@ -246,9 +273,9 @@ struct mapping_area {
#ifdef CONFIG_ZPOOL
-static void *zs_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
+static void *zs_zpool_create(char *name, gfp_t gfp, struct zpool_ops *zpool_ops)
{
- return zs_create_pool(gfp);
+ return zs_create_pool(name, gfp);
}
static void zs_zpool_destroy(void *pool)
@@ -942,6 +969,166 @@ static bool can_merge(struct size_class *prev, int size, int pages_per_zspage)
return true;
}
+#ifdef CONFIG_ZSMALLOC_STAT
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] += cnt;
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+ class->stats.objs[type] -= cnt;
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return class->stats.objs[type];
+}
+
+static int __init zs_stat_init(void)
+{
+ if (!debugfs_initialized())
+ return -ENODEV;
+
+ zs_stat_root = debugfs_create_dir("zsmalloc", NULL);
+ if (!zs_stat_root)
+ return -ENOMEM;
+
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+ debugfs_remove_recursive(zs_stat_root);
+}
+
+static int zs_stats_size_show(struct seq_file *s, void *v)
+{
+ int i;
+ struct zs_pool *pool = s->private;
+ struct size_class *class;
+ int objs_per_zspage;
+ unsigned long obj_allocated, obj_used, pages_used;
+ unsigned long total_objs = 0, total_used_objs = 0, total_pages = 0;
+
+ seq_printf(s, " %5s %5s %13s %10s %10s\n", "class", "size",
+ "obj_allocated", "obj_used", "pages_used");
+
+ for (i = 0; i < zs_size_classes; i++) {
+ class = pool->size_class[i];
+
+ if (class->index != i)
+ continue;
+
+ spin_lock(&class->lock);
+ obj_allocated = zs_stat_get(class, OBJ_ALLOCATED);
+ obj_used = zs_stat_get(class, OBJ_USED);
+ spin_unlock(&class->lock);
+
+ objs_per_zspage = get_maxobj_per_zspage(class->size,
+ class->pages_per_zspage);
+ pages_used = obj_allocated / objs_per_zspage *
+ class->pages_per_zspage;
+
+ seq_printf(s, " %5u %5u %10lu %10lu %10lu\n", i,
+ class->size, obj_allocated, obj_used, pages_used);
+
+ total_objs += obj_allocated;
+ total_used_objs += obj_used;
+ total_pages += pages_used;
+ }
+
+ seq_puts(s, "\n");
+ seq_printf(s, " %5s %5s %10lu %10lu %10lu\n", "Total", "",
+ total_objs, total_used_objs, total_pages);
+
+ return 0;
+}
+
+static int zs_stats_size_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, zs_stats_size_show, inode->i_private);
+}
+
+static const struct file_operations zs_stat_size_ops = {
+ .open = zs_stats_size_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ struct dentry *entry;
+
+ if (!zs_stat_root)
+ return -ENODEV;
+
+ entry = debugfs_create_dir(name, zs_stat_root);
+ if (!entry) {
+ pr_warn("debugfs dir <%s> creation failed\n", name);
+ return -ENOMEM;
+ }
+ pool->stat_dentry = entry;
+
+ entry = debugfs_create_file("obj_in_classes", S_IFREG | S_IRUGO,
+ pool->stat_dentry, pool, &zs_stat_size_ops);
+ if (!entry) {
+ pr_warn("%s: debugfs file entry <%s> creation failed\n",
+ name, "obj_in_classes");
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+
+static void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+ debugfs_remove_recursive(pool->stat_dentry);
+}
+
+#else /* CONFIG_ZSMALLOC_STAT */
+
+static inline void zs_stat_inc(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline void zs_stat_dec(struct size_class *class,
+ enum zs_stat_type type, unsigned long cnt)
+{
+}
+
+static inline unsigned long zs_stat_get(struct size_class *class,
+ enum zs_stat_type type)
+{
+ return 0;
+}
+
+static int __init zs_stat_init(void)
+{
+ return 0;
+}
+
+static void __exit zs_stat_exit(void)
+{
+}
+
+static inline int zs_pool_stat_create(char *name, struct zs_pool *pool)
+{
+ return 0;
+}
+
+static inline void zs_pool_stat_destroy(struct zs_pool *pool)
+{
+}
+
+#endif
+
unsigned long zs_get_total_pages(struct zs_pool *pool)
{
return atomic_long_read(&pool->pages_allocated);
@@ -1074,7 +1261,10 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
set_zspage_mapping(first_page, class->index, ZS_EMPTY);
atomic_long_add(class->pages_per_zspage,
&pool->pages_allocated);
+
spin_lock(&class->lock);
+ zs_stat_inc(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
}
obj = (unsigned long)first_page->freelist;
@@ -1088,6 +1278,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size)
kunmap_atomic(vaddr);
first_page->inuse++;
+ zs_stat_inc(class, OBJ_USED, 1);
/* Now move the zspage to another fullness group, if required */
fix_fullness_group(pool, first_page);
spin_unlock(&class->lock);
@@ -1128,6 +1319,12 @@ void zs_free(struct zs_pool *pool, unsigned long obj)
first_page->inuse--;
fullness = fix_fullness_group(pool, first_page);
+
+ zs_stat_dec(class, OBJ_USED, 1);
+ if (fullness == ZS_EMPTY)
+ zs_stat_dec(class, OBJ_ALLOCATED, get_maxobj_per_zspage(
+ class->size, class->pages_per_zspage));
+
spin_unlock(&class->lock);
if (fullness == ZS_EMPTY) {
@@ -1148,7 +1345,7 @@ EXPORT_SYMBOL_GPL(zs_free);
* On success, a pointer to the newly created pool is returned,
* otherwise NULL.
*/
-struct zs_pool *zs_create_pool(gfp_t flags)
+struct zs_pool *zs_create_pool(char *name, gfp_t flags)
{
int i;
struct zs_pool *pool;
@@ -1158,9 +1355,16 @@ struct zs_pool *zs_create_pool(gfp_t flags)
if (!pool)
return NULL;
+ pool->name = kstrdup(name, GFP_KERNEL);
+ if (!pool->name) {
+ kfree(pool);
+ return NULL;
+ }
+
pool->size_class = kcalloc(zs_size_classes, sizeof(struct size_class *),
GFP_KERNEL);
if (!pool->size_class) {
+ kfree(pool->name);
kfree(pool);
return NULL;
}
@@ -1210,6 +1414,9 @@ struct zs_pool *zs_create_pool(gfp_t flags)
pool->flags = flags;
+ if (zs_pool_stat_create(name, pool))
+ goto err;
+
return pool;
err:
@@ -1222,6 +1429,8 @@ void zs_destroy_pool(struct zs_pool *pool)
{
int i;
+ zs_pool_stat_destroy(pool);
+
for (i = 0; i < zs_size_classes; i++) {
int fg;
struct size_class *class = pool->size_class[i];
@@ -1242,6 +1451,7 @@ void zs_destroy_pool(struct zs_pool *pool)
}
kfree(pool->size_class);
+ kfree(pool->name);
kfree(pool);
}
EXPORT_SYMBOL_GPL(zs_destroy_pool);
@@ -1250,17 +1460,30 @@ static int __init zs_init(void)
{
int ret = zs_register_cpu_notifier();
- if (ret) {
- zs_unregister_cpu_notifier();
- return ret;
- }
+ if (ret)
+ goto notifier_fail;
init_zs_size_classes();
#ifdef CONFIG_ZPOOL
zpool_register_driver(&zs_zpool_driver);
#endif
+
+ ret = zs_stat_init();
+ if (ret) {
+ pr_err("zs stat initialization failed\n");
+ goto stat_fail;
+ }
return 0;
+
+stat_fail:
+#ifdef CONFIG_ZPOOL
+ zpool_unregister_driver(&zs_zpool_driver);
+#endif
+notifier_fail:
+ zs_unregister_cpu_notifier();
+
+ return ret;
}
static void __exit zs_exit(void)
@@ -1269,6 +1492,8 @@ static void __exit zs_exit(void)
zpool_unregister_driver(&zs_zpool_driver);
#endif
zs_unregister_cpu_notifier();
+
+ zs_stat_exit();
}
module_init(zs_init);
diff --git a/mm/zswap.c b/mm/zswap.c
index 0cfce9b..4249e82 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -906,11 +906,12 @@ static int __init init_zswap(void)
pr_info("loading zswap\n");
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp, &zswap_zpool_ops);
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
+ &zswap_zpool_ops);
if (!zswap_pool && strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
pr_info("%s zpool not available\n", zswap_zpool_type);
zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
- zswap_pool = zpool_create_pool(zswap_zpool_type, gfp,
+ zswap_pool = zpool_create_pool(zswap_zpool_type, "zswap", gfp,
&zswap_zpool_ops);
}
if (!zswap_pool) {