diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 20 | ||||
-rw-r--r-- | mm/Makefile | 6 | ||||
-rw-r--r-- | mm/backing-dev.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 8 | ||||
-rw-r--r-- | mm/filemap_xip.c | 2 | ||||
-rw-r--r-- | mm/hugetlb.c | 14 | ||||
-rw-r--r-- | mm/hwpoison-inject.c | 41 | ||||
-rw-r--r-- | mm/kmemleak.c | 5 | ||||
-rw-r--r-- | mm/ksm.c | 12 | ||||
-rw-r--r-- | mm/madvise.c | 30 | ||||
-rw-r--r-- | mm/memcontrol.c | 712 | ||||
-rw-r--r-- | mm/memory-failure.c | 832 | ||||
-rw-r--r-- | mm/memory.c | 86 | ||||
-rw-r--r-- | mm/migrate.c | 2 | ||||
-rw-r--r-- | mm/mmap.c | 2 | ||||
-rw-r--r-- | mm/mremap.c | 4 | ||||
-rw-r--r-- | mm/nommu.c | 84 | ||||
-rw-r--r-- | mm/page-writeback.c | 60 | ||||
-rw-r--r-- | mm/page_alloc.c | 44 | ||||
-rw-r--r-- | mm/percpu.c | 84 | ||||
-rw-r--r-- | mm/quicklist.c | 3 | ||||
-rw-r--r-- | mm/rmap.c | 64 | ||||
-rw-r--r-- | mm/shmem.c | 14 | ||||
-rw-r--r-- | mm/swapfile.c | 16 | ||||
-rw-r--r-- | mm/truncate.c | 136 | ||||
-rw-r--r-- | mm/vmalloc.c | 50 | ||||
-rw-r--r-- | mm/vmscan.c | 59 |
27 files changed, 2020 insertions, 372 deletions
@@ -224,7 +224,9 @@ config KSM the many instances by a single resident page with that content, so saving memory until one or another app needs to modify the content. Recommended for use with KVM, or with other duplicative applications. - See Documentation/vm/ksm.txt for more information. + See Documentation/vm/ksm.txt for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). config DEFAULT_MMAP_MIN_ADDR int "Low address space to protect from user allocation" @@ -244,6 +246,22 @@ config DEFAULT_MMAP_MIN_ADDR This value can be changed after boot using the /proc/sys/vm/mmap_min_addr tunable. +config ARCH_SUPPORTS_MEMORY_FAILURE + bool + +config MEMORY_FAILURE + depends on MMU + depends on ARCH_SUPPORTS_MEMORY_FAILURE + bool "Enable recovery from hardware memory errors" + help + Enables code to recover from some memory failures on systems + with MCA recovery. This allows a system to continue running + even when some of its memory has uncorrected errors. This requires + special hardware support and typically ECC memory. + +config HWPOISON_INJECT + tristate "Poison pages injector" + depends on MEMORY_FAILURE && DEBUG_KERNEL config NOMMU_INITIAL_TRIM_EXCESS int "Turn on mmap() excess space trimming before booting" diff --git a/mm/Makefile b/mm/Makefile index 88193d7..ebf8490 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,14 +5,14 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o + vmalloc.o pagewalk.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ page_isolation.o mm_init.o mmu_context.o \ - pagewalk.o $(mmu-y) + $(mmu-y) obj-y += init-mm.o obj-$(CONFIG_BOUNCE) += bounce.o @@ -41,5 +41,7 @@ obj-$(CONFIG_SMP) += allocpercpu.o endif obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o +obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3d3accb..5a37e20 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -92,7 +92,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) "BdiDirtyThresh: %8lu kB\n" "DirtyThresh: %8lu kB\n" "BackgroundThresh: %8lu kB\n" - "WriteBack threads:%8lu\n" + "WritebackThreads: %8lu\n" "b_dirty: %8lu\n" "b_io: %8lu\n" "b_more_io: %8lu\n" diff --git a/mm/filemap.c b/mm/filemap.c index bcc7372..ef169f3 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -58,7 +58,7 @@ /* * Lock ordering: * - * ->i_mmap_lock (vmtruncate) + * ->i_mmap_lock (truncate_pagecache) * ->private_lock (__free_pte->__set_page_dirty_buffers) * ->swap_lock (exclusive_swap_page, others) * ->mapping->tree_lock @@ -104,6 +104,10 @@ * * ->task->proc_lock * ->dcache_lock (proc_pid_lookup) + * + * (code doesn't rely on that order, so you could switch it around) + * ->tasklist_lock (memory_failure, collect_procs_ao) + * ->i_mmap_lock */ /* @@ -1607,7 +1611,7 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { .fault = filemap_fault, }; diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c index 427dfe3..1888b2d 100644 --- a/mm/filemap_xip.c +++ b/mm/filemap_xip.c @@ -296,7 +296,7 @@ out: } } -static struct vm_operations_struct xip_file_vm_ops = { +static const struct vm_operations_struct xip_file_vm_ops = { .fault = xip_file_fault, }; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 815dbd4..5d7601b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1537,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1548,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) h->max_huge_pages = set_max_huge_pages(h, tmp); @@ -1557,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (hugepages_treat_as_movable) htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; else @@ -1569,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, } int hugetlb_overcommit_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1580,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) { spin_lock(&hugetlb_lock); @@ -1721,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -struct vm_operations_struct hugetlb_vm_ops = { +const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c new file mode 100644 index 0000000..e1d8513 --- /dev/null +++ b/mm/hwpoison-inject.c @@ -0,0 +1,41 @@ +/* Inject a hwpoison memory failure on a arbitary pfn */ +#include <linux/module.h> +#include <linux/debugfs.h> +#include <linux/kernel.h> +#include <linux/mm.h> + +static struct dentry *hwpoison_dir, *corrupt_pfn; + +static int hwpoison_inject(void *data, u64 val) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + printk(KERN_INFO "Injecting memory failure at pfn %Lx\n", val); + return __memory_failure(val, 18, 0); +} + +DEFINE_SIMPLE_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); + +static void pfn_inject_exit(void) +{ + if (hwpoison_dir) + debugfs_remove_recursive(hwpoison_dir); +} + +static int pfn_inject_init(void) +{ + hwpoison_dir = debugfs_create_dir("hwpoison", NULL); + if (hwpoison_dir == NULL) + return -ENOMEM; + corrupt_pfn = debugfs_create_file("corrupt-pfn", 0600, hwpoison_dir, + NULL, &hwpoison_fops); + if (corrupt_pfn == NULL) { + pfn_inject_exit(); + return -ENOMEM; + } + return 0; +} + +module_init(pfn_inject_init); +module_exit(pfn_inject_exit); +MODULE_LICENSE("GPL"); diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 4ea4510..8bf765c 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -833,12 +833,15 @@ static void early_alloc(struct early_log *log) */ rcu_read_lock(); object = create_object((unsigned long)log->ptr, log->size, - log->min_count, GFP_KERNEL); + log->min_count, GFP_ATOMIC); + if (!object) + goto out; spin_lock_irqsave(&object->lock, flags); for (i = 0; i < log->trace_len; i++) object->trace[i] = log->trace[i]; object->trace_len = log->trace_len; spin_unlock_irqrestore(&object->lock, flags); +out: rcu_read_unlock(); } @@ -30,6 +30,7 @@ #include <linux/slab.h> #include <linux/rbtree.h> #include <linux/mmu_notifier.h> +#include <linux/swap.h> #include <linux/ksm.h> #include <asm/tlbflush.h> @@ -162,10 +163,10 @@ static unsigned long ksm_pages_unshared; static unsigned long ksm_rmap_items; /* Limit on the number of unswappable pages used */ -static unsigned long ksm_max_kernel_pages = 2000; +static unsigned long ksm_max_kernel_pages; /* Number of pages ksmd should scan in one batch */ -static unsigned int ksm_thread_pages_to_scan = 200; +static unsigned int ksm_thread_pages_to_scan = 100; /* Milliseconds ksmd should sleep between batches */ static unsigned int ksm_thread_sleep_millisecs = 20; @@ -173,7 +174,7 @@ static unsigned int ksm_thread_sleep_millisecs = 20; #define KSM_RUN_STOP 0 #define KSM_RUN_MERGE 1 #define KSM_RUN_UNMERGE 2 -static unsigned int ksm_run = KSM_RUN_MERGE; +static unsigned int ksm_run = KSM_RUN_STOP; static DECLARE_WAIT_QUEUE_HEAD(ksm_thread_wait); static DEFINE_MUTEX(ksm_thread_mutex); @@ -1667,6 +1668,8 @@ static int __init ksm_init(void) struct task_struct *ksm_thread; int err; + ksm_max_kernel_pages = totalram_pages / 4; + err = ksm_slab_init(); if (err) goto out; @@ -1689,6 +1692,9 @@ static int __init ksm_init(void) kthread_stop(ksm_thread); goto out_free2; } +#else + ksm_run = KSM_RUN_MERGE; /* no way for user to start it */ + #endif /* CONFIG_SYSFS */ return 0; diff --git a/mm/madvise.c b/mm/madvise.c index d9ae206..35b1479 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -218,6 +218,32 @@ static long madvise_remove(struct vm_area_struct *vma, return error; } +#ifdef CONFIG_MEMORY_FAILURE +/* + * Error injection support for memory error handling. + */ +static int madvise_hwpoison(unsigned long start, unsigned long end) +{ + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + for (; start < end; start += PAGE_SIZE) { + struct page *p; + int ret = get_user_pages(current, current->mm, start, 1, + 0, 0, &p, NULL); + if (ret != 1) + return ret; + printk(KERN_INFO "Injecting memory failure for page %lx at %lx\n", + page_to_pfn(p), start); + /* Ignore return value for now */ + __memory_failure(page_to_pfn(p), 0, 1); + put_page(p); + } + return ret; +} +#endif + static long madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, int behavior) @@ -308,6 +334,10 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) int write; size_t len; +#ifdef CONFIG_MEMORY_FAILURE + if (behavior == MADV_HWPOISON) + return madvise_hwpoison(start, start+len_in); +#endif if (!madvise_behavior_valid(behavior)) return error; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9b10d87..f99f599 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -29,6 +29,7 @@ #include <linux/rcupdate.h> #include <linux/limits.h> #include <linux/mutex.h> +#include <linux/rbtree.h> #include <linux/slab.h> #include <linux/swap.h> #include <linux/spinlock.h> @@ -43,6 +44,7 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 +struct mem_cgroup *root_mem_cgroup __read_mostly; #ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ @@ -53,6 +55,7 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/ #endif static DEFINE_MUTEX(memcg_tasklist); /* can be hold under cgroup_mutex */ +#define SOFTLIMIT_EVENTS_THRESH (1000) /* * Statistics for memory cgroup. @@ -66,6 +69,8 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_MAPPED_FILE, /* # of pages charged as file rss */ MEM_CGROUP_STAT_PGPGIN_COUNT, /* # of pages paged in */ MEM_CGROUP_STAT_PGPGOUT_COUNT, /* # of pages paged out */ + MEM_CGROUP_STAT_EVENTS, /* sum of pagein + pageout for internal use */ + MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, }; @@ -78,6 +83,20 @@ struct mem_cgroup_stat { struct mem_cgroup_stat_cpu cpustat[0]; }; +static inline void +__mem_cgroup_stat_reset_safe(struct mem_cgroup_stat_cpu *stat, + enum mem_cgroup_stat_index idx) +{ + stat->count[idx] = 0; +} + +static inline s64 +__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat, + enum mem_cgroup_stat_index idx) +{ + return stat->count[idx]; +} + /* * For accounting under irq disable, no need for increment preempt count. */ @@ -117,6 +136,12 @@ struct mem_cgroup_per_zone { unsigned long count[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; + struct rb_node tree_node; /* RB tree node */ + unsigned long long usage_in_excess;/* Set to the value by which */ + /* the soft limit is exceeded*/ + bool on_tree; + struct mem_cgroup *mem; /* Back pointer, we cannot */ + /* use container_of */ }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -130,6 +155,26 @@ struct mem_cgroup_lru_info { }; /* + * Cgroups above their limits are maintained in a RB-Tree, independent of + * their hierarchy representation + */ + +struct mem_cgroup_tree_per_zone { + struct rb_root rb_root; + spinlock_t lock; +}; + +struct mem_cgroup_tree_per_node { + struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES]; +}; + +struct mem_cgroup_tree { + struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES]; +}; + +static struct mem_cgroup_tree soft_limit_tree __read_mostly; + +/* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide * statistics based on the statistics developed by Rik Van Riel for clock-pro, @@ -186,6 +231,13 @@ struct mem_cgroup { struct mem_cgroup_stat stat; }; +/* + * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft + * limit reclaim to prevent infinite loops, if they ever occur. + */ +#define MEM_CGROUP_MAX_RECLAIM_LOOPS (100) +#define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS (2) + enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, @@ -200,13 +252,8 @@ enum charge_type { #define PCGF_CACHE (1UL << PCG_CACHE) #define PCGF_USED (1UL << PCG_USED) #define PCGF_LOCK (1UL << PCG_LOCK) -static const unsigned long -pcg_default_flags[NR_CHARGE_TYPE] = { - PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* File Cache */ - PCGF_USED | PCGF_LOCK, /* Anon */ - PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ - 0, /* FORCE */ -}; +/* Not used, but added here for completeness */ +#define PCGF_ACCT (1UL << PCG_ACCT) /* for encoding cft->private value on file */ #define _MEM (0) @@ -215,15 +262,237 @@ pcg_default_flags[NR_CHARGE_TYPE] = { #define MEMFILE_TYPE(val) (((val) >> 16) & 0xffff) #define MEMFILE_ATTR(val) ((val) & 0xffff) +/* + * Reclaim flags for mem_cgroup_hierarchical_reclaim + */ +#define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 +#define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) +#define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 +#define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) +#define MEM_CGROUP_RECLAIM_SOFT_BIT 0x2 +#define MEM_CGROUP_RECLAIM_SOFT (1 << MEM_CGROUP_RECLAIM_SOFT_BIT) + static void mem_cgroup_get(struct mem_cgroup *mem); static void mem_cgroup_put(struct mem_cgroup *mem); static struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *mem); +static struct mem_cgroup_per_zone * +mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) +{ + return &mem->info.nodeinfo[nid]->zoneinfo[zid]; +} + +static struct mem_cgroup_per_zone * +page_cgroup_zoneinfo(struct page_cgroup *pc) +{ + struct mem_cgroup *mem = pc->mem_cgroup; + int nid = page_cgroup_nid(pc); + int zid = page_cgroup_zid(pc); + + if (!mem) + return NULL; + + return mem_cgroup_zoneinfo(mem, nid, zid); +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_node_zone(int nid, int zid) +{ + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static struct mem_cgroup_tree_per_zone * +soft_limit_tree_from_page(struct page *page) +{ + int nid = page_to_nid(page); + int zid = page_zonenum(page); + + return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid]; +} + +static void +__mem_cgroup_insert_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz, + unsigned long long new_usage_in_excess) +{ + struct rb_node **p = &mctz->rb_root.rb_node; + struct rb_node *parent = NULL; + struct mem_cgroup_per_zone *mz_node; + + if (mz->on_tree) + return; + + mz->usage_in_excess = new_usage_in_excess; + if (!mz->usage_in_excess) + return; + while (*p) { + parent = *p; + mz_node = rb_entry(parent, struct mem_cgroup_per_zone, + tree_node); + if (mz->usage_in_excess < mz_node->usage_in_excess) + p = &(*p)->rb_left; + /* + * We can't avoid mem cgroups that are over their soft + * limit by the same amount + */ + else if (mz->usage_in_excess >= mz_node->usage_in_excess) + p = &(*p)->rb_right; + } + rb_link_node(&mz->tree_node, parent, p); + rb_insert_color(&mz->tree_node, &mctz->rb_root); + mz->on_tree = true; +} + +static void +__mem_cgroup_remove_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + if (!mz->on_tree) + return; + rb_erase(&mz->tree_node, &mctz->rb_root); + mz->on_tree = false; +} + +static void +mem_cgroup_remove_exceeded(struct mem_cgroup *mem, + struct mem_cgroup_per_zone *mz, + struct mem_cgroup_tree_per_zone *mctz) +{ + spin_lock(&mctz->lock); + __mem_cgroup_remove_exceeded(mem, mz, mctz); + spin_unlock(&mctz->lock); +} + +static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem) +{ + bool ret = false; + int cpu; + s64 val; + struct mem_cgroup_stat_cpu *cpustat; + + cpu = get_cpu(); + cpustat = &mem->stat.cpustat[cpu]; + val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_EVENTS); + if (unlikely(val > SOFTLIMIT_EVENTS_THRESH)) { + __mem_cgroup_stat_reset_safe(cpustat, MEM_CGROUP_STAT_EVENTS); + ret = true; + } + put_cpu(); + return ret; +} + +static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page) +{ + unsigned long long excess; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + int nid = page_to_nid(page); + int zid = page_zonenum(page); + mctz = soft_limit_tree_from_page(page); + + /* + * Necessary to update all ancestors when hierarchy is used. + * because their event counter is not touched. + */ + for (; mem; mem = parent_mem_cgroup(mem)) { + mz = mem_cgroup_zoneinfo(mem, nid, zid); + excess = res_counter_soft_limit_excess(&mem->res); + /* + * We have to update the tree if mz is on RB-tree or + * mem is over its softlimit. + */ + if (excess || mz->on_tree) { + spin_lock(&mctz->lock); + /* if on-tree, remove it */ + if (mz->on_tree) + __mem_cgroup_remove_exceeded(mem, mz, mctz); + /* + * Insert again. mz->usage_in_excess will be updated. + * If excess is 0, no tree ops. + */ + __mem_cgroup_insert_exceeded(mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + } + } +} + +static void mem_cgroup_remove_from_trees(struct mem_cgroup *mem) +{ + int node, zone; + struct mem_cgroup_per_zone *mz; + struct mem_cgroup_tree_per_zone *mctz; + + for_each_node_state(node, N_POSSIBLE) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + mz = mem_cgroup_zoneinfo(mem, node, zone); + mctz = soft_limit_tree_node_zone(node, zone); + mem_cgroup_remove_exceeded(mem, mz, mctz); + } + } +} + +static inline unsigned long mem_cgroup_get_excess(struct mem_cgroup *mem) +{ + return res_counter_soft_limit_excess(&mem->res) >> PAGE_SHIFT; +} + +static struct mem_cgroup_per_zone * +__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct rb_node *rightmost = NULL; + struct mem_cgroup_per_zone *mz; + +retry: + mz = NULL; + rightmost = rb_last(&mctz->rb_root); + if (!rightmost) + goto done; /* Nothing to reclaim from */ + + mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node); + /* + * Remove the node now but someone else can add it back, + * we will to add it back at the end of reclaim to its correct + * position in the tree. + */ + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + if (!res_counter_soft_limit_excess(&mz->mem->res) || + !css_tryget(&mz->mem->css)) + goto retry; +done: + return mz; +} + +static struct mem_cgroup_per_zone * +mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz) +{ + struct mem_cgroup_per_zone *mz; + + spin_lock(&mctz->lock); + mz = __mem_cgroup_largest_soft_limit_node(mctz); + spin_unlock(&mctz->lock); + return mz; +} + +static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, + bool charge) +{ + int val = (charge) ? 1 : -1; + struct mem_cgroup_stat *stat = &mem->stat; + struct mem_cgroup_stat_cpu *cpustat; + int cpu = get_cpu(); + + cpustat = &stat->cpustat[cpu]; + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val); + put_cpu(); +} + static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, struct page_cgroup *pc, bool charge) { - int val = (charge)? 1 : -1; + int val = (charge) ? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; struct mem_cgroup_stat_cpu *cpustat; int cpu = get_cpu(); @@ -240,28 +509,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, else __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_EVENTS, 1); put_cpu(); } -static struct mem_cgroup_per_zone * -mem_cgroup_zoneinfo(struct mem_cgroup *mem, int nid, int zid) -{ - return &mem->info.nodeinfo[nid]->zoneinfo[zid]; -} - -static struct mem_cgroup_per_zone * -page_cgroup_zoneinfo(struct page_cgroup *pc) -{ - struct mem_cgroup *mem = pc->mem_cgroup; - int nid = page_cgroup_nid(pc); - int zid = page_cgroup_zid(pc); - - if (!mem) - return NULL; - - return mem_cgroup_zoneinfo(mem, nid, zid); -} - static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem, enum lru_list idx) { @@ -354,6 +605,11 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, return ret; } +static inline bool mem_cgroup_is_root(struct mem_cgroup *mem) +{ + return (mem == root_mem_cgroup); +} + /* * Following LRU functions are allowed to be used without PCG_LOCK. * Operations are called by routine of global LRU independently from memcg. @@ -371,22 +627,24 @@ static int mem_cgroup_walk_tree(struct mem_cgroup *root, void *data, void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) { struct page_cgroup *pc; - struct mem_cgroup *mem; struct mem_cgroup_per_zone *mz; if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); /* can happen while we handle swapcache. */ - if (list_empty(&pc->lru) || !pc->mem_cgroup) + if (!TestClearPageCgroupAcctLRU(pc)) return; + VM_BUG_ON(!pc->mem_cgroup); /* * We don't check PCG_USED bit. It's cleared when the "page" is finally * removed from global LRU. */ mz = page_cgroup_zoneinfo(pc); - mem = pc->mem_cgroup; MEM_CGROUP_ZSTAT(mz, lru) -= 1; + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; + VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); return; } @@ -410,8 +668,8 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) * For making pc->mem_cgroup visible, insert smp_rmb() here. */ smp_rmb(); - /* unused page is not rotated. */ - if (!PageCgroupUsed(pc)) + /* unused or root page is not rotated. */ + if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -425,6 +683,7 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); + VM_BUG_ON(PageCgroupAcctLRU(pc)); /* * Used bit is set without atomic ops but after smp_wmb(). * For making pc->mem_cgroup visible, insert smp_rmb() here. @@ -435,6 +694,9 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) mz = page_cgroup_zoneinfo(pc); MEM_CGROUP_ZSTAT(mz, lru) += 1; + SetPageCgroupAcctLRU(pc); + if (mem_cgroup_is_root(pc->mem_cgroup)) + return; list_add(&pc->lru, &mz->lists[lru]); } @@ -469,7 +731,7 @@ static void mem_cgroup_lru_add_after_commit_swapcache(struct page *page) spin_lock_irqsave(&zone->lru_lock, flags); /* link when the page is linked to LRU but page_cgroup isn't */ - if (PageLRU(page) && list_empty(&pc->lru)) + if (PageLRU(page) && !PageCgroupAcctLRU(pc)) mem_cgroup_add_lru_list(page, page_lru(page)); spin_unlock_irqrestore(&zone->lru_lock, flags); } @@ -855,28 +1117,62 @@ mem_cgroup_select_victim(struct mem_cgroup *root_mem) * If shrink==true, for avoiding to free too much, this returns immedieately. */ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, - gfp_t gfp_mask, bool noswap, bool shrink) + struct zone *zone, + gfp_t gfp_mask, + unsigned long reclaim_options) { struct mem_cgroup *victim; int ret, total = 0; int loop = 0; + bool noswap = reclaim_options & MEM_CGROUP_RECLAIM_NOSWAP; + bool shrink = reclaim_options & MEM_CGROUP_RECLAIM_SHRINK; + bool check_soft = reclaim_options & MEM_CGROUP_RECLAIM_SOFT; + unsigned long excess = mem_cgroup_get_excess(root_mem); /* If memsw_is_minimum==1, swap-out is of-no-use. */ if (root_mem->memsw_is_minimum) noswap = true; - while (loop < 2) { + while (1) { victim = mem_cgroup_select_victim(root_mem); - if (victim == root_mem) + if (victim == root_mem) { loop++; + if (loop >= 2) { + /* + * If we have not been able to reclaim + * anything, it might because there are + * no reclaimable pages under this hierarchy + */ + if (!check_soft || !total) { + css_put(&victim->css); + break; + } + /* + * We want to do more targetted reclaim. + * excess >> 2 is not to excessive so as to + * reclaim too much, nor too less that we keep + * coming back to reclaim from this cgroup + */ + if (total >= (excess >> 2) || + (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) { + css_put(&victim->css); + break; + } + } + } if (!mem_cgroup_local_usage(&victim->stat)) { /* this cgroup's local usage == 0 */ css_put(&victim->css); continue; } /* we use swappiness of local cgroup */ - ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, noswap, - get_swappiness(victim)); + if (check_soft) + ret = mem_cgroup_shrink_node_zone(victim, gfp_mask, + noswap, get_swappiness(victim), zone, + zone->zone_pgdat->node_id); + else + ret = try_to_free_mem_cgroup_pages(victim, gfp_mask, + noswap, get_swappiness(victim)); css_put(&victim->css); /* * At shrinking usage, we can't check we should stop here or @@ -886,7 +1182,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (shrink) return ret; total += ret; - if (mem_cgroup_check_under_limit(root_mem)) + if (check_soft) { + if (res_counter_check_under_soft_limit(&root_mem->res)) + return total; + } else if (mem_cgroup_check_under_limit(root_mem)) return 1 + total; } return total; @@ -965,7 +1264,7 @@ done: */ static int __mem_cgroup_try_charge(struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcg, - bool oom) + bool oom, struct page *page) { struct mem_cgroup *mem, *mem_over_limit; int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -996,9 +1295,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, VM_BUG_ON(css_is_removed(&mem->css)); while (1) { - int ret; - bool noswap = false; + int ret = 0; + unsigned long flags = 0; + if (mem_cgroup_is_root(mem)) + goto done; ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res); if (likely(!ret)) { if (!do_swap_account) @@ -1009,7 +1310,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, break; /* mem+swap counter fails */ res_counter_uncharge(&mem->res, PAGE_SIZE); - noswap = true; + flags |= MEM_CGROUP_RECLAIM_NOSWAP; mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); } else @@ -1020,8 +1321,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) goto nomem; - ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, - noswap, false); + ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, + gfp_mask, flags); if (ret) continue; @@ -1046,13 +1347,19 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, goto nomem; } } + /* + * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. + * if they exceeds softlimit. + */ + if (mem_cgroup_soft_limit_check(mem)) + mem_cgroup_update_tree(mem, page); +done: return 0; nomem: css_put(&mem->css); return -ENOMEM; } - /* * A helper function to get mem_cgroup from ID. must be called under * rcu_read_lock(). The caller must check css_is_removed() or some if @@ -1119,15 +1426,37 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } css_put(&mem->css); return; } + pc->mem_cgroup = mem; + /* + * We access a page_cgroup asynchronously without lock_page_cgroup(). + * Especially when a page_cgroup is taken from a page, pc->mem_cgroup + * is accessed after testing USED bit. To make pc->mem_cgroup visible + * before USED bit, we need memory barrier here. + * See mem_cgroup_add_lru_list(), etc. + */ smp_wmb(); - pc->flags = pcg_default_flags[ctype]; + switch (ctype) { + case MEM_CGROUP_CHARGE_TYPE_CACHE: + case MEM_CGROUP_CHARGE_TYPE_SHMEM: + SetPageCgroupCache(pc); + SetPageCgroupUsed(pc); + break; + case MEM_CGROUP_CHARGE_TYPE_MAPPED: + ClearPageCgroupCache(pc); + SetPageCgroupUsed(pc); + break; + default: + break; + } mem_cgroup_charge_statistics(mem, pc, true); @@ -1178,7 +1507,8 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, if (pc->mem_cgroup != from) goto out; - res_counter_uncharge(&from->res, PAGE_SIZE); + if (!mem_cgroup_is_root(from)) + res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); page = pc->page; @@ -1197,7 +1527,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, 1); } - if (do_swap_account) + if (do_swap_account && !mem_cgroup_is_root(from)) res_counter_uncharge(&from->memsw, PAGE_SIZE); css_put(&from->css); @@ -1238,7 +1568,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page); if (ret || !parent) return ret; @@ -1268,9 +1598,11 @@ uncharge: /* drop extra refcnt by try_charge() */ css_put(&parent->css); /* uncharge if move fails */ - res_counter_uncharge(&parent->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&parent->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(parent)) { + res_counter_uncharge(&parent->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&parent->memsw, PAGE_SIZE); + } return ret; } @@ -1295,7 +1627,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, prefetchw(pc); mem = memcg; - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page); if (ret || !mem) return ret; @@ -1414,14 +1746,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page); /* drop extra refcnt from tryget */ css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, page); } static void @@ -1459,7 +1791,9 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, * This recorded memcg can be obsolete one. So, avoid * calling css_tryget */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } rcu_read_unlock(); @@ -1484,9 +1818,11 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } css_put(&mem->css); } @@ -1538,9 +1874,14 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - res_counter_uncharge(&mem->res, PAGE_SIZE); - if (do_swap_account && (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(mem)) { + res_counter_uncharge(&mem->res, PAGE_SIZE); + if (do_swap_account && + (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)) + res_counter_uncharge(&mem->memsw, PAGE_SIZE); + } + if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) + mem_cgroup_swap_statistics(mem, true); mem_cgroup_charge_statistics(mem, pc, false); ClearPageCgroupUsed(pc); @@ -1554,6 +1895,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mz = page_cgroup_zoneinfo(pc); unlock_page_cgroup(pc); + if (mem_cgroup_soft_limit_check(mem)) + mem_cgroup_update_tree(mem, page); /* at swapout, this memcg will be accessed to record to swap */ if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT) css_put(&mem->css); @@ -1629,7 +1972,9 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) * We uncharge this because swap is freed. * This memcg can be obsolete one. We avoid calling css_tryget */ - res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + if (!mem_cgroup_is_root(memcg)) + res_counter_uncharge(&memcg->memsw, PAGE_SIZE); + mem_cgroup_swap_statistics(memcg, false); mem_cgroup_put(memcg); } rcu_read_unlock(); @@ -1658,7 +2003,8 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr) unlock_page_cgroup(pc); if (mem) { - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + page); css_put(&mem->css); } *ptr = mem; @@ -1798,8 +2144,9 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - progress = mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, - false, true); + progress = mem_cgroup_hierarchical_reclaim(memcg, NULL, + GFP_KERNEL, + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->res, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -1851,7 +2198,9 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, if (!ret) break; - mem_cgroup_hierarchical_reclaim(memcg, GFP_KERNEL, true, true); + mem_cgroup_hierarchical_reclaim(memcg, NULL, GFP_KERNEL, + MEM_CGROUP_RECLAIM_NOSWAP | + MEM_CGROUP_RECLAIM_SHRINK); curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); /* Usage is reduced ? */ if (curusage >= oldusage) @@ -1862,6 +2211,97 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, return ret; } +unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, + gfp_t gfp_mask, int nid, + int zid) +{ + unsigned long nr_reclaimed = 0; + struct mem_cgroup_per_zone *mz, *next_mz = NULL; + unsigned long reclaimed; + int loop = 0; + struct mem_cgroup_tree_per_zone *mctz; + unsigned long long excess; + + if (order > 0) + return 0; + + mctz = soft_limit_tree_node_zone(nid, zid); + /* + * This loop can run a while, specially if mem_cgroup's continuously + * keep exceeding their soft limit and putting the system under + * pressure + */ + do { + if (next_mz) + mz = next_mz; + else + mz = mem_cgroup_largest_soft_limit_node(mctz); + if (!mz) + break; + + reclaimed = mem_cgroup_hierarchical_reclaim(mz->mem, zone, + gfp_mask, + MEM_CGROUP_RECLAIM_SOFT); + nr_reclaimed += reclaimed; + spin_lock(&mctz->lock); + + /* + * If we failed to reclaim anything from this memory cgroup + * it is time to move on to the next cgroup + */ + next_mz = NULL; + if (!reclaimed) { + do { + /* + * Loop until we find yet another one. + * + * By the time we get the soft_limit lock + * again, someone might have aded the + * group back on the RB tree. Iterate to + * make sure we get a different mem. + * mem_cgroup_largest_soft_limit_node returns + * NULL if no other cgroup is present on + * the tree + */ + next_mz = + __mem_cgroup_largest_soft_limit_node(mctz); + if (next_mz == mz) { + css_put(&next_mz->mem->css); + next_mz = NULL; + } else /* next_mz == NULL or other memcg */ + break; + } while (1); + } + __mem_cgroup_remove_exceeded(mz->mem, mz, mctz); + excess = res_counter_soft_limit_excess(&mz->mem->res); + /* + * One school of thought says that we should not add + * back the node to the tree if reclaim returns 0. + * But our reclaim could return 0, simply because due + * to priority we are exposing a smaller subset of + * memory to reclaim from. Consider this as a longer + * term TODO. + */ + /* If excess == 0, no tree ops */ + __mem_cgroup_insert_exceeded(mz->mem, mz, mctz, excess); + spin_unlock(&mctz->lock); + css_put(&mz->mem->css); + loop++; + /* + * Could not reclaim anything and there are no more + * mem cgroups to try or we seem to be looping without + * reclaiming anything. + */ + if (!nr_reclaimed && + (next_mz == NULL || + loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) + break; + } while (!nr_reclaimed); + if (next_mz) + css_put(&next_mz->mem->css); + return nr_reclaimed; +} + /* * This routine traverse page_cgroup in given list and drop them all. * *And* this routine doesn't reclaim page itself, just removes page_cgroup. @@ -2046,20 +2486,64 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, return retval; } +struct mem_cgroup_idx_data { + s64 val; + enum mem_cgroup_stat_index idx; +}; + +static int +mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data) +{ + struct mem_cgroup_idx_data *d = data; + d->val += mem_cgroup_read_stat(&mem->stat, d->idx); + return 0; +} + +static void +mem_cgroup_get_recursive_idx_stat(struct mem_cgroup *mem, + enum mem_cgroup_stat_index idx, s64 *val) +{ + struct mem_cgroup_idx_data d; + d.idx = idx; + d.val = 0; + mem_cgroup_walk_tree(mem, &d, mem_cgroup_get_idx_stat); + *val = d.val; +} + static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft) { struct mem_cgroup *mem = mem_cgroup_from_cont(cont); - u64 val = 0; + u64 idx_val, val; int type, name; type = MEMFILE_TYPE(cft->private); name = MEMFILE_ATTR(cft->private); switch (type) { case _MEM: - val = res_counter_read_u64(&mem->res, name); + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_CACHE, &idx_val); + val = idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_RSS, &idx_val); + val += idx_val; + val <<= PAGE_SHIFT; + } else + val = res_counter_read_u64(&mem->res, name); break; case _MEMSWAP: - val = res_counter_read_u64(&mem->memsw, name); + if (name == RES_USAGE && mem_cgroup_is_root(mem)) { + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_CACHE, &idx_val); + val = idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_RSS, &idx_val); + val += idx_val; + mem_cgroup_get_recursive_idx_stat(mem, + MEM_CGROUP_STAT_SWAPOUT, &idx_val); + val <<= PAGE_SHIFT; + } else + val = res_counter_read_u64(&mem->memsw, name); break; default: BUG(); @@ -2083,6 +2567,10 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, name = MEMFILE_ATTR(cft->private); switch (name) { case RES_LIMIT: + if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ + ret = -EINVAL; + break; + } /* This function does all necessary parse...reuse it */ ret = res_counter_memparse_write_strategy(buffer, &val); if (ret) @@ -2092,6 +2580,20 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft, else ret = mem_cgroup_resize_memsw_limit(memcg, val); break; + case RES_SOFT_LIMIT: + ret = res_counter_memparse_write_strategy(buffer, &val); + if (ret) + break; + /* + * For memsw, soft limits are hard to implement in terms + * of semantics, for now, we support soft limits for + * control without swap + */ + if (type == _MEM) + ret = res_counter_set_soft_limit(&memcg->res, val); + else + ret = -EINVAL; + break; default: ret = -EINVAL; /* should be BUG() ? */ break; @@ -2149,6 +2651,7 @@ static int mem_cgroup_reset(struct cgroup *cont, unsigned int event) res_counter_reset_failcnt(&mem->memsw); break; } + return 0; } @@ -2160,6 +2663,7 @@ enum { MCS_MAPPED_FILE, MCS_PGPGIN, MCS_PGPGOUT, + MCS_SWAP, MCS_INACTIVE_ANON, MCS_ACTIVE_ANON, MCS_INACTIVE_FILE, @@ -2181,6 +2685,7 @@ struct { {"mapped_file", "total_mapped_file"}, {"pgpgin", "total_pgpgin"}, {"pgpgout", "total_pgpgout"}, + {"swap", "total_swap"}, {"inactive_anon", "total_inactive_anon"}, {"active_anon", "total_active_anon"}, {"inactive_file", "total_inactive_file"}, @@ -2205,6 +2710,10 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data) s->stat[MCS_PGPGIN] += val; val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT); s->stat[MCS_PGPGOUT] += val; + if (do_swap_account) { + val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT); + s->stat[MCS_SWAP] += val * PAGE_SIZE; + } /* per zone stat */ val = mem_cgroup_get_local_zonestat(mem, LRU_INACTIVE_ANON); @@ -2236,8 +2745,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_local_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) + for (i = 0; i < NR_MCS_STAT; i++) { + if (i == MCS_SWAP && !do_swap_account) + continue; cb->fill(cb, memcg_stat_strings[i].local_name, mystat.stat[i]); + } /* Hierarchical information */ { @@ -2250,9 +2762,11 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, memset(&mystat, 0, sizeof(mystat)); mem_cgroup_get_total_stat(mem_cont, &mystat); - for (i = 0; i < NR_MCS_STAT; i++) + for (i = 0; i < NR_MCS_STAT; i++) { + if (i == MCS_SWAP && !do_swap_account) + continue; cb->fill(cb, memcg_stat_strings[i].total_name, mystat.stat[i]); - + } #ifdef CONFIG_DEBUG_VM cb->fill(cb, "inactive_ratio", calc_inactive_ratio(mem_cont, NULL)); @@ -2345,6 +2859,12 @@ static struct cftype mem_cgroup_files[] = { .read_u64 = mem_cgroup_read, }, { + .name = "soft_limit_in_bytes", + .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), + .write_string = mem_cgroup_write, + .read_u64 = mem_cgroup_read, + }, + { .name = "failcnt", .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), .trigger = mem_cgroup_reset, @@ -2438,6 +2958,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) mz = &pn->zoneinfo[zone]; for_each_lru(l) INIT_LIST_HEAD(&mz->lists[l]); + mz->usage_in_excess = 0; + mz->on_tree = false; + mz->mem = mem; } return 0; } @@ -2483,6 +3006,7 @@ static void __mem_cgroup_free(struct mem_cgroup *mem) { int node; + mem_cgroup_remove_from_trees(mem); free_css_id(&mem_cgroup_subsys, &mem->css); for_each_node_state(node, N_POSSIBLE) @@ -2531,6 +3055,31 @@ static void __init enable_swap_cgroup(void) } #endif +static int mem_cgroup_soft_limit_tree_init(void) +{ + struct mem_cgroup_tree_per_node *rtpn; + struct mem_cgroup_tree_per_zone *rtpz; + int tmp, node, zone; + + for_each_node_state(node, N_POSSIBLE) { + tmp = node; + if (!node_state(node, N_NORMAL_MEMORY)) + tmp = -1; + rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp); + if (!rtpn) + return 1; + + soft_limit_tree.rb_tree_per_node[node] = rtpn; + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + rtpz = &rtpn->rb_tree_per_zone[zone]; + rtpz->rb_root = RB_ROOT; + spin_lock_init(&rtpz->lock); + } + } + return 0; +} + static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { @@ -2545,10 +3094,15 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) for_each_node_state(node, N_POSSIBLE) if (alloc_mem_cgroup_per_zone_info(mem, node)) goto free_out; + /* root ? */ if (cont->parent == NULL) { enable_swap_cgroup(); parent = NULL; + root_mem_cgroup = mem; + if (mem_cgroup_soft_limit_tree_init()) + goto free_out; + } else { parent = mem_cgroup_from_cont(cont->parent); mem->use_hierarchy = parent->use_hierarchy; @@ -2577,6 +3131,7 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) return &mem->css; free_out: __mem_cgroup_free(mem); + root_mem_cgroup = NULL; return ERR_PTR(error); } @@ -2612,7 +3167,8 @@ static int mem_cgroup_populate(struct cgroup_subsys *ss, static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct cgroup *cont, struct cgroup *old_cont, - struct task_struct *p) + struct task_struct *p, + bool threadgroup) { mutex_lock(&memcg_tasklist); /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c new file mode 100644 index 0000000..729d4b1 --- /dev/null +++ b/mm/memory-failure.c @@ -0,0 +1,832 @@ +/* + * Copyright (C) 2008, 2009 Intel Corporation + * Authors: Andi Kleen, Fengguang Wu + * + * This software may be redistributed and/or modified under the terms of + * the GNU General Public License ("GPL") version 2 only as published by the + * Free Software Foundation. + * + * High level machine check handler. Handles pages reported by the + * hardware as being corrupted usually due to a 2bit ECC memory or cache + * failure. + * + * Handles page cache pages in various states. The tricky part + * here is that we can access any page asynchronous to other VM + * users, because memory failures could happen anytime and anywhere, + * possibly violating some of their assumptions. This is why this code + * has to be extremely careful. Generally it tries to use normal locking + * rules, as in get the standard locks, even if that means the + * error handling takes potentially a long time. + * + * The operation to map back from RMAP chains to processes has to walk + * the complete process list and has non linear complexity with the number + * mappings. In short it can be quite slow. But since memory corruptions + * are rare we hope to get away with this. + */ + +/* + * Notebook: + * - hugetlb needs more code + * - kcore/oldmem/vmcore/mem/kmem check for hwpoison pages + * - pass bad pages to kdump next kernel + */ +#define DEBUG 1 /* remove me in 2.6.34 */ +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/page-flags.h> +#include <linux/sched.h> +#include <linux/rmap.h> +#include <linux/pagemap.h> +#include <linux/swap.h> +#include <linux/backing-dev.h> +#include "internal.h" + +int sysctl_memory_failure_early_kill __read_mostly = 0; + +int sysctl_memory_failure_recovery __read_mostly = 1; + +atomic_long_t mce_bad_pages __read_mostly = ATOMIC_LONG_INIT(0); + +/* + * Send all the processes who have the page mapped an ``action optional'' + * signal. + */ +static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, + unsigned long pfn) +{ + struct siginfo si; + int ret; + + printk(KERN_ERR + "MCE %#lx: Killing %s:%d early due to hardware memory corruption\n", + pfn, t->comm, t->pid); + si.si_signo = SIGBUS; + si.si_errno = 0; + si.si_code = BUS_MCEERR_AO; + si.si_addr = (void *)addr; +#ifdef __ARCH_SI_TRAPNO + si.si_trapno = trapno; +#endif + si.si_addr_lsb = PAGE_SHIFT; + /* + * Don't use force here, it's convenient if the signal + * can be temporarily blocked. + * This could cause a loop when the user sets SIGBUS + * to SIG_IGN, but hopefully noone will do that? + */ + ret = send_sig_info(SIGBUS, &si, t); /* synchronous? */ + if (ret < 0) + printk(KERN_INFO "MCE: Error sending signal to %s:%d: %d\n", + t->comm, t->pid, ret); + return ret; +} + +/* + * Kill all processes that have a poisoned page mapped and then isolate + * the page. + * + * General strategy: + * Find all processes having the page mapped and kill them. + * But we keep a page reference around so that the page is not + * actually freed yet. + * Then stash the page away + * + * There's no convenient way to get back to mapped processes + * from the VMAs. So do a brute-force search over all + * running processes. + * + * Remember that machine checks are not common (or rather + * if they are common you have other problems), so this shouldn't + * be a performance issue. + * + * Also there are some races possible while we get from the + * error detection to actually handle it. + */ + +struct to_kill { + struct list_head nd; + struct task_struct *tsk; + unsigned long addr; + unsigned addr_valid:1; +}; + +/* + * Failure handling: if we can't find or can't kill a process there's + * not much we can do. We just print a message and ignore otherwise. + */ + +/* + * Schedule a process for later kill. + * Uses GFP_ATOMIC allocations to avoid potential recursions in the VM. + * TBD would GFP_NOIO be enough? + */ +static void add_to_kill(struct task_struct *tsk, struct page *p, + struct vm_area_struct *vma, + struct list_head *to_kill, + struct to_kill **tkc) +{ + struct to_kill *tk; + + if (*tkc) { + tk = *tkc; + *tkc = NULL; + } else { + tk = kmalloc(sizeof(struct to_kill), GFP_ATOMIC); + if (!tk) { + printk(KERN_ERR + "MCE: Out of memory while machine check handling\n"); + return; + } + } + tk->addr = page_address_in_vma(p, vma); + tk->addr_valid = 1; + + /* + * In theory we don't have to kill when the page was + * munmaped. But it could be also a mremap. Since that's + * likely very rare kill anyways just out of paranoia, but use + * a SIGKILL because the error is not contained anymore. + */ + if (tk->addr == -EFAULT) { + pr_debug("MCE: Unable to find user space address %lx in %s\n", + page_to_pfn(p), tsk->comm); + tk->addr_valid = 0; + } + get_task_struct(tsk); + tk->tsk = tsk; + list_add_tail(&tk->nd, to_kill); +} + +/* + * Kill the processes that have been collected earlier. + * + * Only do anything when DOIT is set, otherwise just free the list + * (this is used for clean pages which do not need killing) + * Also when FAIL is set do a force kill because something went + * wrong earlier. + */ +static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno, + int fail, unsigned long pfn) +{ + struct to_kill *tk, *next; + + list_for_each_entry_safe (tk, next, to_kill, nd) { + if (doit) { + /* + * In case something went wrong with munmaping + * make sure the process doesn't catch the + * signal and then access the memory. Just kill it. + * the signal handlers + */ + if (fail || tk->addr_valid == 0) { + printk(KERN_ERR + "MCE %#lx: forcibly killing %s:%d because of failure to unmap corrupted page\n", + pfn, tk->tsk->comm, tk->tsk->pid); + force_sig(SIGKILL, tk->tsk); + } + + /* + * In theory the process could have mapped + * something else on the address in-between. We could + * check for that, but we need to tell the + * process anyways. + */ + else if (kill_proc_ao(tk->tsk, tk->addr, trapno, + pfn) < 0) + printk(KERN_ERR + "MCE %#lx: Cannot send advisory machine check signal to %s:%d\n", + pfn, tk->tsk->comm, tk->tsk->pid); + } + put_task_struct(tk->tsk); + kfree(tk); + } +} + +static int task_early_kill(struct task_struct *tsk) +{ + if (!tsk->mm) + return 0; + if (tsk->flags & PF_MCE_PROCESS) + return !!(tsk->flags & PF_MCE_EARLY); + return sysctl_memory_failure_early_kill; +} + +/* + * Collect processes when the error hit an anonymous page. + */ +static void collect_procs_anon(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct anon_vma *av; + + read_lock(&tasklist_lock); + av = page_lock_anon_vma(page); + if (av == NULL) /* Not actually mapped anymore */ + goto out; + for_each_process (tsk) { + if (!task_early_kill(tsk)) + continue; + list_for_each_entry (vma, &av->head, anon_vma_node) { + if (!page_mapped_in_vma(page, vma)) + continue; + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + page_unlock_anon_vma(av); +out: + read_unlock(&tasklist_lock); +} + +/* + * Collect processes when the error hit a file mapped page. + */ +static void collect_procs_file(struct page *page, struct list_head *to_kill, + struct to_kill **tkc) +{ + struct vm_area_struct *vma; + struct task_struct *tsk; + struct prio_tree_iter iter; + struct address_space *mapping = page->mapping; + + /* + * A note on the locking order between the two locks. + * We don't rely on this particular order. + * If you have some other code that needs a different order + * feel free to switch them around. Or add a reverse link + * from mm_struct to task_struct, then this could be all + * done without taking tasklist_lock and looping over all tasks. + */ + + read_lock(&tasklist_lock); + spin_lock(&mapping->i_mmap_lock); + for_each_process(tsk) { + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + if (!task_early_kill(tsk)) + continue; + + vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, + pgoff) { + /* + * Send early kill signal to tasks where a vma covers + * the page but the corrupted page is not necessarily + * mapped it in its pte. + * Assume applications who requested early kill want + * to be informed of all such data corruptions. + */ + if (vma->vm_mm == tsk->mm) + add_to_kill(tsk, page, vma, to_kill, tkc); + } + } + spin_unlock(&mapping->i_mmap_lock); + read_unlock(&tasklist_lock); +} + +/* + * Collect the processes who have the corrupted page mapped to kill. + * This is done in two steps for locking reasons. + * First preallocate one tokill structure outside the spin locks, + * so that we can kill at least one process reasonably reliable. + */ +static void collect_procs(struct page *page, struct list_head *tokill) +{ + struct to_kill *tk; + + if (!page->mapping) + return; + + tk = kmalloc(sizeof(struct to_kill), GFP_NOIO); + if (!tk) + return; + if (PageAnon(page)) + collect_procs_anon(page, tokill, &tk); + else + collect_procs_file(page, tokill, &tk); + kfree(tk); +} + +/* + * Error handlers for various types of pages. + */ + +enum outcome { + FAILED, /* Error handling failed */ + DELAYED, /* Will be handled later */ + IGNORED, /* Error safely ignored */ + RECOVERED, /* Successfully recovered */ +}; + +static const char *action_name[] = { + [FAILED] = "Failed", + [DELAYED] = "Delayed", + [IGNORED] = "Ignored", + [RECOVERED] = "Recovered", +}; + +/* + * Error hit kernel page. + * Do nothing, try to be lucky and not touch this instead. For a few cases we + * could be more sophisticated. + */ +static int me_kernel(struct page *p, unsigned long pfn) +{ + return DELAYED; +} + +/* + * Already poisoned page. + */ +static int me_ignore(struct page *p, unsigned long pfn) +{ + return IGNORED; +} + +/* + * Page in unknown state. Do nothing. + */ +static int me_unknown(struct page *p, unsigned long pfn) +{ + printk(KERN_ERR "MCE %#lx: Unknown page state\n", pfn); + return FAILED; +} + +/* + * Free memory + */ +static int me_free(struct page *p, unsigned long pfn) +{ + return DELAYED; +} + +/* + * Clean (or cleaned) page cache page. + */ +static int me_pagecache_clean(struct page *p, unsigned long pfn) +{ + int err; + int ret = FAILED; + struct address_space *mapping; + + if (!isolate_lru_page(p)) + page_cache_release(p); + + /* + * For anonymous pages we're done the only reference left + * should be the one m_f() holds. + */ + if (PageAnon(p)) + return RECOVERED; + + /* + * Now truncate the page in the page cache. This is really + * more like a "temporary hole punch" + * Don't do this for block devices when someone else + * has a reference, because it could be file system metadata + * and that's not safe to truncate. + */ + mapping = page_mapping(p); + if (!mapping) { + /* + * Page has been teared down in the meanwhile + */ + return FAILED; + } + + /* + * Truncation is a bit tricky. Enable it per file system for now. + * + * Open: to take i_mutex or not for this? Right now we don't. + */ + if (mapping->a_ops->error_remove_page) { + err = mapping->a_ops->error_remove_page(mapping, p); + if (err != 0) { + printk(KERN_INFO "MCE %#lx: Failed to punch page: %d\n", + pfn, err); + } else if (page_has_private(p) && + !try_to_release_page(p, GFP_NOIO)) { + pr_debug("MCE %#lx: failed to release buffers\n", pfn); + } else { + ret = RECOVERED; + } + } else { + /* + * If the file system doesn't support it just invalidate + * This fails on dirty or anything with private pages + */ + if (invalidate_inode_page(p)) + ret = RECOVERED; + else + printk(KERN_INFO "MCE %#lx: Failed to invalidate\n", + pfn); + } + return ret; +} + +/* + * Dirty cache page page + * Issues: when the error hit a hole page the error is not properly + * propagated. + */ +static int me_pagecache_dirty(struct page *p, unsigned long pfn) +{ + struct address_space *mapping = page_mapping(p); + + SetPageError(p); + /* TBD: print more information about the file. */ + if (mapping) { + /* + * IO error will be reported by write(), fsync(), etc. + * who check the mapping. + * This way the application knows that something went + * wrong with its dirty file data. + * + * There's one open issue: + * + * The EIO will be only reported on the next IO + * operation and then cleared through the IO map. + * Normally Linux has two mechanisms to pass IO error + * first through the AS_EIO flag in the address space + * and then through the PageError flag in the page. + * Since we drop pages on memory failure handling the + * only mechanism open to use is through AS_AIO. + * + * This has the disadvantage that it gets cleared on + * the first operation that returns an error, while + * the PageError bit is more sticky and only cleared + * when the page is reread or dropped. If an + * application assumes it will always get error on + * fsync, but does other operations on the fd before + * and the page is dropped inbetween then the error + * will not be properly reported. + * + * This can already happen even without hwpoisoned + * pages: first on metadata IO errors (which only + * report through AS_EIO) or when the page is dropped + * at the wrong time. + * + * So right now we assume that the application DTRT on + * the first EIO, but we're not worse than other parts + * of the kernel. + */ + mapping_set_error(mapping, EIO); + } + + return me_pagecache_clean(p, pfn); +} + +/* + * Clean and dirty swap cache. + * + * Dirty swap cache page is tricky to handle. The page could live both in page + * cache and swap cache(ie. page is freshly swapped in). So it could be + * referenced concurrently by 2 types of PTEs: + * normal PTEs and swap PTEs. We try to handle them consistently by calling + * try_to_unmap(TTU_IGNORE_HWPOISON) to convert the normal PTEs to swap PTEs, + * and then + * - clear dirty bit to prevent IO + * - remove from LRU + * - but keep in the swap cache, so that when we return to it on + * a later page fault, we know the application is accessing + * corrupted data and shall be killed (we installed simple + * interception code in do_swap_page to catch it). + * + * Clean swap cache pages can be directly isolated. A later page fault will + * bring in the known good data from disk. + */ +static int me_swapcache_dirty(struct page *p, unsigned long pfn) +{ + int ret = FAILED; + + ClearPageDirty(p); + /* Trigger EIO in shmem: */ + ClearPageUptodate(p); + + if (!isolate_lru_page(p)) { + page_cache_release(p); + ret = DELAYED; + } + + return ret; +} + +static int me_swapcache_clean(struct page *p, unsigned long pfn) +{ + int ret = FAILED; + + if (!isolate_lru_page(p)) { + page_cache_release(p); + ret = RECOVERED; + } + delete_from_swap_cache(p); + return ret; +} + +/* + * Huge pages. Needs work. + * Issues: + * No rmap support so we cannot find the original mapper. In theory could walk + * all MMs and look for the mappings, but that would be non atomic and racy. + * Need rmap for hugepages for this. Alternatively we could employ a heuristic, + * like just walking the current process and hoping it has it mapped (that + * should be usually true for the common "shared database cache" case) + * Should handle free huge pages and dequeue them too, but this needs to + * handle huge page accounting correctly. + */ +static int me_huge_page(struct page *p, unsigned long pfn) +{ + return FAILED; +} + +/* + * Various page states we can handle. + * + * A page state is defined by its current page->flags bits. + * The table matches them in order and calls the right handler. + * + * This is quite tricky because we can access page at any time + * in its live cycle, so all accesses have to be extremly careful. + * + * This is not complete. More states could be added. + * For any missing state don't attempt recovery. + */ + +#define dirty (1UL << PG_dirty) +#define sc (1UL << PG_swapcache) +#define unevict (1UL << PG_unevictable) +#define mlock (1UL << PG_mlocked) +#define writeback (1UL << PG_writeback) +#define lru (1UL << PG_lru) +#define swapbacked (1UL << PG_swapbacked) +#define head (1UL << PG_head) +#define tail (1UL << PG_tail) +#define compound (1UL << PG_compound) +#define slab (1UL << PG_slab) +#define buddy (1UL << PG_buddy) +#define reserved (1UL << PG_reserved) + +static struct page_state { + unsigned long mask; + unsigned long res; + char *msg; + int (*action)(struct page *p, unsigned long pfn); +} error_states[] = { + { reserved, reserved, "reserved kernel", me_ignore }, + { buddy, buddy, "free kernel", me_free }, + + /* + * Could in theory check if slab page is free or if we can drop + * currently unused objects without touching them. But just + * treat it as standard kernel for now. + */ + { slab, slab, "kernel slab", me_kernel }, + +#ifdef CONFIG_PAGEFLAGS_EXTENDED + { head, head, "huge", me_huge_page }, + { tail, tail, "huge", me_huge_page }, +#else + { compound, compound, "huge", me_huge_page }, +#endif + + { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "swapcache", me_swapcache_clean }, + + { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, + { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT + { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, + { mlock, mlock, "mlocked LRU", me_pagecache_clean }, +#endif + + { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru, "clean LRU", me_pagecache_clean }, + { swapbacked, swapbacked, "anonymous", me_pagecache_clean }, + + /* + * Catchall entry: must be at end. + */ + { 0, 0, "unknown page state", me_unknown }, +}; + +#undef lru + +static void action_result(unsigned long pfn, char *msg, int result) +{ + struct page *page = NULL; + if (pfn_valid(pfn)) + page = pfn_to_page(pfn); + + printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", + pfn, + page && PageDirty(page) ? "dirty " : "", + msg, action_name[result]); +} + +static int page_action(struct page_state *ps, struct page *p, + unsigned long pfn, int ref) +{ + int result; + + result = ps->action(p, pfn); + action_result(pfn, ps->msg, result); + if (page_count(p) != 1 + ref) + printk(KERN_ERR + "MCE %#lx: %s page still referenced by %d users\n", + pfn, ps->msg, page_count(p) - 1); + + /* Could do more checks here if page looks ok */ + /* + * Could adjust zone counters here to correct for the missing page. + */ + + return result == RECOVERED ? 0 : -EBUSY; +} + +#define N_UNMAP_TRIES 5 + +/* + * Do all that is necessary to remove user space mappings. Unmap + * the pages and send SIGBUS to the processes if the data was dirty. + */ +static void hwpoison_user_mappings(struct page *p, unsigned long pfn, + int trapno) +{ + enum ttu_flags ttu = TTU_UNMAP | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS; + struct address_space *mapping; + LIST_HEAD(tokill); + int ret; + int i; + int kill = 1; + + if (PageReserved(p) || PageCompound(p) || PageSlab(p)) + return; + + if (!PageLRU(p)) + lru_add_drain_all(); + + /* + * This check implies we don't kill processes if their pages + * are in the swap cache early. Those are always late kills. + */ + if (!page_mapped(p)) + return; + + if (PageSwapCache(p)) { + printk(KERN_ERR + "MCE %#lx: keeping poisoned page in swap cache\n", pfn); + ttu |= TTU_IGNORE_HWPOISON; + } + + /* + * Propagate the dirty bit from PTEs to struct page first, because we + * need this to decide if we should kill or just drop the page. + */ + mapping = page_mapping(p); + if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) { + if (page_mkclean(p)) { + SetPageDirty(p); + } else { + kill = 0; + ttu |= TTU_IGNORE_HWPOISON; + printk(KERN_INFO + "MCE %#lx: corrupted page was clean: dropped without side effects\n", + pfn); + } + } + + /* + * First collect all the processes that have the page + * mapped in dirty form. This has to be done before try_to_unmap, + * because ttu takes the rmap data structures down. + * + * Error handling: We ignore errors here because + * there's nothing that can be done. + */ + if (kill) + collect_procs(p, &tokill); + + /* + * try_to_unmap can fail temporarily due to races. + * Try a few times (RED-PEN better strategy?) + */ + for (i = 0; i < N_UNMAP_TRIES; i++) { + ret = try_to_unmap(p, ttu); + if (ret == SWAP_SUCCESS) + break; + pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret); + } + + if (ret != SWAP_SUCCESS) + printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", + pfn, page_mapcount(p)); + + /* + * Now that the dirty bit has been propagated to the + * struct page and all unmaps done we can decide if + * killing is needed or not. Only kill when the page + * was dirty, otherwise the tokill list is merely + * freed. When there was a problem unmapping earlier + * use a more force-full uncatchable kill to prevent + * any accesses to the poisoned memory. + */ + kill_procs_ao(&tokill, !!PageDirty(p), trapno, + ret != SWAP_SUCCESS, pfn); +} + +int __memory_failure(unsigned long pfn, int trapno, int ref) +{ + struct page_state *ps; + struct page *p; + int res; + + if (!sysctl_memory_failure_recovery) + panic("Memory failure from trap %d on page %lx", trapno, pfn); + + if (!pfn_valid(pfn)) { + action_result(pfn, "memory outside kernel control", IGNORED); + return -EIO; + } + + p = pfn_to_page(pfn); + if (TestSetPageHWPoison(p)) { + action_result(pfn, "already hardware poisoned", IGNORED); + return 0; + } + + atomic_long_add(1, &mce_bad_pages); + + /* + * We need/can do nothing about count=0 pages. + * 1) it's a free page, and therefore in safe hand: + * prep_new_page() will be the gate keeper. + * 2) it's part of a non-compound high order page. + * Implies some kernel user: cannot stop them from + * R/W the page; let's pray that the page has been + * used and will be freed some time later. + * In fact it's dangerous to directly bump up page count from 0, + * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + */ + if (!get_page_unless_zero(compound_head(p))) { + action_result(pfn, "free or high order kernel", IGNORED); + return PageBuddy(compound_head(p)) ? 0 : -EBUSY; + } + + /* + * Lock the page and wait for writeback to finish. + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ + lock_page_nosync(p); + wait_on_page_writeback(p); + + /* + * Now take care of user space mappings. + */ + hwpoison_user_mappings(p, pfn, trapno); + + /* + * Torn down by someone else? + */ + if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { + action_result(pfn, "already truncated LRU", IGNORED); + res = 0; + goto out; + } + + res = -EBUSY; + for (ps = error_states;; ps++) { + if ((p->flags & ps->mask) == ps->res) { + res = page_action(ps, p, pfn, ref); + break; + } + } +out: + unlock_page(p); + return res; +} +EXPORT_SYMBOL_GPL(__memory_failure); + +/** + * memory_failure - Handle memory failure of a page. + * @pfn: Page Number of the corrupted page + * @trapno: Trap number reported in the signal to user space. + * + * This function is called by the low level machine check code + * of an architecture when it detects hardware memory corruption + * of a page. It tries its best to recover, which includes + * dropping pages, killing processes etc. + * + * The function is primarily of use for corruptions that + * happen outside the current execution context (e.g. when + * detected by a background scrubber) + * + * Must run in process context (e.g. a work queue) with interrupts + * enabled and no spinlocks hold. + */ +void memory_failure(unsigned long pfn, int trapno) +{ + __memory_failure(pfn, trapno, 0); +} diff --git a/mm/memory.c b/mm/memory.c index b1443ac07..7e91b5f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -297,7 +297,8 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr = vma->vm_start; /* - * Hide vma from rmap and vmtruncate before freeing pgtables + * Hide vma from rmap and truncate_pagecache before freeing + * pgtables */ anon_vma_unlink(vma); unlink_file_vma(vma); @@ -1325,7 +1326,8 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) return i ? i : -ENOMEM; - else if (ret & VM_FAULT_SIGBUS) + if (ret & + (VM_FAULT_HWPOISON|VM_FAULT_SIGBUS)) return i ? i : -EFAULT; BUG(); } @@ -2407,7 +2409,7 @@ restart: * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE - * boundary. Note that this is different from vmtruncate(), which + * boundary. Note that this is different from truncate_pagecache(), which * must keep the partial page. In contrast, we must get rid of * partial pages. * @holelen: size of prospective hole in bytes. This will be rounded @@ -2458,63 +2460,6 @@ void unmap_mapping_range(struct address_space *mapping, } EXPORT_SYMBOL(unmap_mapping_range); -/** - * vmtruncate - unmap mappings "freed" by truncate() syscall - * @inode: inode of the file used - * @offset: file offset to start truncating - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode * inode, loff_t offset) -{ - if (inode->i_size < offset) { - unsigned long limit; - - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out_big; - i_size_write(inode, offset); - } else { - struct address_space *mapping = inode->i_mapping; - - /* - * truncation of in-use swapfiles is disallowed - it would - * cause subsequent swapout to scribble on the now-freed - * blocks. - */ - if (IS_SWAPFILE(inode)) - return -ETXTBSY; - i_size_write(inode, offset); - - /* - * unmap_mapping_range is called twice, first simply for - * efficiency so that truncate_inode_pages does fewer - * single-page unmaps. However after this first call, and - * before truncate_inode_pages finishes, it is possible for - * private pages to be COWed, which remain after - * truncate_inode_pages finishes, hence the second - * unmap_mapping_range call must be made for correctness. - */ - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, offset); - unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); - } - - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; - -out_sig: - send_sig(SIGXFSZ, current, 0); -out_big: - return -EFBIG; -} -EXPORT_SYMBOL(vmtruncate); - int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) { struct address_space *mapping = inode->i_mapping; @@ -2559,8 +2504,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; entry = pte_to_swp_entry(orig_pte); - if (is_migration_entry(entry)) { - migration_entry_wait(mm, pmd, address); + if (unlikely(non_swap_entry(entry))) { + if (is_migration_entry(entry)) { + migration_entry_wait(mm, pmd, address); + } else if (is_hwpoison_entry(entry)) { + ret = VM_FAULT_HWPOISON; + } else { + print_bad_pte(vma, address, orig_pte, NULL); + ret = VM_FAULT_OOM; + } goto out; } delayacct_set_flag(DELAYACCT_PF_SWAPIN); @@ -2584,6 +2536,10 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, /* Had to read the page from swap area: Major fault */ ret = VM_FAULT_MAJOR; count_vm_event(PGMAJFAULT); + } else if (PageHWPoison(page)) { + ret = VM_FAULT_HWPOISON; + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + goto out; } lock_page(page); @@ -2760,6 +2716,12 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE))) return ret; + if (unlikely(PageHWPoison(vmf.page))) { + if (ret & VM_FAULT_LOCKED) + unlock_page(vmf.page); + return VM_FAULT_HWPOISON; + } + /* * For consistency in subsequent calls, make the faulted page always * locked. diff --git a/mm/migrate.c b/mm/migrate.c index 16052e8..1a4bf48 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -675,7 +675,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, } /* Establish migration ptes or remove ptes */ - try_to_unmap(page, 1); + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); skip_unmap: if (!page_mapped(page)) @@ -2282,7 +2282,7 @@ static void special_mapping_close(struct vm_area_struct *vma) { } -static struct vm_operations_struct special_mapping_vmops = { +static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, }; diff --git a/mm/mremap.c b/mm/mremap.c index 20a07db..97bff25 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -86,8 +86,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, if (vma->vm_file) { /* * Subtle point from Rajesh Venkatasubramanian: before - * moving file-based ptes, we must lock vmtruncate out, - * since it might clean the dst vma before the src vma, + * moving file-based ptes, we must lock truncate_pagecache + * out, since it might clean the dst vma before the src vma, * and we propagate stale pages into the dst afterward. */ mapping = vma->vm_file->f_mapping; @@ -79,50 +79,10 @@ static struct kmem_cache *vm_region_jar; struct rb_root nommu_region_tree = RB_ROOT; DECLARE_RWSEM(nommu_region_sem); -struct vm_operations_struct generic_file_vm_ops = { +const struct vm_operations_struct generic_file_vm_ops = { }; /* - * Handle all mappings that got truncated by a "truncate()" - * system call. - * - * NOTE! We have to be ready to update the memory sharing - * between the file and the memory map for a potential last - * incomplete page. Ugly, but necessary. - */ -int vmtruncate(struct inode *inode, loff_t offset) -{ - struct address_space *mapping = inode->i_mapping; - unsigned long limit; - - if (inode->i_size < offset) - goto do_expand; - i_size_write(inode, offset); - - truncate_inode_pages(mapping, offset); - goto out_truncate; - -do_expand: - limit = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; - if (limit != RLIM_INFINITY && offset > limit) - goto out_sig; - if (offset > inode->i_sb->s_maxbytes) - goto out; - i_size_write(inode, offset); - -out_truncate: - if (inode->i_op->truncate) - inode->i_op->truncate(inode); - return 0; -out_sig: - send_sig(SIGXFSZ, current, 0); -out: - return -EFBIG; -} - -EXPORT_SYMBOL(vmtruncate); - -/* * Return the total memory allocated for this pointer, not * just what the caller asked for. * @@ -866,7 +826,7 @@ static int validate_mmap_request(struct file *file, int ret; /* do the simple checks first */ - if (flags & MAP_FIXED || addr) { + if (flags & MAP_FIXED) { printk(KERN_DEBUG "%d: Can't do fixed-address/overlay mmap of RAM\n", current->pid); @@ -1074,7 +1034,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1091,7 +1051,8 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) */ static int do_mmap_private(struct vm_area_struct *vma, struct vm_region *region, - unsigned long len) + unsigned long len, + unsigned long capabilities) { struct page *pages; unsigned long total, point, n, rlen; @@ -1102,13 +1063,13 @@ static int do_mmap_private(struct vm_area_struct *vma, * shared mappings on devices or memory * - VM_MAYSHARE will be set if it may attempt to share */ - if (vma->vm_file) { + if (capabilities & BDI_CAP_MAP_DIRECT) { ret = vma->vm_file->f_op->mmap(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); vma->vm_region->vm_top = vma->vm_region->vm_end; - return ret; + return 0; } if (ret != -ENOSYS) return ret; @@ -1221,9 +1182,6 @@ unsigned long do_mmap_pgoff(struct file *file, kenter(",%lx,%lx,%lx,%lx,%lx", addr, len, prot, flags, pgoff); - if (!(flags & MAP_FIXED)) - addr = round_hint_to_min(addr); - /* decide whether we should attempt the mapping, and if so what sort of * mapping */ ret = validate_mmap_request(file, addr, len, prot, flags, pgoff, @@ -1233,6 +1191,9 @@ unsigned long do_mmap_pgoff(struct file *file, return ret; } + /* we ignore the address hint */ + addr = 0; + /* we've determined that we can make the mapping, now translate what we * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); @@ -1346,7 +1307,7 @@ unsigned long do_mmap_pgoff(struct file *file, * - this is the hook for quasi-memory character devices to * tell us the location of a shared mapping */ - if (file && file->f_op->get_unmapped_area) { + if (capabilities & BDI_CAP_MAP_DIRECT) { addr = file->f_op->get_unmapped_area(file, addr, len, pgoff, flags); if (IS_ERR((void *) addr)) { @@ -1370,15 +1331,17 @@ unsigned long do_mmap_pgoff(struct file *file, } vma->vm_region = region; - add_nommu_region(region); - /* set up the mapping */ + /* set up the mapping + * - the region is filled in if BDI_CAP_MAP_DIRECT is still set + */ if (file && vma->vm_flags & VM_SHARED) ret = do_mmap_shared_file(vma); else - ret = do_mmap_private(vma, region, len); + ret = do_mmap_private(vma, region, len, capabilities); if (ret < 0) - goto error_put_region; + goto error_just_free; + add_nommu_region(region); /* okay... we have a mapping; now we have to register it */ result = vma->vm_start; @@ -1396,19 +1359,6 @@ share: kleave(" = %lx", result); return result; -error_put_region: - __put_nommu_region(region); - if (vma) { - if (vma->vm_file) { - fput(vma->vm_file); - if (vma->vm_flags & VM_EXECUTABLE) - removed_exe_file_vma(vma->vm_mm); - } - kmem_cache_free(vm_area_cachep, vma); - } - kleave(" = %d [pr]", ret); - return ret; - error_just_free: up_write(&nommu_region_sem); error: diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 5f378dd5..2c5d792 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -44,18 +44,21 @@ static long ratelimit_pages = 32; /* * When balance_dirty_pages decides that the caller needs to perform some * non-background writeback, this is how many pages it will attempt to write. - * It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably + * It should be somewhat larger than dirtied pages to ensure that reasonably * large amounts of I/O are submitted. */ -static inline long sync_writeback_pages(void) +static inline long sync_writeback_pages(unsigned long dirtied) { - return ratelimit_pages + ratelimit_pages / 2; + if (dirtied < ratelimit_pages) + dirtied = ratelimit_pages; + + return dirtied + dirtied / 2; } /* The following parameters are exported via /proc/sys/vm */ /* - * Start background writeback (via pdflush) at this percentage + * Start background writeback (via writeback threads) at this percentage */ int dirty_background_ratio = 10; @@ -155,37 +158,37 @@ static void update_completion_period(void) } int dirty_background_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_bytes = 0; return ret; } int dirty_background_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write) dirty_background_ratio = 0; return ret; } int dirty_ratio_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { int old_ratio = vm_dirty_ratio; int ret; - ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_ratio != old_ratio) { update_completion_period(); vm_dirty_bytes = 0; @@ -195,13 +198,13 @@ int dirty_ratio_handler(struct ctl_table *table, int write, int dirty_bytes_handler(struct ctl_table *table, int write, - struct file *filp, void __user *buffer, size_t *lenp, + void __user *buffer, size_t *lenp, loff_t *ppos) { unsigned long old_bytes = vm_dirty_bytes; int ret; - ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, lenp, ppos); if (ret == 0 && write && vm_dirty_bytes != old_bytes) { update_completion_period(); vm_dirty_ratio = 0; @@ -474,10 +477,11 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty, * balance_dirty_pages() must be called by processes which are generating dirty * data. It looks at the number of dirty pages in the machine and will force * the caller to perform writeback if the system is over `vm_dirty_ratio'. - * If we're over `background_thresh' then pdflush is woken to perform some - * writeout. + * If we're over `background_thresh' then the writeback threads are woken to + * perform some writeout. */ -static void balance_dirty_pages(struct address_space *mapping) +static void balance_dirty_pages(struct address_space *mapping, + unsigned long write_chunk) { long nr_reclaimable, bdi_nr_reclaimable; long nr_writeback, bdi_nr_writeback; @@ -485,7 +489,6 @@ static void balance_dirty_pages(struct address_space *mapping) unsigned long dirty_thresh; unsigned long bdi_thresh; unsigned long pages_written = 0; - unsigned long write_chunk = sync_writeback_pages(); unsigned long pause = 1; struct backing_dev_info *bdi = mapping->backing_dev_info; @@ -563,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping) if (pages_written >= write_chunk) break; /* We've done our duty */ - schedule_timeout_interruptible(pause); + __set_current_state(TASK_INTERRUPTIBLE); + io_schedule_timeout(pause); /* * Increase the delay for each loop, up to our previous @@ -579,7 +583,7 @@ static void balance_dirty_pages(struct address_space *mapping) bdi->dirty_exceeded = 0; if (writeback_in_progress(bdi)) - return; /* pdflush is already working this queue */ + return; /* * In laptop mode, we wait until hitting the higher threshold before @@ -590,10 +594,10 @@ static void balance_dirty_pages(struct address_space *mapping) * background_thresh, to keep the amount of dirty memory low. */ if ((laptop_mode && pages_written) || - (!laptop_mode && ((nr_writeback = global_page_state(NR_FILE_DIRTY) - + global_page_state(NR_UNSTABLE_NFS)) + (!laptop_mode && ((global_page_state(NR_FILE_DIRTY) + + global_page_state(NR_UNSTABLE_NFS)) > background_thresh))) - bdi_start_writeback(bdi, nr_writeback); + bdi_start_writeback(bdi, NULL, 0); } void set_page_dirty_balance(struct page *page, int page_mkwrite) @@ -640,9 +644,10 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, p = &__get_cpu_var(bdp_ratelimits); *p += nr_pages_dirtied; if (unlikely(*p >= ratelimit)) { + ratelimit = sync_writeback_pages(*p); *p = 0; preempt_enable(); - balance_dirty_pages(mapping); + balance_dirty_pages(mapping, ratelimit); return; } preempt_enable(); @@ -686,9 +691,9 @@ static DEFINE_TIMER(laptop_mode_wb_timer, laptop_timer_fn, 0, 0); * sysctl handler for /proc/sys/vm/dirty_writeback_centisecs */ int dirty_writeback_centisecs_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); return 0; } @@ -1149,6 +1154,13 @@ int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page) EXPORT_SYMBOL(redirty_page_for_writepage); /* + * Dirty a page. + * + * For pages with a mapping this should be done under the page lock + * for the benefit of asynchronous memory errors who prefer a consistent + * dirty state. This rule can be broken in some special cases, + * but should be better not to. + * * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5717f27..bf72055 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -234,6 +234,12 @@ static void bad_page(struct page *page) static unsigned long nr_shown; static unsigned long nr_unshown; + /* Don't complain about poisoned pages */ + if (PageHWPoison(page)) { + __ClearPageBuddy(page); + return; + } + /* * Allow a burst of 60 reports, then keep quiet for that minute; * or allow a steady drip of one report per second. @@ -666,7 +672,7 @@ static inline void expand(struct zone *zone, struct page *page, /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static inline int check_new_page(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -675,6 +681,18 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) bad_page(page); return 1; } + return 0; +} + +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + for (i = 0; i < (1 << order); i++) { + struct page *p = page + i; + if (unlikely(check_new_page(p))) + return 1; + } set_page_private(page, 0); set_page_refcounted(page); @@ -2373,7 +2391,7 @@ early_param("numa_zonelist_order", setup_numa_zonelist_order); * sysctl handler for numa_zonelist_order */ int numa_zonelist_order_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, + void __user *buffer, size_t *length, loff_t *ppos) { char saved_string[NUMA_ZONELIST_ORDER_LEN]; @@ -2382,7 +2400,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, if (write) strncpy(saved_string, (char*)table->data, NUMA_ZONELIST_ORDER_LEN); - ret = proc_dostring(table, write, file, buffer, length, ppos); + ret = proc_dostring(table, write, buffer, length, ppos); if (ret) return ret; if (write) { @@ -4706,9 +4724,9 @@ module_init(init_per_zone_wmark_min) * changes. */ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (write) setup_per_zone_wmarks(); return 0; @@ -4716,12 +4734,12 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, #ifdef CONFIG_NUMA int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4732,12 +4750,12 @@ int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write, } int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; int rc; - rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + rc = proc_dointvec_minmax(table, write, buffer, length, ppos); if (rc) return rc; @@ -4758,9 +4776,9 @@ int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write, * if in function of the boot time zone sizes. */ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec_minmax(table, write, file, buffer, length, ppos); + proc_dointvec_minmax(table, write, buffer, length, ppos); setup_per_zone_lowmem_reserve(); return 0; } @@ -4772,13 +4790,13 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, */ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, - struct file *file, void __user *buffer, size_t *length, loff_t *ppos) + void __user *buffer, size_t *length, loff_t *ppos) { struct zone *zone; unsigned int cpu; int ret; - ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; for_each_populated_zone(zone) { diff --git a/mm/percpu.c b/mm/percpu.c index 43d8cac..6af78c1e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1043,7 +1043,9 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void) */ static void *pcpu_alloc(size_t size, size_t align, bool reserved) { + static int warn_limit = 10; struct pcpu_chunk *chunk; + const char *err; int slot, off; if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) { @@ -1059,11 +1061,14 @@ static void *pcpu_alloc(size_t size, size_t align, bool reserved) if (reserved && pcpu_reserved_chunk) { chunk = pcpu_reserved_chunk; if (size > chunk->contig_hint || - pcpu_extend_area_map(chunk) < 0) + pcpu_extend_area_map(chunk) < 0) { + err = "failed to extend area map of reserved chunk"; goto fail_unlock; + } off = pcpu_alloc_area(chunk, size, align); if (off >= 0) goto area_found; + err = "alloc from reserved chunk failed"; goto fail_unlock; } @@ -1080,6 +1085,7 @@ restart: case 1: goto restart; /* pcpu_lock dropped, restart */ default: + err = "failed to extend area map"; goto fail_unlock; } @@ -1093,8 +1099,10 @@ restart: spin_unlock_irq(&pcpu_lock); chunk = alloc_pcpu_chunk(); - if (!chunk) + if (!chunk) { + err = "failed to allocate new chunk"; goto fail_unlock_mutex; + } spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); @@ -1107,6 +1115,7 @@ area_found: if (pcpu_populate_chunk(chunk, off, size)) { spin_lock_irq(&pcpu_lock); pcpu_free_area(chunk, off); + err = "failed to populate"; goto fail_unlock; } @@ -1119,6 +1128,13 @@ fail_unlock: spin_unlock_irq(&pcpu_lock); fail_unlock_mutex: mutex_unlock(&pcpu_alloc_mutex); + if (warn_limit) { + pr_warning("PERCPU: allocation failed, size=%zu align=%zu, " + "%s\n", size, align, err); + dump_stack(); + if (!--warn_limit) + pr_info("PERCPU: limit reached, disable warning\n"); + } return NULL; } @@ -1347,6 +1363,10 @@ struct pcpu_alloc_info * __init pcpu_build_alloc_info( struct pcpu_alloc_info *ai; unsigned int *cpu_map; + /* this function may be called multiple times */ + memset(group_map, 0, sizeof(group_map)); + memset(group_cnt, 0, sizeof(group_map)); + /* * Determine min_unit_size, alloc_size and max_upa such that * alloc_size is multiple of atom_size and is the smallest @@ -1574,6 +1594,7 @@ static void pcpu_dump_alloc_info(const char *lvl, int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, void *base_addr) { + static char cpus_buf[4096] __initdata; static int smap[2], dmap[2]; size_t dyn_size = ai->dyn_size; size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; @@ -1585,17 +1606,26 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, int *unit_map; int group, unit, i; + cpumask_scnprintf(cpus_buf, sizeof(cpus_buf), cpu_possible_mask); + +#define PCPU_SETUP_BUG_ON(cond) do { \ + if (unlikely(cond)) { \ + pr_emerg("PERCPU: failed to initialize, %s", #cond); \ + pr_emerg("PERCPU: cpu_possible_mask=%s\n", cpus_buf); \ + pcpu_dump_alloc_info(KERN_EMERG, ai); \ + BUG(); \ + } \ +} while (0) + /* sanity checks */ BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC || ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC); - BUG_ON(ai->nr_groups <= 0); - BUG_ON(!ai->static_size); - BUG_ON(!base_addr); - BUG_ON(ai->unit_size < size_sum); - BUG_ON(ai->unit_size & ~PAGE_MASK); - BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); - - pcpu_dump_alloc_info(KERN_DEBUG, ai); + PCPU_SETUP_BUG_ON(ai->nr_groups <= 0); + PCPU_SETUP_BUG_ON(!ai->static_size); + PCPU_SETUP_BUG_ON(!base_addr); + PCPU_SETUP_BUG_ON(ai->unit_size < size_sum); + PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK); + PCPU_SETUP_BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE); /* process group information and build config tables accordingly */ group_offsets = alloc_bootmem(ai->nr_groups * sizeof(group_offsets[0])); @@ -1604,7 +1634,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0])); for (cpu = 0; cpu < nr_cpu_ids; cpu++) - unit_map[cpu] = NR_CPUS; + unit_map[cpu] = UINT_MAX; pcpu_first_unit_cpu = NR_CPUS; for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) { @@ -1618,8 +1648,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, if (cpu == NR_CPUS) continue; - BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu)); - BUG_ON(unit_map[cpu] != NR_CPUS); + PCPU_SETUP_BUG_ON(cpu > nr_cpu_ids); + PCPU_SETUP_BUG_ON(!cpu_possible(cpu)); + PCPU_SETUP_BUG_ON(unit_map[cpu] != UINT_MAX); unit_map[cpu] = unit + i; unit_off[cpu] = gi->base_offset + i * ai->unit_size; @@ -1632,7 +1663,11 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_units = unit; for_each_possible_cpu(cpu) - BUG_ON(unit_map[cpu] == NR_CPUS); + PCPU_SETUP_BUG_ON(unit_map[cpu] == UINT_MAX); + + /* we're done parsing the input, undefine BUG macro and dump config */ +#undef PCPU_SETUP_BUG_ON + pcpu_dump_alloc_info(KERN_INFO, ai); pcpu_nr_groups = ai->nr_groups; pcpu_group_offsets = group_offsets; @@ -1782,7 +1817,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, void *base = (void *)ULONG_MAX; void **areas = NULL; struct pcpu_alloc_info *ai; - size_t size_sum, areas_size; + size_t size_sum, areas_size, max_distance; int group, i, rc; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, @@ -1832,8 +1867,25 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size, } /* base address is now known, determine group base offsets */ - for (group = 0; group < ai->nr_groups; group++) + max_distance = 0; + for (group = 0; group < ai->nr_groups; group++) { ai->groups[group].base_offset = areas[group] - base; + max_distance = max_t(size_t, max_distance, + ai->groups[group].base_offset); + } + max_distance += ai->unit_size; + + /* warn if maximum distance is further than 75% of vmalloc space */ + if (max_distance > (VMALLOC_END - VMALLOC_START) * 3 / 4) { + pr_warning("PERCPU: max_distance=0x%zx too large for vmalloc " + "space 0x%lx\n", + max_distance, VMALLOC_END - VMALLOC_START); +#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK + /* and fail if we have fallback */ + rc = -EINVAL; + goto out_free; +#endif + } pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, diff --git a/mm/quicklist.c b/mm/quicklist.c index 6eedf7e..6633965 100644 --- a/mm/quicklist.c +++ b/mm/quicklist.c @@ -29,7 +29,6 @@ static unsigned long max_pages(unsigned long min_pages) int node = numa_node_id(); struct zone *zones = NODE_DATA(node)->node_zones; int num_cpus_on_node; - const struct cpumask *cpumask_on_node = cpumask_of_node(node); node_free_pages = #ifdef CONFIG_ZONE_DMA @@ -42,7 +41,7 @@ static unsigned long max_pages(unsigned long min_pages) max = node_free_pages / FRACTION_OF_NODE_MEM; - num_cpus_on_node = cpus_weight_nr(*cpumask_on_node); + num_cpus_on_node = cpumask_weight(cpumask_of_node(node)); max /= num_cpus_on_node; return max(max, min_pages); @@ -36,6 +36,11 @@ * mapping->tree_lock (widely used, in set_page_dirty, * in arch-dependent flush_dcache_mmap_lock, * within inode_lock in __sync_single_inode) + * + * (code doesn't rely on that order so it could be switched around) + * ->tasklist_lock + * anon_vma->lock (memory_failure, collect_procs_anon) + * pte map lock */ #include <linux/mm.h> @@ -191,7 +196,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -211,7 +216,7 @@ out: return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -237,8 +242,8 @@ vma_address(struct page *page, struct vm_area_struct *vma) } /* - * At what user virtual address is page expected in vma? checking that the - * page matches the vma: currently only used on anon pages, by unuse_vma; + * At what user virtual address is page expected in vma? + * checking that the page matches the vma. */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -311,7 +316,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * if the page is not mapped into the page tables of this VMA. Only * valid for normal file or anonymous VMAs. */ -static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) { unsigned long address; pte_t *pte; @@ -756,7 +761,7 @@ void page_remove_rmap(struct page *page) * repeatedly from either try_to_unmap_anon or try_to_unmap_file. */ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, - int migration) + enum ttu_flags flags) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -778,11 +783,13 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration) { + if (!(flags & TTU_IGNORE_MLOCK)) { if (vma->vm_flags & VM_LOCKED) { ret = SWAP_MLOCK; goto out_unmap; } + } + if (!(flags & TTU_IGNORE_ACCESS)) { if (ptep_clear_flush_young_notify(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; @@ -800,7 +807,14 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, /* Update high watermark before we lower rss */ update_hiwater_rss(mm); - if (PageAnon(page)) { + if (PageHWPoison(page) && !(flags & TTU_IGNORE_HWPOISON)) { + if (PageAnon(page)) + dec_mm_counter(mm, anon_rss); + else + dec_mm_counter(mm, file_rss); + set_pte_at(mm, address, pte, + swp_entry_to_pte(make_hwpoison_entry(page))); + } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; if (PageSwapCache(page)) { @@ -822,12 +836,12 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte. do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(!migration); + BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); BUG_ON(pte_file(*pte)); - } else if (PAGE_MIGRATION && migration) { + } else if (PAGE_MIGRATION && (TTU_ACTION(flags) == TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -996,12 +1010,13 @@ static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_anon(struct page *page, int unlock, int migration) +static int try_to_unmap_anon(struct page *page, enum ttu_flags flags) { struct anon_vma *anon_vma; struct vm_area_struct *vma; unsigned int mlocked = 0; int ret = SWAP_AGAIN; + int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1017,7 +1032,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) continue; /* must visit all unlocked vmas */ ret = SWAP_MLOCK; /* saw at least one mlocked vma */ } else { - ret = try_to_unmap_one(page, vma, migration); + ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) break; } @@ -1041,8 +1056,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) /** * try_to_unmap_file - unmap/unlock file page using the object-based rmap method * @page: the page to unmap/unlock - * @unlock: request for unlock rather than unmap [unlikely] - * @migration: unmapping for migration - ignored if @unlock + * @flags: action and flags * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. @@ -1054,7 +1068,7 @@ static int try_to_unmap_anon(struct page *page, int unlock, int migration) * vm_flags for that VMA. That should be OK, because that vma shouldn't be * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int unlock, int migration) +static int try_to_unmap_file(struct page *page, enum ttu_flags flags) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -1066,6 +1080,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) unsigned long max_nl_size = 0; unsigned int mapcount; unsigned int mlocked = 0; + int unlock = TTU_ACTION(flags) == TTU_MUNLOCK; if (MLOCK_PAGES && unlikely(unlock)) ret = SWAP_SUCCESS; /* default for try_to_munlock() */ @@ -1078,7 +1093,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) continue; /* must visit all vmas */ ret = SWAP_MLOCK; } else { - ret = try_to_unmap_one(page, vma, migration); + ret = try_to_unmap_one(page, vma, flags); if (ret == SWAP_FAIL || !page_mapped(page)) goto out; } @@ -1103,7 +1118,8 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) ret = SWAP_MLOCK; /* leave mlocked == 0 */ goto out; /* no need to look further */ } - if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) + if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && + (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -1137,7 +1153,7 @@ static int try_to_unmap_file(struct page *page, int unlock, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (!MLOCK_PAGES && !migration && + if (!MLOCK_PAGES && !(flags & TTU_IGNORE_MLOCK) && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; @@ -1177,7 +1193,7 @@ out: /** * try_to_unmap - try to remove all page table mappings to a page * @page: the page to get unmapped - * @migration: migration flag + * @flags: action and flags * * Tries to remove all the page table entries which are mapping this * page, used in the pageout path. Caller must hold the page lock. @@ -1188,16 +1204,16 @@ out: * SWAP_FAIL - the page is unswappable * SWAP_MLOCK - page is mlocked. */ -int try_to_unmap(struct page *page, int migration) +int try_to_unmap(struct page *page, enum ttu_flags flags) { int ret; BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, 0, migration); + ret = try_to_unmap_anon(page, flags); else - ret = try_to_unmap_file(page, 0, migration); + ret = try_to_unmap_file(page, flags); if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; @@ -1222,8 +1238,8 @@ int try_to_munlock(struct page *page) VM_BUG_ON(!PageLocked(page) || PageLRU(page)); if (PageAnon(page)) - return try_to_unmap_anon(page, 1, 0); + return try_to_unmap_anon(page, TTU_MUNLOCK); else - return try_to_unmap_file(page, 1, 0); + return try_to_unmap_file(page, TTU_MUNLOCK); } @@ -218,7 +218,7 @@ static const struct file_operations shmem_file_operations; static const struct inode_operations shmem_inode_operations; static const struct inode_operations shmem_dir_inode_operations; static const struct inode_operations shmem_special_inode_operations; -static struct vm_operations_struct shmem_vm_ops; +static const struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ @@ -1046,8 +1046,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * sync from ever calling shmem_writepage; but a stacking filesystem * may use the ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for pdflush or sync. However, in those cases, we do still - * want to check if there's a redundant swappage to be discarded. + * and not for the writeback threads or sync. However, in those cases, + * we do still want to check if there's a redundant swappage to be + * discarded. */ if (wbc->for_reclaim) swap = get_swap_page(); @@ -1633,8 +1634,8 @@ shmem_write_end(struct file *file, struct address_space *mapping, if (pos + copied > inode->i_size) i_size_write(inode, pos + copied); - unlock_page(page); set_page_dirty(page); + unlock_page(page); page_cache_release(page); return copied; @@ -1971,13 +1972,13 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s iput(inode); return error; } - unlock_page(page); inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, symname, len); kunmap_atomic(kaddr, KM_USER0); set_page_dirty(page); + unlock_page(page); page_cache_release(page); } if (dir->i_mode & S_ISGID) @@ -2420,6 +2421,7 @@ static const struct address_space_operations shmem_aops = { .write_end = shmem_write_end, #endif .migratepage = migrate_page, + .error_remove_page = generic_error_remove_page, }; static const struct file_operations shmem_file_operations = { @@ -2496,7 +2498,7 @@ static const struct super_operations shmem_ops = { .put_super = shmem_put_super, }; -static struct vm_operations_struct shmem_vm_ops = { +static const struct vm_operations_struct shmem_vm_ops = { .fault = shmem_fault, #ifdef CONFIG_NUMA .set_policy = shmem_set_policy, diff --git a/mm/swapfile.c b/mm/swapfile.c index f1bf19d..a1bc6b9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -699,7 +699,7 @@ int free_swap_and_cache(swp_entry_t entry) struct swap_info_struct *p; struct page *page = NULL; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return 1; p = swap_info_get(entry); @@ -1974,12 +1974,14 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } - if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { - p->flags |= SWP_SOLIDSTATE; - p->cluster_next = 1 + (random32() % p->highest_bit); + if (p->bdev) { + if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { + p->flags |= SWP_SOLIDSTATE; + p->cluster_next = 1 + (random32() % p->highest_bit); + } + if (discard_swap(p) == 0) + p->flags |= SWP_DISCARDABLE; } - if (discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; mutex_lock(&swapon_mutex); spin_lock(&swap_lock); @@ -2085,7 +2087,7 @@ static int __swap_duplicate(swp_entry_t entry, bool cache) int count; bool has_cache; - if (is_migration_entry(entry)) + if (non_swap_entry(entry)) return -EINVAL; type = swp_type(entry); diff --git a/mm/truncate.c b/mm/truncate.c index ccc3ecf..450cebd 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -93,11 +93,11 @@ EXPORT_SYMBOL(cancel_dirty_page); * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void +static int truncate_complete_page(struct address_space *mapping, struct page *page) { if (page->mapping != mapping) - return; + return -EIO; if (page_has_private(page)) do_invalidatepage(page, 0); @@ -108,6 +108,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ + return 0; } /* @@ -135,6 +136,51 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) return ret; } +int truncate_inode_page(struct address_space *mapping, struct page *page) +{ + if (page_mapped(page)) { + unmap_mapping_range(mapping, + (loff_t)page->index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); + } + return truncate_complete_page(mapping, page); +} + +/* + * Used to get rid of pages on hardware memory corruption. + */ +int generic_error_remove_page(struct address_space *mapping, struct page *page) +{ + if (!mapping) + return -EINVAL; + /* + * Only punch for normal data pages for now. + * Handling other types like directories would need more auditing. + */ + if (!S_ISREG(mapping->host->i_mode)) + return -EIO; + return truncate_inode_page(mapping, page); +} +EXPORT_SYMBOL(generic_error_remove_page); + +/* + * Safely invalidate one page from its pagecache mapping. + * It only drops clean, unused pages. The page must be locked. + * + * Returns 1 if the page is successfully invalidated, otherwise 0. + */ +int invalidate_inode_page(struct page *page) +{ + struct address_space *mapping = page_mapping(page); + if (!mapping) + return 0; + if (PageDirty(page) || PageWriteback(page)) + return 0; + if (page_mapped(page)) + return 0; + return invalidate_complete_page(mapping, page); +} + /** * truncate_inode_pages - truncate range of pages specified by start & end byte offsets * @mapping: mapping to truncate @@ -196,12 +242,7 @@ void truncate_inode_pages_range(struct address_space *mapping, unlock_page(page); continue; } - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } - truncate_complete_page(mapping, page); + truncate_inode_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -238,15 +279,10 @@ void truncate_inode_pages_range(struct address_space *mapping, break; lock_page(page); wait_on_page_writeback(page); - if (page_mapped(page)) { - unmap_mapping_range(mapping, - (loff_t)page->index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); - } + truncate_inode_page(mapping, page); if (page->index > next) next = page->index; next++; - truncate_complete_page(mapping, page); unlock_page(page); } pagevec_release(&pvec); @@ -311,12 +347,8 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (lock_failed) continue; - if (PageDirty(page) || PageWriteback(page)) - goto unlock; - if (page_mapped(page)) - goto unlock; - ret += invalidate_complete_page(mapping, page); -unlock: + ret += invalidate_inode_page(page); + unlock_page(page); if (next > end) break; @@ -465,3 +497,67 @@ int invalidate_inode_pages2(struct address_space *mapping) return invalidate_inode_pages2_range(mapping, 0, -1); } EXPORT_SYMBOL_GPL(invalidate_inode_pages2); + +/** + * truncate_pagecache - unmap and remove pagecache that has been truncated + * @inode: inode + * @old: old file offset + * @new: new file offset + * + * inode's new i_size must already be written before truncate_pagecache + * is called. + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +{ + if (new < old) { + struct address_space *mapping = inode->i_mapping; + + /* + * unmap_mapping_range is called twice, first simply for + * efficiency so that truncate_inode_pages does fewer + * single-page unmaps. However after this first call, and + * before truncate_inode_pages finishes, it is possible for + * private pages to be COWed, which remain after + * truncate_inode_pages finishes, hence the second + * unmap_mapping_range call must be made for correctness. + */ + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + truncate_inode_pages(mapping, new); + unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + } +} +EXPORT_SYMBOL(truncate_pagecache); + +/** + * vmtruncate - unmap mappings "freed" by truncate() syscall + * @inode: inode of the file used + * @offset: file offset to start truncating + * + * NOTE! We have to be ready to update the memory sharing + * between the file and the memory map for a potential last + * incomplete page. Ugly, but necessary. + */ +int vmtruncate(struct inode *inode, loff_t offset) +{ + loff_t oldsize; + int error; + + error = inode_newsize_ok(inode, offset); + if (error) + return error; + oldsize = inode->i_size; + i_size_write(inode, offset); + truncate_pagecache(inode, oldsize, offset); + if (inode->i_op->truncate) + inode->i_op->truncate(inode); + + return error; +} +EXPORT_SYMBOL(vmtruncate); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 69511e6..0f551a4 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -12,6 +12,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> +#include <linux/sched.h> #include <linux/slab.h> #include <linux/spinlock.h> #include <linux/interrupt.h> @@ -25,10 +26,10 @@ #include <linux/rcupdate.h> #include <linux/pfn.h> #include <linux/kmemleak.h> -#include <linux/highmem.h> #include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> +#include <asm/shmparam.h> /*** Page table manipulation functions ***/ @@ -1156,12 +1157,11 @@ static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, } static struct vm_struct *__get_vm_area_node(unsigned long size, - unsigned long flags, unsigned long start, unsigned long end, - int node, gfp_t gfp_mask, void *caller) + unsigned long align, unsigned long flags, unsigned long start, + unsigned long end, int node, gfp_t gfp_mask, void *caller) { static struct vmap_area *va; struct vm_struct *area; - unsigned long align = 1; BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { @@ -1201,7 +1201,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, unsigned long start, unsigned long end) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, __builtin_return_address(0)); } EXPORT_SYMBOL_GPL(__get_vm_area); @@ -1210,7 +1210,7 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller) { - return __get_vm_area_node(size, flags, start, end, -1, GFP_KERNEL, + return __get_vm_area_node(size, 1, flags, start, end, -1, GFP_KERNEL, caller); } @@ -1225,22 +1225,22 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, __builtin_return_address(0)); } struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, void *caller) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, -1, GFP_KERNEL, caller); } struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, int node, gfp_t gfp_mask) { - return __get_vm_area_node(size, flags, VMALLOC_START, VMALLOC_END, node, - gfp_mask, __builtin_return_address(0)); + return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + node, gfp_mask, __builtin_return_address(0)); } static struct vm_struct *find_vm_area(const void *addr) @@ -1403,7 +1403,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, int node, void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) @@ -1417,7 +1418,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, area->nr_pages = nr_pages; /* Please note that the recursion is strictly bounded. */ if (array_size > PAGE_SIZE) { - pages = __vmalloc_node(array_size, gfp_mask | __GFP_ZERO, + pages = __vmalloc_node(array_size, 1, gfp_mask | __GFP_ZERO, PAGE_KERNEL, node, caller); area->flags |= VM_VPAGES; } else { @@ -1476,6 +1477,7 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) /** * __vmalloc_node - allocate virtually contiguous memory * @size: allocation size + * @align: desired alignment * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1485,8 +1487,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) { struct vm_struct *area; void *addr; @@ -1496,8 +1499,8 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, VM_ALLOC, VMALLOC_START, VMALLOC_END, - node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, + VMALLOC_END, node, gfp_mask, caller); if (!area) return NULL; @@ -1516,7 +1519,7 @@ static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { - return __vmalloc_node(size, gfp_mask, prot, -1, + return __vmalloc_node(size, 1, gfp_mask, prot, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(__vmalloc); @@ -1532,7 +1535,7 @@ EXPORT_SYMBOL(__vmalloc); */ void *vmalloc(unsigned long size) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc); @@ -1549,7 +1552,8 @@ void *vmalloc_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, + ret = __vmalloc_node(size, SHMLBA, + GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); @@ -1572,7 +1576,7 @@ EXPORT_SYMBOL(vmalloc_user); */ void *vmalloc_node(unsigned long size, int node) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL, node, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_node); @@ -1595,7 +1599,7 @@ EXPORT_SYMBOL(vmalloc_node); void *vmalloc_exec(unsigned long size) { - return __vmalloc_node(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, -1, __builtin_return_address(0)); } @@ -1616,7 +1620,7 @@ void *vmalloc_exec(unsigned long size) */ void *vmalloc_32(unsigned long size) { - return __vmalloc_node(size, GFP_VMALLOC32, PAGE_KERNEL, + return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL, -1, __builtin_return_address(0)); } EXPORT_SYMBOL(vmalloc_32); @@ -1633,7 +1637,7 @@ void *vmalloc_32_user(unsigned long size) struct vm_struct *area; void *ret; - ret = __vmalloc_node(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, + ret = __vmalloc_node(size, 1, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL, -1, __builtin_return_address(0)); if (ret) { area = find_vm_area(ret); diff --git a/mm/vmscan.c b/mm/vmscan.c index 613e89f..64e43889 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -663,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, 0)) { + switch (try_to_unmap(page, TTU_UNMAP)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -1709,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed @@ -1836,11 +1836,45 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + unsigned int swappiness, + struct zone *zone, int nid) +{ + struct scan_control sc = { + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = swappiness, + .order = 0, + .mem_cgroup = mem, + .isolate_pages = mem_cgroup_isolate_pages, + }; + nodemask_t nm = nodemask_of_node(nid); + + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + sc.nodemask = &nm; + sc.nr_reclaimed = 0; + sc.nr_scanned = 0; + /* + * NOTE: Although we can get the priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * if we don't reclaim here, the shrink_zone from balance_pgdat + * will pick up pages from other mem cgroup's as well. We hack + * the priority and make it zero. + */ + shrink_zone(0, zone, &sc); + return sc.nr_reclaimed; +} + unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, unsigned int swappiness) { + struct zonelist *zonelist; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, @@ -1852,7 +1886,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; - struct zonelist *zonelist; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -1974,6 +2007,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + int nid, zid; if (!populated_zone(zone)) continue; @@ -1988,6 +2022,15 @@ loop_again: temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); + + nid = pgdat->node_id; + zid = zone_idx(zone); + /* + * Call soft limit reclaim before calling shrink_zone. + * For now we ignore the return value + */ + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, + nid, zid); /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. @@ -2801,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void) unsigned long scan_unevictable_pages; int scan_unevictable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write && *(unsigned long *)table->data) scan_all_zones_unevictable_pages(); |