From d6b09e754c23b657544f6e7f39fbf7de24c58aa2 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 31 Jul 2012 16:41:31 -0700 Subject: fault-injection: fix failcmd.sh warning "fault-injection: add tool to run command with failslab or fail_page_alloc" added tools/testing/fault-injection/failcmd.sh to make it easier to inject slab/page allocation failures by fault injection. failcmd.sh prints the following warning when running with arguments for command. # ./failcmd.sh echo aaa failcmd.sh: line 209: [: echo: binary operator expected aaa This warning is caused by an improper check whether at least one parameter is left after parsing command options. Fix it by testing the length of $1 instead of $@ Signed-off-by: Akinobu Mita Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/testing/fault-injection/failcmd.sh b/tools/testing/fault-injection/failcmd.sh index 1776e92..78a9ed7 100644 --- a/tools/testing/fault-injection/failcmd.sh +++ b/tools/testing/fault-injection/failcmd.sh @@ -206,7 +206,7 @@ while true; do esac done -[ -z "$@" ] && exit 0 +[ -z "$1" ] && exit 0 echo $oom_kill_allocating_task > /proc/sys/vm/oom_kill_allocating_task echo $task_filter > $FAULTATTR/task-filter -- cgit v0.10.2 From c66af411189fdae3bcb301fae830f078fc205fca Mon Sep 17 00:00:00 2001 From: Fengguang Wu Date: Tue, 31 Jul 2012 16:41:33 -0700 Subject: xtensa: select generic atomic64_t support This will fix build errors: block/blk-cgroup.c:609:2: error: unknown type name 'atomic64_t' block/blk-cgroup.c:609:2: error: implicit declaration of function 'ATOMIC64_INIT' [-Werror=implicit-function-declaration] Signed-off-by: Fengguang Wu Cc: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 8a3f835..8ed64cf 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -7,6 +7,7 @@ config ZONE_DMA config XTENSA def_bool y select HAVE_IDE + select GENERIC_ATOMIC64 select HAVE_GENERIC_HARDIRQS select GENERIC_IRQ_SHOW select GENERIC_CPU_DEVICES -- cgit v0.10.2 From c2cddf991974a00aa7b40a21e829bc034b8199b6 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jul 2012 16:41:34 -0700 Subject: drivers/media/video/v4l2-ioctl.c: fix build Fix zillions of these: drivers/media/video/v4l2-ioctl.c:1848: error: unknown field 'func' specified in initializer drivers/media/video/v4l2-ioctl.c:1848: warning: missing braces around initializer drivers/media/video/v4l2-ioctl.c:1848: warning: (near initialization for 'v4l2_ioctls[0].') drivers/media/video/v4l2-ioctl.c:1848: warning: initialization makes integer from pointer without a cast drivers/media/video/v4l2-ioctl.c:1848: error: initializer element is not computable at load time drivers/media/video/v4l2-ioctl.c:1848: error: (near initialization for 'v4l2_ioctls[0]..offset') Cc: Mauro Carvalho Chehab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/media/video/v4l2-ioctl.c b/drivers/media/video/v4l2-ioctl.c index 70e0efb..4b4059d 100644 --- a/drivers/media/video/v4l2-ioctl.c +++ b/drivers/media/video/v4l2-ioctl.c @@ -1831,7 +1831,9 @@ struct v4l2_ioctl_info { .ioctl = _ioctl, \ .flags = _flags | INFO_FL_STD, \ .name = #_ioctl, \ - .offset = offsetof(struct v4l2_ioctl_ops, _vidioc), \ + { \ + .offset = offsetof(struct v4l2_ioctl_ops, _vidioc),\ + }, \ .debug = _debug, \ } @@ -1840,7 +1842,9 @@ struct v4l2_ioctl_info { .ioctl = _ioctl, \ .flags = _flags | INFO_FL_FUNC, \ .name = #_ioctl, \ - .func = _func, \ + { \ + .func = _func, \ + }, \ .debug = 
_debug, \ } -- cgit v0.10.2 From 92ca922f0a19145f2dcc99d84fe656fa55b52c2e Mon Sep 17 00:00:00 2001 From: Hong zhi guo Date: Tue, 31 Jul 2012 16:41:35 -0700 Subject: vmalloc: walk vmap_areas by sorted list instead of rb_next() There's a walk by repeating rb_next to find a suitable hole. Could be simply replaced by walk on the sorted vmap_area_list. More simpler and efficient. Mutation of the list and tree only happens in pair within __insert_vmap_area and __free_vmap_area, under protection of vmap_area_lock. The patch code is also under vmap_area_lock, so the list walk is safe, and consistent with the tree walk. Tested on SMP by repeating batch of vmalloc anf vfree for random sizes and rounds for hours. Signed-off-by: Hong Zhiguo Cc: Nick Piggin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index e03f4c7..7e25ee3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -413,11 +413,11 @@ nocache: if (addr + size - 1 < addr) goto overflow; - n = rb_next(&first->rb_node); - if (n) - first = rb_entry(n, struct vmap_area, rb_node); - else + if (list_is_last(&first->list, &vmap_area_list)) goto found; + + first = list_entry(first->list.next, + struct vmap_area, list); } found: -- cgit v0.10.2 From aa91c4d898c062804f4d0a5da6d8ab013cd0e868 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 31 Jul 2012 16:41:37 -0700 Subject: mm: make vb_alloc() more foolproof If someone calls vb_alloc() (or vm_map_ram() for that matter) to allocate 0 bytes (0 pages), get_order() returns BITS_PER_LONG - PAGE_CACHE_SHIFT and interesting stuff happens. So make debugging such problems easier and warn about 0-size allocation. [akpm@linux-foundation.org: use WARN_ON-return-value feature] Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7e25ee3..2bb90b1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -904,6 +904,14 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + if (WARN_ON(size == 0)) { + /* + * Allocating 0 bytes isn't what caller wants since + * get_order(0) returns funny result. Just warn and terminate + * early. + */ + return NULL; + } order = get_order(size); again: -- cgit v0.10.2 From bff6bb83f38105b39b0cc3a9ad81103edbb56f7a Mon Sep 17 00:00:00 2001 From: Kamezawa Hiroyuki Date: Tue, 31 Jul 2012 16:41:38 -0700 Subject: memcg: rename MEM_CGROUP_STAT_SWAPOUT as MEM_CGROUP_STAT_SWAP MEM_CGROUP_STAT_SWAPOUT represents the usage of swap rather than the number of swap-out events. Rename it to be MEM_CGROUP_STAT_SWAP. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Acked-by: Hugh Dickins Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f72b5e5..91ce7b2 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -87,7 +87,7 @@ enum mem_cgroup_stat_index { MEM_CGROUP_STAT_CACHE, /* # of pages charged as cache */ MEM_CGROUP_STAT_RSS, /* # of pages charged as anon rss */ MEM_CGROUP_STAT_FILE_MAPPED, /* # of pages charged as file rss */ - MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */ + MEM_CGROUP_STAT_SWAP, /* # of pages, swapped out */ MEM_CGROUP_STAT_NSTATS, }; @@ -703,7 +703,7 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, bool charge) { int val = (charge) ? 
1 : -1; - this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAPOUT], val); + this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val); } static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg, @@ -3831,7 +3831,7 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); if (swap) - val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAPOUT); + val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); return val << PAGE_SHIFT; } @@ -4082,7 +4082,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, unsigned int i; for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { - if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); @@ -4109,7 +4109,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { long long val = 0; - if (i == MEM_CGROUP_STAT_SWAPOUT && !do_swap_account) + if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) continue; for_each_mem_cgroup_tree(mi, memcg) val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; -- cgit v0.10.2 From 41326c17fc1b45289a82567553249f60d3638a23 Mon Sep 17 00:00:00 2001 From: Kamezawa Hiroyuki Date: Tue, 31 Jul 2012 16:41:40 -0700 Subject: memcg: rename MEM_CGROUP_CHARGE_TYPE_MAPPED as MEM_CGROUP_CHARGE_TYPE_ANON Now, in memcg, 2 "MAPPED" enum/macro are found MEM_CGROUP_CHARGE_TYPE_MAPPED MEM_CGROUP_STAT_FILE_MAPPED Thier names looks similar to each other but the former is used for accounting anonymous memory. rename it as TYPE_ANON. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 91ce7b2..1b3dd27 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -378,7 +378,7 @@ static bool move_file(void) enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, - MEM_CGROUP_CHARGE_TYPE_MAPPED, + MEM_CGROUP_CHARGE_TYPE_ANON, MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ @@ -2519,7 +2519,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, spin_unlock_irq(&zone->lru_lock); } - if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) + if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) anon = true; else anon = false; @@ -2728,7 +2728,7 @@ int mem_cgroup_newpage_charge(struct page *page, VM_BUG_ON(page->mapping && !PageAnon(page)); VM_BUG_ON(!mm); return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_ANON); } static void @@ -2842,7 +2842,7 @@ void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg) { __mem_cgroup_commit_charge_swapin(page, memcg, - MEM_CGROUP_CHARGE_TYPE_MAPPED); + MEM_CGROUP_CHARGE_TYPE_ANON); } void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) @@ -2945,7 +2945,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) anon = PageAnon(page); switch (ctype) { - case MEM_CGROUP_CHARGE_TYPE_MAPPED: + case MEM_CGROUP_CHARGE_TYPE_ANON: /* * Generally PageAnon tells if it's the anon statistics to be * updated; but sometimes e.g. 
mem_cgroup_uncharge_page() is @@ -3005,7 +3005,7 @@ void mem_cgroup_uncharge_page(struct page *page) if (page_mapped(page)) return; VM_BUG_ON(page->mapping && !PageAnon(page)); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON); } void mem_cgroup_uncharge_cache_page(struct page *page) @@ -3248,7 +3248,7 @@ int mem_cgroup_prepare_migration(struct page *page, * mapcount will be finally 0 and we call uncharge in end_migration(). */ if (PageAnon(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED; + ctype = MEM_CGROUP_CHARGE_TYPE_ANON; else if (page_is_file_cache(page)) ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else @@ -3287,7 +3287,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, unlock_page_cgroup(pc); anon = PageAnon(used); __mem_cgroup_uncharge_common(unused, - anon ? MEM_CGROUP_CHARGE_TYPE_MAPPED + anon ? MEM_CGROUP_CHARGE_TYPE_ANON : MEM_CGROUP_CHARGE_TYPE_CACHE); /* -- cgit v0.10.2 From a7d6f529fe1d96a477614eb93f40213d133029e6 Mon Sep 17 00:00:00 2001 From: Kamezawa Hiroyuki Date: Tue, 31 Jul 2012 16:41:41 -0700 Subject: memcg: remove MEM_CGROUP_CHARGE_TYPE_FORCE There are no users since commit b24028572fb69 ("memcg: remove PCG_CACHE"). Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b3dd27..1940ba8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -380,7 +380,6 @@ enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_ANON, MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ - MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ NR_CHARGE_TYPE, -- cgit v0.10.2 From 3fb5c298b04eb6e472f8db1f0fb472749d30041c Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Tue, 31 Jul 2012 16:41:44 -0700 Subject: swap: allow swap readahead to be merged Swap readahead works fine, but the I/O to disk is almost always done in page size requests, despite the fact that readahead submits 1<<page_cluster pages at a time. Signed-off-by: Christian Ehrhardt Acked-by: Rik van Riel Acked-by: Jens Axboe Reviewed-by: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swap_state.c b/mm/swap_state.c index 4c5ff7f..c85b559 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -376,6 +377,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, unsigned long offset = swp_offset(entry); unsigned long start_offset, end_offset; unsigned long mask = (1UL << page_cluster) - 1; + struct blk_plug plug; /* Read a page_cluster sized and aligned cluster around offset. */ start_offset = offset & ~mask; @@ -383,6 +385,7 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, if (!start_offset) /* First page is swap header. 
*/ start_offset++; + blk_start_plug(&plug); for (offset = start_offset; offset <= end_offset ; offset++) { /* Ok, do the async read-ahead now */ page = read_swap_cache_async(swp_entry(swp_type(entry), offset), @@ -391,6 +394,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask, continue; page_cache_release(page); } + blk_finish_plug(&plug); + lru_add_drain(); /* Push any new pages onto the LRU now */ return read_swap_cache_async(entry, gfp_mask, vma, addr); } -- cgit v0.10.2 From df858fa8276f85106f2f5c3cd49c1fa524058070 Mon Sep 17 00:00:00 2001 From: Christian Ehrhardt Date: Tue, 31 Jul 2012 16:41:46 -0700 Subject: documentation: update how page-cluster affects swap I/O Fix of the documentation of /proc/sys/vm/page-cluster to match the behavior of the code and add some comments about what the tunable will change in that behavior. Signed-off-by: Christian Ehrhardt Acked-by: Jens Axboe Reviewed-by: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 96f0ee8..84eb25c 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -574,16 +574,24 @@ of physical RAM. See above. page-cluster -page-cluster controls the number of pages which are written to swap in -a single attempt. The swap I/O size. +page-cluster controls the number of pages up to which consecutive pages +are read in from swap in a single attempt. This is the swap counterpart +to page cache readahead. +The mentioned consecutivity is not in terms of virtual/physical addresses, +but consecutive on swap space - that means they were swapped out together. It is a logarithmic value - setting it to zero means "1 page", setting it to 1 means "2 pages", setting it to 2 means "4 pages", etc. +Zero disables swap readahead completely. The default value is three (eight pages at a time). There may be some small benefits in tuning this to a different value if your workload is swap-intensive. +Lower values mean lower latencies for initial faults, but at the same time +extra faults and I/O delays for following faults if they would have been part of +that consecutive pages readahead would have brought in. + ============================================================= panic_on_oom -- cgit v0.10.2 From 44de9d0cad41f2c51ef26916842be046b582dcc9 Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Tue, 31 Jul 2012 16:41:49 -0700 Subject: mm: account the total_vm in the vm_stat_account() vm_stat_account() accounts the shared_vm, stack_vm and reserved_vm now. But we can also account for total_vm in the vm_stat_account() which makes the code tidy. Even for mprotect_fixup(), we can get the right result in the end. 
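As a standalone sketch of the consolidation described above (simplified counters and illustrative flag values, not the kernel's mm_struct or VM_* definitions), vm_stat_account() now bumps total_vm itself, so callers no longer need to open-code mm->total_vm updates:

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-ins for the counters vm_stat_account() maintains. */
struct mm_counters {
    long total_vm;
    long shared_vm;
    long stack_vm;
};

#define VM_SHARED    0x1UL    /* illustrative flag values */
#define VM_GROWSDOWN 0x2UL

/* After the patch: total_vm is adjusted here, alongside the other counters. */
static void vm_stat_account(struct mm_counters *mm, unsigned long flags,
                            bool has_file, long pages)
{
    mm->total_vm += pages;              /* previously open-coded by each caller */
    if (has_file && (flags & VM_SHARED))
        mm->shared_vm += pages;
    if (flags & VM_GROWSDOWN)
        mm->stack_vm += pages;
}

int main(void)
{
    struct mm_counters mm = { 0, 0, 0 };

    vm_stat_account(&mm, VM_SHARED, true, 16);     /* map 16 pages of a shared file */
    vm_stat_account(&mm, VM_GROWSDOWN, false, -4); /* shrink a stack mapping by 4 pages */
    printf("total=%ld shared=%ld stack=%ld\n",
           mm.total_vm, mm.shared_vm, mm.stack_vm);
    return 0;
}

The hunks below then simply delete the open-coded mm->total_vm adjustments in the ia64 perfmon code, dup_mmap(), mmap and mremap.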
Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index d7f558c..3fa4bc5 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -2353,7 +2353,6 @@ pfm_smpl_buffer_alloc(struct task_struct *task, struct file *filp, pfm_context_t */ insert_vm_struct(mm, vma); - mm->total_vm += size >> PAGE_SHIFT; vm_stat_account(vma->vm_mm, vma->vm_flags, vma->vm_file, vma_pages(vma)); up_write(&task->mm->mmap_sem); diff --git a/include/linux/mm.h b/include/linux/mm.h index f9f279c..3955bed 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1528,6 +1528,7 @@ void vm_stat_account(struct mm_struct *, unsigned long, struct file *, long); static inline void vm_stat_account(struct mm_struct *mm, unsigned long flags, struct file *file, long pages) { + mm->total_vm += pages; } #endif /* CONFIG_PROC_FS */ diff --git a/kernel/fork.c b/kernel/fork.c index 8efac1f..aaa8813 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -381,10 +381,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - long pages = vma_pages(mpnt); - mm->total_vm -= pages; vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -pages); + -vma_pages(mpnt)); continue; } charge = 0; diff --git a/mm/mmap.c b/mm/mmap.c index 3edfcdf..1ee2fd8 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -943,6 +943,8 @@ void vm_stat_account(struct mm_struct *mm, unsigned long flags, const unsigned long stack_flags = VM_STACK_FLAGS & (VM_GROWSUP|VM_GROWSDOWN); + mm->total_vm += pages; + if (file) { mm->shared_vm += pages; if ((flags & (VM_EXEC|VM_WRITE)) == VM_EXEC) @@ -1347,7 +1349,6 @@ munmap_back: out: perf_event_mmap(vma); - mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { if (!mlock_vma_pages_range(vma, addr, addr + len)) @@ -1707,7 +1708,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns return -ENOMEM; /* Ok, everything looks good - let it rip */ - mm->total_vm += grow; if (vma->vm_flags & VM_LOCKED) mm->locked_vm += grow; vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow); @@ -1889,7 +1889,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; - mm->total_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); diff --git a/mm/mremap.c b/mm/mremap.c index 21fed20..cc06d0e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -260,7 +260,6 @@ static unsigned long move_vma(struct vm_area_struct *vma, * If this were a serious issue, we'd add a flag to do_munmap(). 
*/ hiwater_vm = mm->hiwater_vm; - mm->total_vm += new_len >> PAGE_SHIFT; vm_stat_account(mm, vma->vm_flags, vma->vm_file, new_len>>PAGE_SHIFT); if (do_munmap(mm, old_addr, old_len) < 0) { @@ -497,7 +496,6 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, goto out; } - mm->total_vm += pages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; -- cgit v0.10.2 From deaf386ee58d5336bbef8959bf304573afb67c20 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:41:51 -0700 Subject: mm/buddy: cleanup on should_fail_alloc_page Currently, function should_fail() has "bool" for its return value, so it's reasonable to change the return value of function should_fail_alloc_page() into "bool" as well. The patch does cleanup on function should_fail_alloc_page() to have "bool" for its return value. Signed-off-by: Gavin Shan Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4a4f921..80ef992 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1529,16 +1529,16 @@ static int __init setup_fail_page_alloc(char *str) } __setup("fail_page_alloc=", setup_fail_page_alloc); -static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { if (order < fail_page_alloc.min_order) - return 0; + return false; if (gfp_mask & __GFP_NOFAIL) - return 0; + return false; if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM)) - return 0; + return false; if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT)) - return 0; + return false; return should_fail(&fail_page_alloc.attr, 1 << order); } @@ -1578,9 +1578,9 @@ late_initcall(fail_page_alloc_debugfs); #else /* CONFIG_FAIL_PAGE_ALLOC */ -static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) +static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) { - return 0; + return false; } #endif /* CONFIG_FAIL_PAGE_ALLOC */ -- cgit v0.10.2 From 3965c9ae47d64aadf6f13b6fcd37767b83c0689a Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:41:52 -0700 Subject: mm: prepare for removal of obsolete /proc/sys/vm/nr_pdflush_threads Since per-BDI flusher threads were introduced in 2.6, the pdflush mechanism is not used any more. But the old interface exported through /proc/sys/vm/nr_pdflush_threads still exists and is obviously useless. For back-compatibility, printk warning information and return 2 to notify the users that the interface is removed. Signed-off-by: Wanpeng Li Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads new file mode 100644 index 0000000..b0b0eeb --- /dev/null +++ b/Documentation/ABI/obsolete/proc-sys-vm-nr_pdflush_threads @@ -0,0 +1,5 @@ +What: /proc/sys/vm/nr_pdflush_threads +Date: June 2012 +Contact: Wanpeng Li +Description: Since pdflush is replaced by per-BDI flusher, the interface of old pdflush + exported in /proc/sys/vm/ should be removed. 
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index e9237fb..88f2fa4 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -13,6 +13,14 @@ Who: Jim Cromie , Jason Baron --------------------------- +What: /proc/sys/vm/nr_pdflush_threads +When: 2012 +Why: Since pdflush is deprecated, the interface exported in /proc/sys/vm/ + should be removed. +Who: Wanpeng Li + +--------------------------- + What: CONFIG_APM_CPU_IDLE, and its ability to call APM BIOS in idle When: 2012 Why: This optional sub-feature of APM is of dubious reliability, diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 84eb25c..06d662b 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -42,7 +42,6 @@ Currently, these files are in /proc/sys/vm: - mmap_min_addr - nr_hugepages - nr_overcommit_hugepages -- nr_pdflush_threads - nr_trim_pages (only if CONFIG_MMU=n) - numa_zonelist_order - oom_dump_tasks @@ -426,16 +425,6 @@ See Documentation/vm/hugetlbpage.txt ============================================================== -nr_pdflush_threads - -The current number of pdflush threads. This value is read-only. -The value changes according to the number of dirty pages in the system. - -When necessary, additional pdflush threads are created, one per second, up to -nr_pdflush_threads_max. - -============================================================== - nr_trim_pages This is available only on NOMMU kernels. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 50d0b78..be3efc4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -52,11 +52,6 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; -/* - * We don't actually have pdflush, but this one is exported though /proc... - */ -int nr_pdflush_threads; - /** * writeback_in_progress - determine whether there is writeback in progress * @bdi: the device's backing_dev_info structure. diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 489de62..c97c6b9 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -17,6 +17,7 @@ #include #include #include +#include struct page; struct device; @@ -304,6 +305,8 @@ void clear_bdi_congested(struct backing_dev_info *bdi, int sync); void set_bdi_congested(struct backing_dev_info *bdi, int sync); long congestion_wait(int sync, long timeout); long wait_iff_congested(struct zone *zone, int sync, long timeout); +int pdflush_proc_obsolete(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); static inline bool bdi_cap_writeback_dirty(struct backing_dev_info *bdi) { diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 6d0a0fc..c66fe33 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -189,9 +189,4 @@ void tag_pages_for_writeback(struct address_space *mapping, void account_page_redirty(struct page *page); -/* pdflush.c */ -extern int nr_pdflush_threads; /* Global so it can be exported to sysctl - read-only. 
*/ - - #endif /* WRITEBACK_H */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 97186b9..6502d35 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1101,11 +1101,9 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { - .procname = "nr_pdflush_threads", - .data = &nr_pdflush_threads, - .maxlen = sizeof nr_pdflush_threads, - .mode = 0444 /* read-only*/, - .proc_handler = proc_dointvec, + .procname = "nr_pdflush_threads", + .mode = 0444 /* read-only */, + .proc_handler = pdflush_proc_obsolete, }, { .procname = "swappiness", diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index a650694..65bdcf1 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -147,7 +147,7 @@ static const struct bin_table bin_vm_table[] = { { CTL_INT, VM_DIRTY_RATIO, "dirty_ratio" }, /* VM_DIRTY_WB_CS "dirty_writeback_centisecs" no longer used */ /* VM_DIRTY_EXPIRE_CS "dirty_expire_centisecs" no longer used */ - { CTL_INT, VM_NR_PDFLUSH_THREADS, "nr_pdflush_threads" }, + /* VM_NR_PDFLUSH_THREADS "nr_pdflush_threads" no longer used */ { CTL_INT, VM_OVERCOMMIT_RATIO, "overcommit_ratio" }, /* VM_PAGEBUF unused */ /* VM_HUGETLB_PAGES "nr_hugepages" no longer used */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 3387aea..6b4718e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -886,3 +886,23 @@ out: return ret; } EXPORT_SYMBOL(wait_iff_congested); + +int pdflush_proc_obsolete(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + char kbuf[] = "0\n"; + + if (*ppos) { + *lenp = 0; + return 0; + } + + if (copy_to_user(buffer, kbuf, sizeof(kbuf))) + return -EFAULT; + printk_once(KERN_WARNING "%s exported in /proc is scheduled for removal\n", + table->procname); + + *lenp = 2; + *ppos += *lenp; + return 2; +} -- cgit v0.10.2 From 47d38344abd0c7c6793b59ac741aa5b205fc197c Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:41:54 -0700 Subject: hugetlb: rename max_hstate to hugetlb_max_hstate This patchset implements a cgroup resource controller for HugeTLB pages. The controller allows to limit the HugeTLB usage per control group and enforces the controller limit during page fault. Since HugeTLB doesn't support page reclaim, enforcing the limit at page fault time implies that, the application will get SIGBUS signal if it tries to access HugeTLB pages beyond its limit. This requires the application to know beforehand how much HugeTLB pages it would require for its use. The goal is to control how many HugeTLB pages a group of task can allocate. It can be looked at as an extension of the existing quota interface which limits the number of HugeTLB pages per hugetlbfs superblock. HPC job scheduler requires jobs to specify their resource requirements in the job file. Once their requirements can be met, job schedulers like (SLURM) will schedule the job. We need to make sure that the jobs won't consume more resources than requested. If they do we should either error out or kill the application. This patch: Rename max_hstate to hugetlb_max_hstate. We will be using this from other subsystems like hugetlb controller in later patches. 
Signed-off-by: Aneesh Kumar K.V Acked-by: David Rientjes Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hillf Danton Acked-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e198831..c868309 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,7 +34,7 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int max_hstate; +static int hugetlb_max_hstate; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -46,7 +46,7 @@ static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; #define for_each_hstate(h) \ - for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) + for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages @@ -1897,9 +1897,9 @@ void __init hugetlb_add_hstate(unsigned order) printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); return; } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE); BUG_ON(order == 0); - h = &hstates[max_hstate++]; + h = &hstates[hugetlb_max_hstate++]; h->order = order; h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); h->nr_huge_pages = 0; @@ -1920,10 +1920,10 @@ static int __init hugetlb_nrpages_setup(char *s) static unsigned long *last_mhp; /* - * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet, * so this hugepages= parameter goes to the "default hstate". */ - if (!max_hstate) + if (!hugetlb_max_hstate) mhp = &default_hstate_max_huge_pages; else mhp = &parsed_hstate->max_huge_pages; @@ -1942,7 +1942,7 @@ static int __init hugetlb_nrpages_setup(char *s) * But we need to allocate >= MAX_ORDER hstates here early to still * use the bootmem allocator. */ - if (max_hstate && parsed_hstate->order >= MAX_ORDER) + if (hugetlb_max_hstate && parsed_hstate->order >= MAX_ORDER) hugetlb_hstate_alloc_pages(parsed_hstate); last_mhp = mhp; -- cgit v0.10.2 From 76dcee75c1aff61259f5ed55e2bcfab60cc4bd5f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:41:57 -0700 Subject: hugetlb: don't use ERR_PTR with VM_FAULT* values The current use of VM_FAULT_* codes with ERR_PTR requires us to ensure VM_FAULT_* values will not exceed MAX_ERRNO value. Decouple the VM_FAULT_* values from MAX_ERRNO. 
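The fragility being removed is easier to see in a standalone sketch: IS_ERR()/ERR_PTR() can only encode error values up to MAX_ERRNO, so stuffing -VM_FAULT_* codes into ERR_PTR() works only as long as every VM_FAULT_* value happens to stay below that bound. Below is a minimal userspace sketch of the new convention, with simplified stand-ins for the err.h helpers and illustrative fault values; sketch_alloc_huge_page() is a made-up name, not the kernel function:

#include <errno.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's ERR_PTR()/PTR_ERR()/IS_ERR() helpers. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
    return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Illustrative fault codes; the real VM_FAULT_* flags are defined elsewhere. */
#define VM_FAULT_OOM    0x0001
#define VM_FAULT_SIGBUS 0x0002

/* Made-up allocator: it fails with plain errnos, as alloc_huge_page() now does. */
static void *sketch_alloc_huge_page(int fail_mode)
{
    static char page[4096];

    if (fail_mode == 1)
        return ERR_PTR(-ENOMEM);   /* no memory: caller maps this to VM_FAULT_OOM */
    if (fail_mode == 2)
        return ERR_PTR(-ENOSPC);   /* no reservation: caller maps to VM_FAULT_SIGBUS */
    return page;
}

int main(void)
{
    void *page = sketch_alloc_huge_page(2);

    if (IS_ERR(page)) {
        long err = PTR_ERR(page);
        int fault = (err == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;

        printf("fault handler would return %#x\n", fault);
    }
    return 0;
}

This is the same errno-to-fault-code mapping the hunks below add at the alloc_huge_page() call sites in hugetlb_cow() and hugetlb_no_page().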
Signed-off-by: Aneesh Kumar K.V Acked-by: Hillf Danton Acked-by: KOSAKI Motohiro Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c868309..34a7e23 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1123,10 +1123,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, */ chg = vma_needs_reservation(h, vma, addr); if (chg < 0) - return ERR_PTR(-VM_FAULT_OOM); + return ERR_PTR(-ENOMEM); if (chg) if (hugepage_subpool_get_pages(spool, chg)) - return ERR_PTR(-VM_FAULT_SIGBUS); + return ERR_PTR(-ENOSPC); spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); @@ -1136,7 +1136,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugepage_subpool_put_pages(spool, chg); - return ERR_PTR(-VM_FAULT_SIGBUS); + return ERR_PTR(-ENOSPC); } } @@ -2496,6 +2496,7 @@ retry_avoidcopy: new_page = alloc_huge_page(vma, address, outside_reserve); if (IS_ERR(new_page)) { + long err = PTR_ERR(new_page); page_cache_release(old_page); /* @@ -2524,7 +2525,10 @@ retry_avoidcopy: /* Caller expects lock to be held */ spin_lock(&mm->page_table_lock); - return -PTR_ERR(new_page); + if (err == -ENOMEM) + return VM_FAULT_OOM; + else + return VM_FAULT_SIGBUS; } /* @@ -2642,7 +2646,11 @@ retry: goto out; page = alloc_huge_page(vma, address, 0); if (IS_ERR(page)) { - ret = -PTR_ERR(page); + ret = PTR_ERR(page); + if (ret == -ENOMEM) + ret = VM_FAULT_OOM; + else + ret = VM_FAULT_SIGBUS; goto out; } clear_huge_page(page, address, pages_per_huge_page(h)); -- cgit v0.10.2 From 972dc4de13f667a7df27ee32573b2e6fc6cc8434 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:00 -0700 Subject: hugetlb: add an inline helper for finding hstate index Add an inline helper and use it in the code. 
Signed-off-by: Aneesh Kumar K.V Acked-by: David Rientjes Acked-by: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d5d6bbe..217f528 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -302,6 +302,11 @@ static inline unsigned hstate_index_to_shift(unsigned index) return hstates[index].order + PAGE_SHIFT; } +static inline int hstate_index(struct hstate *h) +{ + return h - hstates; +} + #else struct hstate {}; #define alloc_huge_page_node(h, nid) NULL @@ -320,6 +325,7 @@ static inline unsigned int pages_per_huge_page(struct hstate *h) return 1; } #define hstate_index_to_shift(index) 0 +#define hstate_index(h) 0 #endif #endif /* _LINUX_HUGETLB_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 34a7e23..b1e0ed1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1646,7 +1646,7 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, struct attribute_group *hstate_attr_group) { int retval; - int hi = h - hstates; + int hi = hstate_index(h); hstate_kobjs[hi] = kobject_create_and_add(h->name, parent); if (!hstate_kobjs[hi]) @@ -1741,11 +1741,13 @@ void hugetlb_unregister_node(struct node *node) if (!nhs->hugepages_kobj) return; /* no hstate attributes */ - for_each_hstate(h) - if (nhs->hstate_kobjs[h - hstates]) { - kobject_put(nhs->hstate_kobjs[h - hstates]); - nhs->hstate_kobjs[h - hstates] = NULL; + for_each_hstate(h) { + int idx = hstate_index(h); + if (nhs->hstate_kobjs[idx]) { + kobject_put(nhs->hstate_kobjs[idx]); + nhs->hstate_kobjs[idx] = NULL; } + } kobject_put(nhs->hugepages_kobj); nhs->hugepages_kobj = NULL; @@ -1848,7 +1850,7 @@ static void __exit hugetlb_exit(void) hugetlb_unregister_all_nodes(); for_each_hstate(h) { - kobject_put(hstate_kobjs[h - hstates]); + kobject_put(hstate_kobjs[hstate_index(h)]); } kobject_put(hugepages_kobj); @@ -1869,7 +1871,7 @@ static int __init hugetlb_init(void) if (!size_to_hstate(default_hstate_size)) hugetlb_add_hstate(HUGETLB_PAGE_ORDER); } - default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; + default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size)); if (default_hstate_max_huge_pages) default_hstate.max_huge_pages = default_hstate_max_huge_pages; @@ -2687,7 +2689,7 @@ retry: */ if (unlikely(PageHWPoison(page))) { ret = VM_FAULT_HWPOISON | - VM_FAULT_SET_HINDEX(h - hstates); + VM_FAULT_SET_HINDEX(hstate_index(h)); goto backout_unlocked; } } @@ -2760,7 +2762,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | - VM_FAULT_SET_HINDEX(h - hstates); + VM_FAULT_SET_HINDEX(hstate_index(h)); } ptep = huge_pte_alloc(mm, address, huge_page_size(h)); -- cgit v0.10.2 From 24669e58477e2752c1fbca9c1c988e9dd0d79d15 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:03 -0700 Subject: hugetlb: use mmu_gather instead of a temporary linked list for accumulating pages Use a mmu_gather instead of a temporary linked list for accumulating pages when we unmap a hugepage range Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e13e9bd..8349a89 100644 --- a/fs/hugetlbfs/inode.c +++ 
b/fs/hugetlbfs/inode.c @@ -416,8 +416,8 @@ hugetlb_vmtruncate_list(struct prio_tree_root *root, pgoff_t pgoff) else v_offset = 0; - __unmap_hugepage_range(vma, - vma->vm_start + v_offset, vma->vm_end, NULL); + unmap_hugepage_range(vma, vma->vm_start + v_offset, + vma->vm_end, NULL); } } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 217f528..0f23c18 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -7,6 +7,7 @@ struct ctl_table; struct user_struct; +struct mmu_gather; #ifdef CONFIG_HUGETLB_PAGE @@ -40,9 +41,10 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int, unsigned int flags); void unmap_hugepage_range(struct vm_area_struct *, - unsigned long, unsigned long, struct page *); -void __unmap_hugepage_range(struct vm_area_struct *, - unsigned long, unsigned long, struct page *); + unsigned long, unsigned long, struct page *); +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page); int hugetlb_prefault(struct address_space *, struct vm_area_struct *); void hugetlb_report_meminfo(struct seq_file *); int hugetlb_report_node_meminfo(int, char *); @@ -98,7 +100,6 @@ static inline unsigned long hugetlb_total_pages(void) #define follow_huge_addr(mm, addr, write) ERR_PTR(-EINVAL) #define copy_hugetlb_page_range(src, dst, vma) ({ BUG(); 0; }) #define hugetlb_prefault(mapping, vma) ({ BUG(); 0; }) -#define unmap_hugepage_range(vma, start, end, page) BUG() static inline void hugetlb_report_meminfo(struct seq_file *m) { } @@ -112,13 +113,24 @@ static inline void hugetlb_report_meminfo(struct seq_file *m) #define hugetlb_free_pgd_range(tlb, addr, end, floor, ceiling) ({BUG(); 0; }) #define hugetlb_fault(mm, vma, addr, flags) ({ BUG(); 0; }) #define huge_pte_offset(mm, address) 0 -#define dequeue_hwpoisoned_huge_page(page) 0 +static inline int dequeue_hwpoisoned_huge_page(struct page *page) +{ + return 0; +} + static inline void copy_huge_page(struct page *dst, struct page *src) { } #define hugetlb_change_protection(vma, address, end, newprot) +static inline void __unmap_hugepage_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + BUG(); +} + #endif /* !CONFIG_HUGETLB_PAGE */ #define HUGETLB_ANON_FILE "anon_hugepage" diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b1e0ed1..e54b695 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,8 +24,9 @@ #include #include -#include +#include +#include #include #include #include "internal.h" @@ -2310,30 +2311,26 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) return 0; } -void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, - unsigned long end, struct page *ref_page) +void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page) { + int force_flush = 0; struct mm_struct *mm = vma->vm_mm; unsigned long address; pte_t *ptep; pte_t pte; struct page *page; - struct page *tmp; struct hstate *h = hstate_vma(vma); unsigned long sz = huge_page_size(h); - /* - * A page gathering list, protected by per file i_mmap_mutex. The - * lock is used to avoid list corruption from multiple unmapping - * of the same page since we are using page->lru. 
- */ - LIST_HEAD(page_list); - WARN_ON(!is_vm_hugetlb_page(vma)); BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + tlb_start_vma(tlb, vma); mmu_notifier_invalidate_range_start(mm, start, end); +again: spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); @@ -2372,30 +2369,45 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } pte = huge_ptep_get_and_clear(mm, address, ptep); + tlb_remove_tlb_entry(tlb, ptep, address); if (pte_dirty(pte)) set_page_dirty(page); - list_add(&page->lru, &page_list); + page_remove_rmap(page); + force_flush = !__tlb_remove_page(tlb, page); + if (force_flush) + break; /* Bail out after unmapping reference page if supplied */ if (ref_page) break; } - flush_tlb_range(vma, start, end); spin_unlock(&mm->page_table_lock); - mmu_notifier_invalidate_range_end(mm, start, end); - list_for_each_entry_safe(page, tmp, &page_list, lru) { - page_remove_rmap(page); - list_del(&page->lru); - put_page(page); + /* + * mmu_gather ran out of room to batch pages, we break out of + * the PTE lock to avoid doing the potential expensive TLB invalidate + * and page-free while holding it. + */ + if (force_flush) { + force_flush = 0; + tlb_flush_mmu(tlb); + if (address < end && !ref_page) + goto again; } + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_end_vma(tlb, vma); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - __unmap_hugepage_range(vma, start, end, ref_page); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + struct mm_struct *mm; + struct mmu_gather tlb; + + mm = vma->vm_mm; + + tlb_gather_mmu(&tlb, mm, 0); + __unmap_hugepage_range(&tlb, vma, start, end, ref_page); + tlb_finish_mmu(&tlb, start, end); } /* @@ -2440,9 +2452,8 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, * from the time of fork. This would look like data corruption */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) - __unmap_hugepage_range(iter_vma, - address, address + huge_page_size(h), - page); + unmap_hugepage_range(iter_vma, address, + address + huge_page_size(h), page); } mutex_unlock(&mapping->i_mmap_mutex); diff --git a/mm/memory.c b/mm/memory.c index 91f6945..59e5beb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1343,8 +1343,11 @@ static void unmap_single_vma(struct mmu_gather *tlb, * Since no pte has actually been setup, it is * safe to do nothing in this case. */ - if (vma->vm_file) - unmap_hugepage_range(vma, start, end, NULL); + if (vma->vm_file) { + mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); + __unmap_hugepage_range(tlb, vma, start, end, NULL); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); + } } else unmap_page_range(tlb, vma, start, end, details); } -- cgit v0.10.2 From 189ebff2894a9d0f4e250dd1e154d282ef0a6779 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:06 -0700 Subject: hugetlb: simplify migrate_huge_page() Since we migrate only one hugepage, don't use linked list for passing the page around. Directly pass the page that need to be migrated as argument. This also removes the usage of page->lru in the migrate path. 
Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 855c337..ce7e667 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -15,7 +15,7 @@ extern int migrate_page(struct address_space *, extern int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); -extern int migrate_huge_pages(struct list_head *l, new_page_t x, +extern int migrate_huge_page(struct page *, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode); @@ -36,7 +36,7 @@ static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } -static inline int migrate_huge_pages(struct list_head *l, new_page_t x, +static inline int migrate_huge_page(struct page *page, new_page_t x, unsigned long private, bool offlining, enum migrate_mode mode) { return -ENOSYS; } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6de0d61..b04ff2d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1416,7 +1416,6 @@ static int soft_offline_huge_page(struct page *page, int flags) int ret; unsigned long pfn = page_to_pfn(page); struct page *hpage = compound_head(page); - LIST_HEAD(pagelist); ret = get_any_page(page, pfn, flags); if (ret < 0) @@ -1431,24 +1430,18 @@ static int soft_offline_huge_page(struct page *page, int flags) } /* Keep page count to indicate a given hugepage is isolated. */ - - list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, false, + ret = migrate_huge_page(hpage, new_page, MPOL_MF_MOVE_ALL, false, MIGRATE_SYNC); + put_page(hpage); if (ret) { - struct page *page1, *page2; - list_for_each_entry_safe(page1, page2, &pagelist, lru) - put_page(page1); - pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); - if (ret > 0) - ret = -EIO; return ret; } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), + &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ diff --git a/mm/migrate.c b/mm/migrate.c index be26d5c..fdce3a2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -932,15 +932,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); unlock_page(hpage); - out: - if (rc != -EAGAIN) { - list_del(&hpage->lru); - put_page(hpage); - } - put_page(new_hpage); - if (result) { if (rc) *result = rc; @@ -1016,48 +1009,32 @@ out: return nr_failed + retry; } -int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, bool offlining, - enum migrate_mode mode) +int migrate_huge_page(struct page *hpage, new_page_t get_new_page, + unsigned long private, bool offlining, + enum migrate_mode mode) { - int retry = 1; - int nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int rc; - - for (pass = 0; pass < 10 && retry; pass++) { - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { + int pass, rc; + + for (pass = 0; pass < 10; pass++) { + rc = unmap_and_move_huge_page(get_new_page, + private, hpage, pass 
> 2, offlining, + mode); + switch (rc) { + case -ENOMEM: + goto out; + case -EAGAIN: + /* try again */ cond_resched(); - - rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining, - mode); - - switch(rc) { - case -ENOMEM: - goto out; - case -EAGAIN: - retry++; - break; - case 0: - break; - default: - /* Permanent failure */ - nr_failed++; - break; - } + break; + case 0: + goto out; + default: + rc = -EIO; + goto out; } } - rc = 0; out: - if (rc) - return rc; - - return nr_failed + retry; + return rc; } #ifdef CONFIG_NUMA -- cgit v0.10.2 From 0edaecfab218d747d30de4575e911907371e2cd2 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:07 -0700 Subject: hugetlb: add a list for tracking in-use HugeTLB pages hugepage_activelist will be used to track currently used HugeTLB pages. We need to find the in-use HugeTLB pages to support HugeTLB cgroup removal. On cgroup removal we update the page's HugeTLB cgroup to point to parent cgroup. Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 0f23c18..ed550d8 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -211,6 +211,7 @@ struct hstate { unsigned long resv_huge_pages; unsigned long surplus_huge_pages; unsigned long nr_overcommit_huge_pages; + struct list_head hugepage_activelist; struct list_head hugepage_freelists[MAX_NUMNODES]; unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index e54b695..b5b6e15 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -510,7 +510,7 @@ void copy_huge_page(struct page *dst, struct page *src) static void enqueue_huge_page(struct hstate *h, struct page *page) { int nid = page_to_nid(page); - list_add(&page->lru, &h->hugepage_freelists[nid]); + list_move(&page->lru, &h->hugepage_freelists[nid]); h->free_huge_pages++; h->free_huge_pages_node[nid]++; } @@ -522,7 +522,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) if (list_empty(&h->hugepage_freelists[nid])) return NULL; page = list_entry(h->hugepage_freelists[nid].next, struct page, lru); - list_del(&page->lru); + list_move(&page->lru, &h->hugepage_activelist); set_page_refcounted(page); h->free_huge_pages--; h->free_huge_pages_node[nid]--; @@ -626,10 +626,11 @@ static void free_huge_page(struct page *page) page->mapping = NULL; BUG_ON(page_count(page)); BUG_ON(page_mapcount(page)); - INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + /* remove the page from active list */ + list_del(&page->lru); update_and_free_page(h, page); h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; @@ -642,6 +643,7 @@ static void free_huge_page(struct page *page) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { + INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); h->nr_huge_pages++; @@ -890,6 +892,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) spin_lock(&hugetlb_lock); if (page) { + INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); /* @@ -994,7 +997,6 @@ retry: list_for_each_entry_safe(page, tmp, &surplus_list, lru) { if ((--needed) < 0) break; - 
list_del(&page->lru); /* * This page is now managed by the hugetlb allocator and has * no users -- drop the buddy allocator's reference. @@ -1009,7 +1011,6 @@ free: /* Free unnecessary surplus pages to the buddy allocator */ if (!list_empty(&surplus_list)) { list_for_each_entry_safe(page, tmp, &surplus_list, lru) { - list_del(&page->lru); put_page(page); } } @@ -1909,6 +1910,7 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); + INIT_LIST_HEAD(&h->hugepage_activelist); h->next_nid_to_alloc = first_node(node_states[N_HIGH_MEMORY]); h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", -- cgit v0.10.2 From c3f38a38715e9b4e7b1dda6840a1375f6894744d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:10 -0700 Subject: hugetlb: make some static variables global We will use them later in hugetlb_cgroup.c Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index ed550d8..3d677dd 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -21,6 +21,11 @@ struct hugepage_subpool { long max_hpages, used_hpages; }; +extern spinlock_t hugetlb_lock; +extern int hugetlb_max_hstate __read_mostly; +#define for_each_hstate(h) \ + for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) + struct hugepage_subpool *hugepage_new_subpool(long nr_blocks); void hugepage_put_subpool(struct hugepage_subpool *spool); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b5b6e15..d597159 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -35,7 +35,7 @@ const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; static gfp_t htlb_alloc_mask = GFP_HIGHUSER; unsigned long hugepages_treat_as_movable; -static int hugetlb_max_hstate; +int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -46,13 +46,10 @@ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; static unsigned long __initdata default_hstate_size; -#define for_each_hstate(h) \ - for ((h) = hstates; (h) < &hstates[hugetlb_max_hstate]; (h)++) - /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ -static DEFINE_SPINLOCK(hugetlb_lock); +DEFINE_SPINLOCK(hugetlb_lock); static inline void unlock_or_release_subpool(struct hugepage_subpool *spool) { -- cgit v0.10.2 From 2bc64a2046975410505bb119bba32705892b9255 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:12 -0700 Subject: mm/hugetlb: add new HugeTLB cgroup Implement a new controller that allows us to control HugeTLB allocations. The extension allows to limit the HugeTLB usage per control group and enforces the controller limit during page fault. Since HugeTLB doesn't support page reclaim, enforcing the limit at page fault time implies that, the application will get SIGBUS signal if it tries to access HugeTLB pages beyond its limit. This requires the application to know beforehand how much HugeTLB pages it would require for its use. The charge/uncharge calls will be added to HugeTLB code in later patch. Support for cgroup removal will be added in later patches. 
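A rough userspace sketch of the charge-at-fault model described above, using a hypothetical hierarchical counter rather than the kernel's res_counter API: a charge must fit under the limit of the group and of every ancestor, and since HugeTLB pages cannot be reclaimed, an over-limit charge simply fails (the faulting task would get SIGBUS):

#include <stdio.h>

/* Hypothetical hierarchical counter, standing in for the kernel's res_counter. */
struct hpage_counter {
    long usage;
    long limit;
    struct hpage_counter *parent;
};

/* Charge nr huge pages against a group and all of its ancestors, or fail. */
static int hpage_charge(struct hpage_counter *grp, long nr)
{
    struct hpage_counter *it;

    for (it = grp; it; it = it->parent)
        if (it->usage + nr > it->limit)
            return -1;   /* over limit: the page fault would end in SIGBUS */
    for (it = grp; it; it = it->parent)
        it->usage += nr;
    return 0;
}

static void hpage_uncharge(struct hpage_counter *grp, long nr)
{
    for (; grp; grp = grp->parent)
        grp->usage -= nr;
}

int main(void)
{
    struct hpage_counter root  = { 0, 1024, NULL };
    struct hpage_counter child = { 0, 8, &root };

    printf("charge 8 -> %d\n", hpage_charge(&child, 8));  /* fits: returns 0 */
    printf("charge 1 -> %d\n", hpage_charge(&child, 1));  /* child over limit: -1 */
    hpage_uncharge(&child, 8);
    return 0;
}

The actual controller below keeps one res_counter per hstate in struct hugetlb_cgroup and relies on res_counter_charge()/res_counter_uncharge() for the locking and rollback this sketch omits.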
[akpm@linux-foundation.org: s/CONFIG_CGROUP_HUGETLB_RES_CTLR/CONFIG_MEMCG_HUGETLB/g] [akpm@linux-foundation.org: s/CONFIG_MEMCG_HUGETLB/CONFIG_CGROUP_HUGETLB/g] Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 0bd390c..5b41ce0 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -72,3 +72,9 @@ SUBSYS(net_prio) #endif /* */ + +#ifdef CONFIG_CGROUP_HUGETLB +SUBSYS(hugetlb) +#endif + +/* */ diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h new file mode 100644 index 0000000..f19889e --- /dev/null +++ b/include/linux/hugetlb_cgroup.h @@ -0,0 +1,37 @@ +/* + * Copyright IBM Corporation, 2012 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#ifndef _LINUX_HUGETLB_CGROUP_H +#define _LINUX_HUGETLB_CGROUP_H + +#include + +struct hugetlb_cgroup; + +#ifdef CONFIG_CGROUP_HUGETLB +static inline bool hugetlb_cgroup_disabled(void) +{ + if (hugetlb_subsys.disabled) + return true; + return false; +} + +#else +static inline bool hugetlb_cgroup_disabled(void) +{ + return true; +} + +#endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ +#endif diff --git a/init/Kconfig b/init/Kconfig index b3f55f1..7243776 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -751,6 +751,21 @@ config CGROUP_MEM_RES_CTLR_KMEM the kmem extension can use it to guarantee that no group of processes will ever exhaust kernel resources alone. +config CGROUP_HUGETLB + bool "HugeTLB Resource Controller for Control Groups" + depends on RESOURCE_COUNTERS && HUGETLB_PAGE && EXPERIMENTAL + default n + help + Provides a cgroup Resource Controller for HugeTLB pages. + When you enable this, you can put a per cgroup limit on HugeTLB usage. + The limit is enforced during page fault. Since HugeTLB doesn't + support page reclaim, enforcing the limit at page fault time implies + that, the application will get SIGBUS signal if it tries to access + HugeTLB pages beyond its limit. This requires the application to know + beforehand how much HugeTLB pages it would require for its use. The + control group is tracked in the third page lru pointer. This means + that we cannot use the controller with huge page less than 3 pages. 
+ config CGROUP_PERF bool "Enable perf_event per-cpu per-container group (cgroup) monitoring" depends on PERF_EVENTS && CGROUPS diff --git a/mm/Makefile b/mm/Makefile index 8e81fe2..fd6fc1c 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -50,6 +50,7 @@ obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c new file mode 100644 index 0000000..0d1a66e --- /dev/null +++ b/mm/hugetlb_cgroup.c @@ -0,0 +1,120 @@ +/* + * + * Copyright IBM Corporation, 2012 + * Author Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of version 2.1 of the GNU Lesser General Public License + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include +#include + +struct hugetlb_cgroup { + struct cgroup_subsys_state css; + /* + * the counter to account for hugepages from hugetlb. + */ + struct res_counter hugepage[HUGE_MAX_HSTATE]; +}; + +struct cgroup_subsys hugetlb_subsys __read_mostly; +static struct hugetlb_cgroup *root_h_cgroup __read_mostly; + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_css(struct cgroup_subsys_state *s) +{ + return container_of(s, struct hugetlb_cgroup, css); +} + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_cgroup(struct cgroup *cgroup) +{ + return hugetlb_cgroup_from_css(cgroup_subsys_state(cgroup, + hugetlb_subsys_id)); +} + +static inline +struct hugetlb_cgroup *hugetlb_cgroup_from_task(struct task_struct *task) +{ + return hugetlb_cgroup_from_css(task_subsys_state(task, + hugetlb_subsys_id)); +} + +static inline bool hugetlb_cgroup_is_root(struct hugetlb_cgroup *h_cg) +{ + return (h_cg == root_h_cgroup); +} + +static inline struct hugetlb_cgroup *parent_hugetlb_cgroup(struct cgroup *cg) +{ + if (!cg->parent) + return NULL; + return hugetlb_cgroup_from_cgroup(cg->parent); +} + +static inline bool hugetlb_cgroup_have_usage(struct cgroup *cg) +{ + int idx; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cg); + + for (idx = 0; idx < hugetlb_max_hstate; idx++) { + if ((res_counter_read_u64(&h_cg->hugepage[idx], RES_USAGE)) > 0) + return true; + } + return false; +} + +static struct cgroup_subsys_state *hugetlb_cgroup_create(struct cgroup *cgroup) +{ + int idx; + struct cgroup *parent_cgroup; + struct hugetlb_cgroup *h_cgroup, *parent_h_cgroup; + + h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); + if (!h_cgroup) + return ERR_PTR(-ENOMEM); + + parent_cgroup = cgroup->parent; + if (parent_cgroup) { + parent_h_cgroup = hugetlb_cgroup_from_cgroup(parent_cgroup); + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) + res_counter_init(&h_cgroup->hugepage[idx], + &parent_h_cgroup->hugepage[idx]); + } else { + root_h_cgroup = h_cgroup; + for (idx = 0; idx < HUGE_MAX_HSTATE; idx++) + res_counter_init(&h_cgroup->hugepage[idx], NULL); + } + return &h_cgroup->css; +} + +static void hugetlb_cgroup_destroy(struct cgroup *cgroup) +{ + struct hugetlb_cgroup *h_cgroup; + + h_cgroup = 
hugetlb_cgroup_from_cgroup(cgroup); + kfree(h_cgroup); +} + +static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) +{ + /* We will add the cgroup removal support in later patches */ + return -EBUSY; +} + +struct cgroup_subsys hugetlb_subsys = { + .name = "hugetlb", + .create = hugetlb_cgroup_create, + .pre_destroy = hugetlb_cgroup_pre_destroy, + .destroy = hugetlb_cgroup_destroy, + .subsys_id = hugetlb_subsys_id, +}; -- cgit v0.10.2 From 9dd540e23111d8884773ab942a736f3aba4040d4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:15 -0700 Subject: hugetlb/cgroup: add the cgroup pointer to page lru Add the hugetlb cgroup pointer to 3rd page lru.next. This limit the usage to hugetlb cgroup to only hugepages with 3 or more normal pages. I guess that is an acceptable limitation. Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index f19889e..e5451a3 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -18,8 +18,34 @@ #include struct hugetlb_cgroup; +/* + * Minimum page order trackable by hugetlb cgroup. + * At least 3 pages are necessary for all the tracking information. + */ +#define HUGETLB_CGROUP_MIN_ORDER 2 #ifdef CONFIG_CGROUP_HUGETLB + +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +{ + VM_BUG_ON(!PageHuge(page)); + + if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + return NULL; + return (struct hugetlb_cgroup *)page[2].lru.next; +} + +static inline +int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) +{ + VM_BUG_ON(!PageHuge(page)); + + if (compound_order(page) < HUGETLB_CGROUP_MIN_ORDER) + return -1; + page[2].lru.next = (void *)h_cg; + return 0; +} + static inline bool hugetlb_cgroup_disabled(void) { if (hugetlb_subsys.disabled) @@ -28,6 +54,17 @@ static inline bool hugetlb_cgroup_disabled(void) } #else +static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) +{ + return NULL; +} + +static inline +int set_hugetlb_cgroup(struct page *page, struct hugetlb_cgroup *h_cg) +{ + return 0; +} + static inline bool hugetlb_cgroup_disabled(void) { return true; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d597159..efe29b5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -28,6 +28,7 @@ #include #include +#include #include #include "internal.h" @@ -591,6 +592,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1 << PG_writeback); } + VM_BUG_ON(hugetlb_cgroup_from_page(page)); set_compound_page_dtor(page, NULL); set_page_refcounted(page); arch_release_hugepage(page); @@ -643,6 +645,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) INIT_LIST_HEAD(&page->lru); set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); + set_hugetlb_cgroup(page, NULL); h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); @@ -892,6 +895,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); set_compound_page_dtor(page, free_huge_page); + set_hugetlb_cgroup(page, NULL); /* * We incremented the global counters already */ -- cgit v0.10.2 From 6d76dcf40405144a448040a350fd214ddc243d5e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" 
Date: Tue, 31 Jul 2012 16:42:18 -0700 Subject: hugetlb/cgroup: add charge/uncharge routines for hugetlb cgroup Add the charge and uncharge routines for hugetlb cgroup. We do cgroup charging in page alloc and uncharge in compound page destructor. Assigning page's hugetlb cgroup is protected by hugetlb_lock. [liwp@linux.vnet.ibm.com: add huge_page_order check to avoid incorrect uncharge] Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Aneesh Kumar K.V Signed-off-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index e5451a3..7d3fde9 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -53,6 +53,16 @@ static inline bool hugetlb_cgroup_disabled(void) return false; } +extern int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr); +extern void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page); +extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page); +extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg); + #else static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) { @@ -70,5 +80,33 @@ static inline bool hugetlb_cgroup_disabled(void) return true; } +static inline int +hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr) +{ + return 0; +} + +static inline void +hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + return; +} + +static inline void +hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page) +{ + return; +} + +static inline void +hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + return; +} + #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index efe29b5..16a0f32 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -627,6 +627,8 @@ static void free_huge_page(struct page *page) BUG_ON(page_mapcount(page)); spin_lock(&hugetlb_lock); + hugetlb_cgroup_uncharge_page(hstate_index(h), + pages_per_huge_page(h), page); if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { /* remove the page from active list */ list_del(&page->lru); @@ -1115,7 +1117,10 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct page *page; long chg; + int ret, idx; + struct hugetlb_cgroup *h_cg; + idx = hstate_index(h); /* * Processes that did not create the mapping will have no * reserves and will not have accounted against subpool @@ -1131,6 +1136,11 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (hugepage_subpool_get_pages(spool, chg)) return ERR_PTR(-ENOSPC); + ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); + if (ret) { + hugepage_subpool_put_pages(spool, chg); + return ERR_PTR(-ENOSPC); + } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); spin_unlock(&hugetlb_lock); @@ -1138,6 +1148,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, if (!page) { page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { + hugetlb_cgroup_uncharge_cgroup(idx, + pages_per_huge_page(h), + h_cg); 
hugepage_subpool_put_pages(spool, chg); return ERR_PTR(-ENOSPC); } @@ -1146,7 +1159,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); vma_commit_reservation(h, vma, addr); - + /* update page cgroup details */ + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); return page; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 0d1a66e..63e04cf 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -111,6 +111,86 @@ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) return -EBUSY; } +int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup **ptr) +{ + int ret = 0; + struct res_counter *fail_res; + struct hugetlb_cgroup *h_cg = NULL; + unsigned long csize = nr_pages * PAGE_SIZE; + + if (hugetlb_cgroup_disabled()) + goto done; + /* + * We don't charge any cgroup if the compound page have less + * than 3 pages. + */ + if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) + goto done; +again: + rcu_read_lock(); + h_cg = hugetlb_cgroup_from_task(current); + if (!css_tryget(&h_cg->css)) { + rcu_read_unlock(); + goto again; + } + rcu_read_unlock(); + + ret = res_counter_charge(&h_cg->hugepage[idx], csize, &fail_res); + css_put(&h_cg->css); +done: + *ptr = h_cg; + return ret; +} + +void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg, + struct page *page) +{ + if (hugetlb_cgroup_disabled() || !h_cg) + return; + + spin_lock(&hugetlb_lock); + set_hugetlb_cgroup(page, h_cg); + spin_unlock(&hugetlb_lock); + return; +} + +/* + * Should be called with hugetlb_lock held + */ +void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, + struct page *page) +{ + struct hugetlb_cgroup *h_cg; + unsigned long csize = nr_pages * PAGE_SIZE; + + if (hugetlb_cgroup_disabled()) + return; + VM_BUG_ON(!spin_is_locked(&hugetlb_lock)); + h_cg = hugetlb_cgroup_from_page(page); + if (unlikely(!h_cg)) + return; + set_hugetlb_cgroup(page, NULL); + res_counter_uncharge(&h_cg->hugepage[idx], csize); + return; +} + +void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, + struct hugetlb_cgroup *h_cg) +{ + unsigned long csize = nr_pages * PAGE_SIZE; + + if (hugetlb_cgroup_disabled() || !h_cg) + return; + + if (huge_page_order(&hstates[idx]) < HUGETLB_CGROUP_MIN_ORDER) + return; + + res_counter_uncharge(&h_cg->hugepage[idx], csize); + return; +} + struct cgroup_subsys hugetlb_subsys = { .name = "hugetlb", .create = hugetlb_cgroup_create, -- cgit v0.10.2 From da1def55919f4852c4759249a78d63a0c5d2d8f9 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:21 -0700 Subject: hugetlb/cgroup: add support for cgroup removal Add support for cgroup removal. If we don't have parent cgroup, the charges are moved to root cgroup. Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 63e04cf..bc518be 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -105,10 +105,76 @@ static void hugetlb_cgroup_destroy(struct cgroup *cgroup) kfree(h_cgroup); } + +/* + * Should be called with hugetlb_lock held. + * Since we are holding hugetlb_lock, pages cannot get moved from + * active list or uncharged from the cgroup, So no need to get + * page reference and test for page active here. 
This function + * cannot fail. + */ +static void hugetlb_cgroup_move_parent(int idx, struct cgroup *cgroup, + struct page *page) +{ + int csize; + struct res_counter *counter; + struct res_counter *fail_res; + struct hugetlb_cgroup *page_hcg; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + struct hugetlb_cgroup *parent = parent_hugetlb_cgroup(cgroup); + + page_hcg = hugetlb_cgroup_from_page(page); + /* + * We can have pages in active list without any cgroup + * ie, hugepage with less than 3 pages. We can safely + * ignore those pages. + */ + if (!page_hcg || page_hcg != h_cg) + goto out; + + csize = PAGE_SIZE << compound_order(page); + if (!parent) { + parent = root_h_cgroup; + /* root has no limit */ + res_counter_charge_nofail(&parent->hugepage[idx], + csize, &fail_res); + } + counter = &h_cg->hugepage[idx]; + res_counter_uncharge_until(counter, counter->parent, csize); + + set_hugetlb_cgroup(page, parent); +out: + return; +} + +/* + * Force the hugetlb cgroup to empty the hugetlb resources by moving them to + * the parent cgroup. + */ static int hugetlb_cgroup_pre_destroy(struct cgroup *cgroup) { - /* We will add the cgroup removal support in later patches */ - return -EBUSY; + struct hstate *h; + struct page *page; + int ret = 0, idx = 0; + + do { + if (cgroup_task_count(cgroup) || + !list_empty(&cgroup->children)) { + ret = -EBUSY; + goto out; + } + for_each_hstate(h) { + spin_lock(&hugetlb_lock); + list_for_each_entry(page, &h->hugepage_activelist, lru) + hugetlb_cgroup_move_parent(idx, cgroup, page); + + spin_unlock(&hugetlb_lock); + idx++; + } + cond_resched(); + } while (hugetlb_cgroup_have_usage(cgroup)); +out: + return ret; } int hugetlb_cgroup_charge_cgroup(int idx, unsigned long nr_pages, -- cgit v0.10.2 From abb8206cb07734d0b7bf033c715995d6371a94c3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:24 -0700 Subject: hugetlb/cgroup: add hugetlb cgroup control files Add the control files for hugetlb controller [akpm@linux-foundation.org: s/CONFIG_CGROUP_HUGETLB_RES_CTLR/CONFIG_MEMCG_HUGETLB/g] [akpm@linux-foundation.org: s/CONFIG_MEMCG_HUGETLB/CONFIG_CGROUP_HUGETLB/] Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 3d677dd..f9db20b 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -4,6 +4,7 @@ #include #include #include +#include struct ctl_table; struct user_struct; @@ -221,6 +222,10 @@ struct hstate { unsigned int nr_huge_pages_node[MAX_NUMNODES]; unsigned int free_huge_pages_node[MAX_NUMNODES]; unsigned int surplus_huge_pages_node[MAX_NUMNODES]; +#ifdef CONFIG_CGROUP_HUGETLB + /* cgroup control files */ + struct cftype cgroup_files[5]; +#endif char name[HSTATE_NAME_LEN]; }; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 7d3fde9..73f1e60 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -62,6 +62,7 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, struct page *page); extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); +extern int hugetlb_cgroup_file_init(int idx) __init; #else static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) @@ -108,5 +109,10 @@ hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, 
return; } +static inline int __init hugetlb_cgroup_file_init(int idx) +{ + return 0; +} + #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 16a0f32..c57740bb2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -30,6 +30,7 @@ #include #include #include +#include #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; @@ -1930,6 +1931,13 @@ void __init hugetlb_add_hstate(unsigned order) h->next_nid_to_free = first_node(node_states[N_HIGH_MEMORY]); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); + /* + * Add cgroup control files only if the huge page consists + * of more than two normal pages. This is because we use + * page[2].lru.next for storing cgoup details. + */ + if (order >= HUGETLB_CGROUP_MIN_ORDER) + hugetlb_cgroup_file_init(hugetlb_max_hstate - 1); parsed_hstate = h; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index bc518be..d1ca119 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -26,6 +26,10 @@ struct hugetlb_cgroup { struct res_counter hugepage[HUGE_MAX_HSTATE]; }; +#define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val)) +#define MEMFILE_IDX(val) (((val) >> 16) & 0xffff) +#define MEMFILE_ATTR(val) ((val) & 0xffff) + struct cgroup_subsys hugetlb_subsys __read_mostly; static struct hugetlb_cgroup *root_h_cgroup __read_mostly; @@ -257,6 +261,131 @@ void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, return; } +static ssize_t hugetlb_cgroup_read(struct cgroup *cgroup, struct cftype *cft, + struct file *file, char __user *buf, + size_t nbytes, loff_t *ppos) +{ + u64 val; + char str[64]; + int idx, name, len; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + + idx = MEMFILE_IDX(cft->private); + name = MEMFILE_ATTR(cft->private); + + val = res_counter_read_u64(&h_cg->hugepage[idx], name); + len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); + return simple_read_from_buffer(buf, nbytes, ppos, str, len); +} + +static int hugetlb_cgroup_write(struct cgroup *cgroup, struct cftype *cft, + const char *buffer) +{ + int idx, name, ret; + unsigned long long val; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + + idx = MEMFILE_IDX(cft->private); + name = MEMFILE_ATTR(cft->private); + + switch (name) { + case RES_LIMIT: + if (hugetlb_cgroup_is_root(h_cg)) { + /* Can't set limit on root */ + ret = -EINVAL; + break; + } + /* This function does all necessary parse...reuse it */ + ret = res_counter_memparse_write_strategy(buffer, &val); + if (ret) + break; + ret = res_counter_set_limit(&h_cg->hugepage[idx], val); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static int hugetlb_cgroup_reset(struct cgroup *cgroup, unsigned int event) +{ + int idx, name, ret = 0; + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_cgroup(cgroup); + + idx = MEMFILE_IDX(event); + name = MEMFILE_ATTR(event); + + switch (name) { + case RES_MAX_USAGE: + res_counter_reset_max(&h_cg->hugepage[idx]); + break; + case RES_FAILCNT: + res_counter_reset_failcnt(&h_cg->hugepage[idx]); + break; + default: + ret = -EINVAL; + break; + } + return ret; +} + +static char *mem_fmt(char *buf, int size, unsigned long hsize) +{ + if (hsize >= (1UL << 30)) + snprintf(buf, size, "%luGB", hsize >> 30); + else if (hsize >= (1UL << 20)) + snprintf(buf, size, "%luMB", hsize >> 20); + else + snprintf(buf, size, "%luKB", hsize >> 10); + return buf; +} + +int __init hugetlb_cgroup_file_init(int idx) +{ + char buf[32]; + struct 
cftype *cft; + struct hstate *h = &hstates[idx]; + + /* format the size */ + mem_fmt(buf, 32, huge_page_size(h)); + + /* Add the limit file */ + cft = &h->cgroup_files[0]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_LIMIT); + cft->read = hugetlb_cgroup_read; + cft->write_string = hugetlb_cgroup_write; + + /* Add the usage file */ + cft = &h->cgroup_files[1]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_USAGE); + cft->read = hugetlb_cgroup_read; + + /* Add the MAX usage file */ + cft = &h->cgroup_files[2]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.max_usage_in_bytes", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_MAX_USAGE); + cft->trigger = hugetlb_cgroup_reset; + cft->read = hugetlb_cgroup_read; + + /* Add the failcntfile */ + cft = &h->cgroup_files[3]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", buf); + cft->private = MEMFILE_PRIVATE(idx, RES_FAILCNT); + cft->trigger = hugetlb_cgroup_reset; + cft->read = hugetlb_cgroup_read; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files[4]; + memset(cft, 0, sizeof(*cft)); + + WARN_ON(cgroup_add_cftypes(&hugetlb_subsys, h->cgroup_files)); + + return 0; +} + struct cgroup_subsys hugetlb_subsys = { .name = "hugetlb", .create = hugetlb_cgroup_create, -- cgit v0.10.2 From 8e6ac7fab374816de9a8b0a8fbb02ef761a30ff4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:27 -0700 Subject: hugetlb/cgroup: migrate hugetlb cgroup info from oldpage to new page during migration With HugeTLB pages, hugetlb cgroup is uncharged in compound page destructor. Since we are holding a hugepage reference, we can be sure that old page won't get uncharged till the last put_page(). 
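In concrete terms the handoff is a pointer transfer done under hugetlb_lock, using the helpers added earlier in this series (a minimal sketch only; the full hugetlb_cgroup_migrate() is in the diff below):

    struct hugetlb_cgroup *h_cg;

    spin_lock(&hugetlb_lock);
    h_cg = hugetlb_cgroup_from_page(oldhpage);  /* read the current owner */
    set_hugetlb_cgroup(oldhpage, NULL);         /* detach from the old page */
    set_hugetlb_cgroup(newhpage, h_cg);         /* attach to the new page */
    spin_unlock(&hugetlb_lock);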
Signed-off-by: Aneesh Kumar K.V Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Cc: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index 73f1e60..d73878c 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -63,6 +63,8 @@ extern void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, extern void hugetlb_cgroup_uncharge_cgroup(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg); extern int hugetlb_cgroup_file_init(int idx) __init; +extern void hugetlb_cgroup_migrate(struct page *oldhpage, + struct page *newhpage); #else static inline struct hugetlb_cgroup *hugetlb_cgroup_from_page(struct page *page) @@ -114,5 +116,11 @@ static inline int __init hugetlb_cgroup_file_init(int idx) return 0; } +static inline void hugetlb_cgroup_migrate(struct page *oldhpage, + struct page *newhpage) +{ + return; +} + #endif /* CONFIG_MEM_RES_CTLR_HUGETLB */ #endif diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index d1ca119..680e481 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -386,6 +386,26 @@ int __init hugetlb_cgroup_file_init(int idx) return 0; } +void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) +{ + struct hugetlb_cgroup *h_cg; + + if (hugetlb_cgroup_disabled()) + return; + + VM_BUG_ON(!PageHuge(oldhpage)); + spin_lock(&hugetlb_lock); + h_cg = hugetlb_cgroup_from_page(oldhpage); + set_hugetlb_cgroup(oldhpage, NULL); + cgroup_exclude_rmdir(&h_cg->css); + + /* move the h_cg details to new cgroup */ + set_hugetlb_cgroup(newhpage, h_cg); + spin_unlock(&hugetlb_lock); + cgroup_release_and_wakeup_rmdir(&h_cg->css); + return; +} + struct cgroup_subsys hugetlb_subsys = { .name = "hugetlb", .create = hugetlb_cgroup_create, diff --git a/mm/migrate.c b/mm/migrate.c index fdce3a2..6c37c51 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -931,6 +932,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (anon_vma) put_anon_vma(anon_vma); + + if (!rc) + hugetlb_cgroup_migrate(hpage, new_hpage); + unlock_page(hpage); out: put_page(new_hpage); -- cgit v0.10.2 From 585e27ea6d2e71d0091443c39a00a35e6a5c5e8f Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:30 -0700 Subject: hugetlb/cgroup: add HugeTLB controller documentation Signed-off-by: Aneesh Kumar K.V Reviewed-by: KAMEZAWA Hiroyuki Cc: David Rientjes Cc: Hillf Danton Reviewed-by: Michal Hocko Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/cgroups/hugetlb.txt b/Documentation/cgroups/hugetlb.txt new file mode 100644 index 0000000..a9faaca --- /dev/null +++ b/Documentation/cgroups/hugetlb.txt @@ -0,0 +1,45 @@ +HugeTLB Controller +------------------- + +The HugeTLB controller allows to limit the HugeTLB usage per control group and +enforces the controller limit during page fault. Since HugeTLB doesn't +support page reclaim, enforcing the limit at page fault time implies that, +the application will get SIGBUS signal if it tries to access HugeTLB pages +beyond its limit. This requires the application to know beforehand how much +HugeTLB pages it would require for its use. + +HugeTLB controller can be created by first mounting the cgroup filesystem. 
+ +# mount -t cgroup -o hugetlb none /sys/fs/cgroup + +With the above step, the initial or the parent HugeTLB group becomes +visible at /sys/fs/cgroup. At bootup, this group includes all the tasks in +the system. /sys/fs/cgroup/tasks lists the tasks in this cgroup. + +New groups can be created under the parent group /sys/fs/cgroup. + +# cd /sys/fs/cgroup +# mkdir g1 +# echo $$ > g1/tasks + +The above steps create a new group g1 and move the current shell +process (bash) into it. + +Brief summary of control files + + hugetlb..limit_in_bytes # set/show limit of "hugepagesize" hugetlb usage + hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded + hugetlb..usage_in_bytes # show current res_counter usage for "hugepagesize" hugetlb + hugetlb..failcnt # show the number of allocation failure due to HugeTLB limit + +For a system supporting two hugepage size (16M and 16G) the control +files include: + +hugetlb.16GB.limit_in_bytes +hugetlb.16GB.max_usage_in_bytes +hugetlb.16GB.usage_in_bytes +hugetlb.16GB.failcnt +hugetlb.16MB.limit_in_bytes +hugetlb.16MB.max_usage_in_bytes +hugetlb.16MB.usage_in_bytes +hugetlb.16MB.failcnt -- cgit v0.10.2 From 79dbb2368ae3515fad9c8b7c8f831cd86be59b1d Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:32 -0700 Subject: hugetlb: move all the in use pages to active list When we fail to allocate pages from the reserve pool, hugetlb tries to allocate huge pages using alloc_buddy_huge_page. Add these to the active list. We also need to add the huge page we allocate when we soft offline the oldpage to active list. Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c57740bb2..ec7b86e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -928,8 +928,14 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); - if (!page) + if (!page) { page = alloc_buddy_huge_page(h, nid); + if (page) { + spin_lock(&hugetlb_lock); + list_move(&page->lru, &h->hugepage_activelist); + spin_unlock(&hugetlb_lock); + } + } return page; } @@ -1155,6 +1161,9 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, hugepage_subpool_put_pages(spool, chg); return ERR_PTR(-ENOSPC); } + spin_lock(&hugetlb_lock); + list_move(&page->lru, &h->hugepage_activelist); + spin_unlock(&hugetlb_lock); } set_page_private(page, (unsigned long)spool); -- cgit v0.10.2 From 94ae8ba7176666d1e7d8bbb9f93670a27540b6a8 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:35 -0700 Subject: hugetlb/cgroup: assign the page hugetlb cgroup when we move the page to active list. A page's hugetlb cgroup assignment and movement to the active list should occur with hugetlb_lock held. Otherwise when we remove the hugetlb cgroup we will iterate the active list and find pages with NULL hugetlb cgroup values. 
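The invariant this establishes is that the cgroup assignment and the move onto hugepage_activelist happen inside a single hugetlb_lock critical section, roughly (with h, idx, h_cg and page as in alloc_huge_page() in the diff below):

    spin_lock(&hugetlb_lock);
    hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page);
    list_move(&page->lru, &h->hugepage_activelist);
    spin_unlock(&hugetlb_lock);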
Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ec7b86e..c39e4be 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -928,14 +928,8 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) page = dequeue_huge_page_node(h, nid); spin_unlock(&hugetlb_lock); - if (!page) { + if (!page) page = alloc_buddy_huge_page(h, nid); - if (page) { - spin_lock(&hugetlb_lock); - list_move(&page->lru, &h->hugepage_activelist); - spin_unlock(&hugetlb_lock); - } - } return page; } @@ -1150,9 +1144,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, } spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve); - spin_unlock(&hugetlb_lock); - - if (!page) { + if (page) { + /* update page cgroup details */ + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), + h_cg, page); + spin_unlock(&hugetlb_lock); + } else { + spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); if (!page) { hugetlb_cgroup_uncharge_cgroup(idx, @@ -1162,6 +1160,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); } spin_lock(&hugetlb_lock); + hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), + h_cg, page); list_move(&page->lru, &h->hugepage_activelist); spin_unlock(&hugetlb_lock); } @@ -1169,8 +1169,6 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, set_page_private(page, (unsigned long)spool); vma_commit_reservation(h, vma, addr); - /* update page cgroup details */ - hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); return page; } diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 680e481..9834a01 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -213,6 +213,7 @@ done: return ret; } +/* Should be called with hugetlb_lock held */ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, struct hugetlb_cgroup *h_cg, struct page *page) @@ -220,9 +221,7 @@ void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, if (hugetlb_cgroup_disabled() || !h_cg) return; - spin_lock(&hugetlb_lock); set_hugetlb_cgroup(page, h_cg); - spin_unlock(&hugetlb_lock); return; } @@ -389,6 +388,7 @@ int __init hugetlb_cgroup_file_init(int idx) void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) { struct hugetlb_cgroup *h_cg; + struct hstate *h = page_hstate(oldhpage); if (hugetlb_cgroup_disabled()) return; @@ -401,6 +401,7 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(newhpage, h_cg); + list_move(&newhpage->lru, &h->hugepage_activelist); spin_unlock(&hugetlb_lock); cgroup_release_and_wakeup_rmdir(&h_cg->css); return; -- cgit v0.10.2 From 75754681fe79b84dde1048470a44eeb64192fad6 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 31 Jul 2012 16:42:36 -0700 Subject: hugetlb/cgroup: remove exclude and wakeup rmdir calls from migrate We already hold the hugetlb_lock. That should prevent a parallel cgroup rmdir from touching page's hugetlb cgroup. So remove the exclude and wakeup calls. 
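Sketched side by side (schematically, not verbatim code), both parties already serialize on hugetlb_lock, which is why the extra rmdir exclusion is redundant:

    /*
     * migrate path:                     rmdir path (pre_destroy):
     *   spin_lock(&hugetlb_lock);         spin_lock(&hugetlb_lock);
     *   move h_cg to the new page         move charges to the parent
     *   spin_unlock(&hugetlb_lock);       spin_unlock(&hugetlb_lock);
     */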
Signed-off-by: Aneesh Kumar K.V Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 9834a01..a3f358f 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -385,6 +385,10 @@ int __init hugetlb_cgroup_file_init(int idx) return 0; } +/* + * hugetlb_lock will make sure a parallel cgroup rmdir won't happen + * when we migrate hugepages + */ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) { struct hugetlb_cgroup *h_cg; @@ -397,13 +401,11 @@ void hugetlb_cgroup_migrate(struct page *oldhpage, struct page *newhpage) spin_lock(&hugetlb_lock); h_cg = hugetlb_cgroup_from_page(oldhpage); set_hugetlb_cgroup(oldhpage, NULL); - cgroup_exclude_rmdir(&h_cg->css); /* move the h_cg details to new cgroup */ set_hugetlb_cgroup(newhpage, h_cg); list_move(&newhpage->lru, &h->hugepage_activelist); spin_unlock(&hugetlb_lock); - cgroup_release_and_wakeup_rmdir(&h_cg->css); return; } -- cgit v0.10.2 From 4f774b912dd1d5752cd33b696509531b0321c3e0 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:42:37 -0700 Subject: mm, oom: do not schedule if current has been killed The oom killer currently schedules away from current in an uninterruptible sleep if it does not have access to memory reserves. It's possible that current was killed because it shares memory with the oom killed thread or because it was killed by the user in the interim, however. This patch only schedules away from current if it does not have a pending kill, i.e. if it does not share memory with the oom killed thread. It's possible that it will immediately retry its memory allocation and fail, but it will immediately be given access to memory reserves if it calls the oom killer again. This prevents the delay of memory freeing when threads that share memory with the oom killed thread get unnecessarily scheduled. Signed-off-by: David Rientjes Cc: Oleg Nesterov Acked-by: KOSAKI Motohiro Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ac300c9..e8ab3df 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -745,11 +745,11 @@ out: read_unlock(&tasklist_lock); /* - * Give "p" a good chance of killing itself before we - * retry to allocate memory unless "p" is current + * Give the killed threads a good chance of exiting before trying to + * allocate memory again. */ - if (killed && !test_thread_flag(TIF_MEMDIE)) - schedule_timeout_uninterruptible(1); + if (killed) + schedule_timeout_killable(1); } /* @@ -764,6 +764,5 @@ void pagefault_out_of_memory(void) out_of_memory(NULL, 0, 0, NULL, false); clear_system_oom(); } - if (!test_thread_flag(TIF_MEMDIE)) - schedule_timeout_uninterruptible(1); + schedule_timeout_killable(1); } -- cgit v0.10.2 From fd07383b6bbc1418b1bdd5f295d13e600222fffa Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jul 2012 16:42:40 -0700 Subject: mm/memblock.c:memblock_double_array(): cosmetic cleanups This function is an 80-column eyesore, quite unnecessarily. Clean that up, and use standard comment layout style. Cc: Benjamin Herrenschmidt Cc: Greg Pearson Cc: Tejun Heo Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memblock.c b/mm/memblock.c index 5cc6731..4d9393c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -222,13 +222,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, /* Try to find some space for it. 
* * WARNING: We assume that either slab_is_available() and we use it or - * we use MEMBLOCK for allocations. That means that this is unsafe to use - * when bootmem is currently active (unless bootmem itself is implemented - * on top of MEMBLOCK which isn't the case yet) + * we use MEMBLOCK for allocations. That means that this is unsafe to + * use when bootmem is currently active (unless bootmem itself is + * implemented on top of MEMBLOCK which isn't the case yet) * * This should however not be an issue for now, as we currently only - * call into MEMBLOCK while it's still active, or much later when slab is - * active for memory hotplug operations + * call into MEMBLOCK while it's still active, or much later when slab + * is active for memory hotplug operations */ if (use_slab) { new_array = kmalloc(new_size, GFP_KERNEL); @@ -243,8 +243,8 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, new_alloc_size, PAGE_SIZE); if (!addr && new_area_size) addr = memblock_find_in_range(0, - min(new_area_start, memblock.current_limit), - new_alloc_size, PAGE_SIZE); + min(new_area_start, memblock.current_limit), + new_alloc_size, PAGE_SIZE); new_array = addr ? __va(addr) : 0; } @@ -254,12 +254,14 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, return -1; } - memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]", - memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1); + memblock_dbg("memblock: %s is doubled to %ld at [%#010llx-%#010llx]", + memblock_type_name(type), type->max * 2, (u64)addr, + (u64)addr + new_size - 1); - /* Found space, we now need to move the array over before - * we add the reserved region since it may be our reserved - * array itself that is full. + /* + * Found space, we now need to move the array over before we add the + * reserved region since it may be our reserved array itself that is + * full. */ memcpy(new_array, type->regions, old_size); memset(new_array + type->max, 0, old_size); @@ -267,17 +269,16 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, type->regions = new_array; type->max <<= 1; - /* Free old array. We needn't free it if the array is the - * static one - */ + /* Free old array. We needn't free it if the array is the static one */ if (*in_slab) kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) memblock_free(__pa(old_array), old_alloc_size); - /* Reserve the new array if that comes from the memblock. - * Otherwise, we needn't do it + /* + * Reserve the new array if that comes from the memblock. Otherwise, we + * needn't do it */ if (!use_slab) BUG_ON(memblock_reserve(addr, new_alloc_size)); -- cgit v0.10.2 From 59b8e85c26e77d236ebdd61866ffc5e69c5f531d Mon Sep 17 00:00:00 2001 From: Kamezawa Hiroyuki Date: Tue, 31 Jul 2012 16:42:42 -0700 Subject: memcg: remove check for signal_pending() during rmdir() After bf544fdc241da8 "memcg: move charges to root cgroup if use_hierarchy=0 in mem_cgroup_move_hugetlb_parent()", no memory reclaim will occur when removing a memory cgroup. If -EINTR is returned here, cgroup will show a warning. We don't need to handle any user interruption signal. Remove this. 
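After the change the head of the retry loop reduces to the following (an abridged sketch of the surrounding code in mem_cgroup_force_empty(); see the hunk below):

    move_account:
        ret = -EBUSY;
        if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
            goto out;
        /* the -EINTR / signal_pending(current) bail-out is gone: rmdir no
         * longer reclaims, so there is nothing slow to interrupt here */
        lru_add_drain_all();
        drain_all_stock_sync(memcg);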
Signed-off-by: KAMEZAWA Hiroyuki Cc: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1940ba8..8062d45 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3691,9 +3691,6 @@ move_account: ret = -EBUSY; if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) goto out; - ret = -EINTR; - if (signal_pending(current)) - goto out; /* This is for making all *used* pages to be on LRU. */ lru_add_drain_all(); drain_all_stock_sync(memcg); -- cgit v0.10.2 From d845aa2c75bcff87dacf098378b5706b12d1588f Mon Sep 17 00:00:00 2001 From: Kamezawa Hiroyuki Date: Tue, 31 Jul 2012 16:42:44 -0700 Subject: memcg: clean up force_empty_list() return value check After bf544fdc241da8 "memcg: move charges to root cgroup if use_hierarchy=0 in mem_cgroup_move_hugetlb_parent()" mem_cgroup_move_parent() returns only -EBUSY or -EINVAL. So we can remove the -ENOMEM and -EINTR checks. Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8062d45..a1f00f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3653,8 +3653,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, pc = lookup_page_cgroup(page); ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); - if (ret == -ENOMEM || ret == -EINTR) - break; if (ret == -EBUSY || ret == -EINVAL) { /* found lock contention or "pc" is obsolete. */ @@ -3711,9 +3709,6 @@ move_account: } mem_cgroup_end_move(memcg); memcg_oom_recover(memcg); - /* it seems parent cgroup doesn't have enough mem */ - if (ret == -ENOMEM) - goto try_to_free; cond_resched(); /* "ret" should also be checked to ensure all lists are empty. */ } while (res_counter_read_u64(&memcg->res, RES_USAGE) > 0 || ret); -- cgit v0.10.2 From 6068bf0104becf07792b2867bc4d17c369419f8b Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 31 Jul 2012 16:42:45 -0700 Subject: memcg: mem_cgroup_move_parent() doesn't need gfp_mask Signed-off-by: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1f00f0..9647866 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2643,8 +2643,7 @@ out: static int mem_cgroup_move_parent(struct page *page, struct page_cgroup *pc, - struct mem_cgroup *child, - gfp_t gfp_mask) + struct mem_cgroup *child) { struct mem_cgroup *parent; unsigned int nr_pages; @@ -3652,7 +3651,7 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, pc = lookup_page_cgroup(page); - ret = mem_cgroup_move_parent(page, pc, memcg, GFP_KERNEL); + ret = mem_cgroup_move_parent(page, pc, memcg); if (ret == -EBUSY || ret == -EINVAL) { /* found lock contention or "pc" is obsolete. */ -- cgit v0.10.2 From 3c935d189be9bb877c5a1110ac5fbf9c8e310658 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 31 Jul 2012 16:42:46 -0700 Subject: memcg: make mem_cgroup_force_empty_list() return bool mem_cgroup_force_empty_list() just returns 0 or -EBUSY and -EBUSY indicates 'you need to retry'. Make mem_cgroup_force_empty_list() return a bool to simplify the logic. 
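Under the new contract, "true" means some page_cgroups are still on the list and the caller must retry; a hypothetical caller loop (memcg, node, zid and lru assumed to be in scope, as in mem_cgroup_force_empty()) then reads naturally:

    bool busy;

    do {
        busy = mem_cgroup_force_empty_list(memcg, node, zid, lru);
        cond_resched();
    } while (busy);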
[akpm@linux-foundation.org: rework mem_cgroup_force_empty_list()'s comment] Signed-off-by: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9647866..a2677e0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3609,10 +3609,12 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, } /* - * This routine traverse page_cgroup in given list and drop them all. - * *And* this routine doesn't reclaim page itself, just removes page_cgroup. + * Traverse a specified page_cgroup list and try to drop them all. This doesn't + * reclaim the pages page themselves - it just removes the page_cgroups. + * Returns true if some page_cgroups were not freed, indicating that the caller + * must retry this operation. */ -static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, +static bool mem_cgroup_force_empty_list(struct mem_cgroup *memcg, int node, int zid, enum lru_list lru) { struct mem_cgroup_per_zone *mz; @@ -3620,7 +3622,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, struct list_head *list; struct page *busy; struct zone *zone; - int ret = 0; zone = &NODE_DATA(node)->node_zones[zid]; mz = mem_cgroup_zoneinfo(memcg, node, zid); @@ -3634,7 +3635,6 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, struct page_cgroup *pc; struct page *page; - ret = 0; spin_lock_irqsave(&zone->lru_lock, flags); if (list_empty(list)) { spin_unlock_irqrestore(&zone->lru_lock, flags); @@ -3651,19 +3651,14 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *memcg, pc = lookup_page_cgroup(page); - ret = mem_cgroup_move_parent(page, pc, memcg); - - if (ret == -EBUSY || ret == -EINVAL) { + if (mem_cgroup_move_parent(page, pc, memcg)) { /* found lock contention or "pc" is obsolete. */ busy = page; cond_resched(); } else busy = NULL; } - - if (!ret && !list_empty(list)) - return -EBUSY; - return ret; + return !list_empty(list); } /* -- cgit v0.10.2 From c59e26104e3e0e952cd7d63e79cd71ee5a9ec25a Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:42:49 -0700 Subject: mm/compaction: cleanup on compaction_deferred When CONFIG_COMPACTION is enabled, compaction_deferred() tries to recalculate the deferred limit again, which isn't necessary. When CONFIG_COMPACTION is disabled, compaction_deferred() should return "true" or "false" since it has "bool" for its return value. 
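For reference, the CONFIG_COMPACTION version of the check reduces to the following after the cleanup (the defer_limit initialisation is implied by, rather than shown in, the hunk below):

    unsigned long defer_limit = 1UL << zone->compact_defer_shift;

    if (++zone->compact_considered > defer_limit)
        zone->compact_considered = defer_limit;

    /* true means "still deferred": skip compaction this time */
    return zone->compact_considered < defer_limit;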
Signed-off-by: Gavin Shan Acked-by: Minchan Kim Acked-by: Johannes Weiner Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 51a90b7..133ddcf 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -58,7 +58,7 @@ static inline bool compaction_deferred(struct zone *zone, int order) if (++zone->compact_considered > defer_limit) zone->compact_considered = defer_limit; - return zone->compact_considered < (1UL << zone->compact_defer_shift); + return zone->compact_considered < defer_limit; } #else @@ -85,7 +85,7 @@ static inline void defer_compaction(struct zone *zone, int order) static inline bool compaction_deferred(struct zone *zone, int order) { - return 1; + return true; } #endif /* CONFIG_COMPACTION */ -- cgit v0.10.2 From 3d3727cdb07ff17ddc3c551ef8d03d37b60a0372 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Jul 2012 16:42:50 -0700 Subject: mm, fadvise: don't return -EINVAL when filesystem cannot implement fadvise() Eric Wong reported his test suite failed when /tmp is tmpfs. https://lkml.org/lkml/2012/2/24/479 Currently the input check of POSIX_FADV_WILLNEED has two problems. - requires a_ops->readpage. But in fact, force_page_cache_readahead() requires that the target filesystem has either ->readpage or ->readpages. - returns -EINVAL when the filesystem doesn't have ->readpage. But posix says that fadvise is merely a hint. Thus fadvise() should return 0 if filesystem has no means of implementing fadvise(). The userland application should not know nor care which type of filesystem backs the TMPDIR directory, as Eric pointed out. There is nothing which userspace can do to solve this error. So change the return value to 0 when filesystem doesn't support readahead. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: KOSAKI Motohiro Cc: Hugh Dickins Cc: Hillf Danton Signed-off-by: Eric Wong Tested-by: Eric Wong Reviewed-by: Wanlong Gao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/fadvise.c b/mm/fadvise.c index 469491e0..9b75a04 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -93,11 +93,6 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) spin_unlock(&file->f_lock); break; case POSIX_FADV_WILLNEED: - if (!mapping->a_ops->readpage) { - ret = -EINVAL; - break; - } - /* First and last PARTIAL page! */ start_index = offset >> PAGE_CACHE_SHIFT; end_index = endbyte >> PAGE_CACHE_SHIFT; @@ -106,12 +101,13 @@ SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) nrpages = end_index - start_index + 1; if (!nrpages) nrpages = ~0UL; - - ret = force_page_cache_readahead(mapping, file, - start_index, - nrpages); - if (ret > 0) - ret = 0; + + /* + * Ignore return value because fadvise() shall return + * success even if filesystem can't retrieve a hint, + */ + force_page_cache_readahead(mapping, file, start_index, + nrpages); break; case POSIX_FADV_NOREUSE: break; -- cgit v0.10.2 From 2a13515c398bbe471bf64519ada87fd708152ced Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 31 Jul 2012 16:42:53 -0700 Subject: mm: clear pages_scanned only if draining a pcp adds pages to the buddy allocator again commit 2ff754fa8f ("mm: clear pages_scanned only if draining a pcp adds pages to the buddy allocator again") fixed one free_pcppages_bulk() misuse. But two other misuses still exist. This patch fixes them.
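The guard applied in both places is the same pattern: skip the call entirely when there is nothing to drain, so free_pcppages_bulk() never runs with a zero count and cannot clear pages_scanned without actually returning pages to the buddy allocator (as in the hunks below):

    if (to_drain > 0) {
        free_pcppages_bulk(zone, to_drain, pcp);
        pcp->count -= to_drain;
    }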
Signed-off-by: KOSAKI Motohiro Acked-by: David Rientjes Acked-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Minchan Kim Cc: Wu Fengguang Reviewed-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 80ef992..1874752 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1158,8 +1158,10 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) to_drain = pcp->batch; else to_drain = pcp->count; - free_pcppages_bulk(zone, to_drain, pcp); - pcp->count -= to_drain; + if (to_drain > 0) { + free_pcppages_bulk(zone, to_drain, pcp); + pcp->count -= to_drain; + } local_irq_restore(flags); } #endif @@ -3915,7 +3917,8 @@ static int __zone_pcp_update(void *data) pcp = &pset->pcp; local_irq_save(flags); - free_pcppages_bulk(zone, pcp->count, pcp); + if (pcp->count > 0) + free_pcppages_bulk(zone, pcp->count, pcp); setup_pageset(pset, batch); local_irq_restore(flags); } -- cgit v0.10.2 From 121d1ba0a019e1465a53533aea133b1b0f6b442d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:42:55 -0700 Subject: mm, oom: fix potential killing of thread that is disabled from oom killing /proc/sys/vm/oom_kill_allocating_task will immediately kill current when the oom killer is called to avoid a potentially expensive tasklist scan for large systems. Currently, however, it is not checking current's oom_score_adj value which may be OOM_SCORE_ADJ_MIN, meaning that it has been disabled from oom killing. This patch avoids killing current in such a condition and simply falls back to the tasklist scan since memory still needs to be freed. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e8ab3df..50e7437 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -719,9 +719,9 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); read_lock(&tasklist_lock); - if (sysctl_oom_kill_allocating_task && + if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, nodemask) && - current->mm) { + current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, nodemask, "Out of memory (oom_kill_allocating_task)"); -- cgit v0.10.2 From de34d965a80d0f61a354bdefa0b15a88bcff2028 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:42:56 -0700 Subject: mm, oom: replace some information in tasklist dump The number of ptes and swap entries are used in the oom killer's badness heuristic, so they should be shown in the tasklist dump. This patch adds those fields and replaces cpu and oom_adj values that are currently emitted. Cpu isn't interesting and oom_adj is deprecated and will be removed later this year, the same information is already displayed as oom_score_adj which is used internally. At the same time, make the documentation a little more clear to state this information is helpful to determine why the oom killer chose the task it did to kill. 
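The per-task numbers added to the dump are read directly from the task's mm, mirroring the expressions in the updated pr_info() below (illustrative sketch only):

    unsigned long rss      = get_mm_rss(task->mm);
    unsigned long nr_ptes  = task->mm->nr_ptes;
    unsigned long swapents = get_mm_counter(task->mm, MM_SWAPENTS);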
Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 06d662b..dcc2a94 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -491,9 +491,10 @@ oom_dump_tasks Enables a system-wide task dump (excluding kernel threads) to be produced when the kernel performs an OOM-killing and includes such -information as pid, uid, tgid, vm size, rss, cpu, oom_adj score, and -name. This is helpful to determine why the OOM killer was invoked -and to identify the rogue task that caused it. +information as pid, uid, tgid, vm size, rss, nr_ptes, swapents, +oom_score_adj score, and name. This is helpful to determine why the +OOM killer was invoked, to identify the rogue task that caused it, +and to determine why the OOM killer chose the task it did to kill. If this is set to zero, this information is suppressed. On very large systems with thousands of tasks it may not be feasible to dump diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 50e7437..c82ede6 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -371,8 +371,8 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * Dumps the current memory state of all eligible tasks. Tasks not in the same * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes * are not shown. - * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj - * value, oom_score_adj value, and name. + * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, + * swapents, oom_score_adj value, and name. * * Call with tasklist_lock read-locked. */ @@ -381,7 +381,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n"); + pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) continue; @@ -396,10 +396,11 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), - task_cpu(task), task->signal->oom_adj, + task->mm->nr_ptes, + get_mm_counter(task->mm, MM_SWAPENTS), task->signal->oom_score_adj, task->comm); task_unlock(task); } -- cgit v0.10.2 From 97d255c816946388bab504122937730d3447c612 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:42:59 -0700 Subject: mm: do not use page_count() without a page pin d179e84ba ("mm: vmscan: do not use page_count without a page pin") fixed this problem in vmscan.c but same problem is in __count_immobile_pages(). I copy and paste d179e84ba's contents for description. "It is unsafe to run page_count during the physical pfn scan because compound_head could trip on a dangling pointer when reading page->first_page if the compound page is being freed by another CPU." 
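The fix replaces the unpinned page_count() read with a raw refcount read, which never walks compound_head() and so never chases a dangling first_page pointer (as in the hunk below):

    /*
     * We can't use page_count() without pinning the page, because another
     * CPU may free the compound page under us; compound tails always have
     * page->_count == 0, so they are skipped here as well.
     */
    if (!atomic_read(&page->_count)) {
        if (PageBuddy(page))
            iter += (1 << page_order(page)) - 1;
        continue;
    }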
Signed-off-by: Minchan Kim Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Michal Hocko Reviewed-by: KAMEZAWA Hiroyuki Cc: Wanpeng Li Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1874752..bb790f5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5500,11 +5500,18 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) continue; page = pfn_to_page(check); - if (!page_count(page)) { + /* + * We can't use page_count without pin a page + * because another CPU can free compound page. + * This check already skips compound tails of THP + * because their page->_count is zero at all time. + */ + if (!atomic_read(&page->_count)) { if (PageBuddy(page)) iter += (1 << page_order(page)) - 1; continue; } + if (!PageLRU(page)) found++; /* -- cgit v0.10.2 From 80934513b230bfcf70265f2ef0fdae89fb391633 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:43:01 -0700 Subject: mm: clean up __count_immobile_pages() The __count_immobile_pages() naming is rather awkward. Choose a more clear name and add a comment. Signed-off-by: Minchan Kim Cc: Andrea Arcangeli Cc: Mel Gorman Cc: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bb790f5..fba2a12 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5471,26 +5471,28 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, } /* - * This is designed as sub function...plz see page_isolation.c also. - * set/clear page block's type to be ISOLATE. - * page allocater never alloc memory from ISOLATE block. + * This function checks whether pageblock includes unmovable pages or not. + * If @count is not zero, it is okay to include less @count unmovable pages + * + * PageLRU check wihtout isolation or lru_lock could race so that + * MIGRATE_MOVABLE block might include unmovable pages. It means you can't + * expect this function should be exact. */ - -static int -__count_immobile_pages(struct zone *zone, struct page *page, int count) +static bool +__has_unmovable_pages(struct zone *zone, struct page *page, int count) { unsigned long pfn, iter, found; int mt; /* * For avoiding noise data, lru_add_drain_all() should be called - * If ZONE_MOVABLE, the zone never contains immobile pages + * If ZONE_MOVABLE, the zone never contains unmovable pages */ if (zone_idx(zone) == ZONE_MOVABLE) - return true; + return false; mt = get_pageblock_migratetype(page); if (mt == MIGRATE_MOVABLE || is_migrate_cma(mt)) - return true; + return false; pfn = page_to_pfn(page); for (found = 0, iter = 0; iter < pageblock_nr_pages; iter++) { @@ -5528,9 +5530,9 @@ __count_immobile_pages(struct zone *zone, struct page *page, int count) * page at boot. */ if (found > count) - return false; + return true; } - return true; + return false; } bool is_pageblock_removable_nolock(struct page *page) @@ -5554,7 +5556,7 @@ bool is_pageblock_removable_nolock(struct page *page) zone->zone_start_pfn + zone->spanned_pages <= pfn) return false; - return __count_immobile_pages(zone, page, 0); + return !__has_unmovable_pages(zone, page, 0); } int set_migratetype_isolate(struct page *page) @@ -5593,12 +5595,12 @@ int set_migratetype_isolate(struct page *page) * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. 
*/ - if (__count_immobile_pages(zone, page, arg.pages_found)) + if (!__has_unmovable_pages(zone, page, arg.pages_found)) ret = 0; - /* - * immobile means "not-on-lru" paes. If immobile is larger than - * removable-by-driver pages reported by notifier, we'll fail. + * Unmovable means "not-on-lru" pages. If Unmovable pages are + * larger than removable-by-driver pages reported by notifier, + * we'll fail. */ out: -- cgit v0.10.2 From c255a458055e459f65eb7b7f51dc5dbdd0caf1d8 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 31 Jul 2012 16:43:02 -0700 Subject: memcg: rename config variables Sanity: CONFIG_CGROUP_MEM_RES_CTLR -> CONFIG_MEMCG CONFIG_CGROUP_MEM_RES_CTLR_SWAP -> CONFIG_MEMCG_SWAP CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED -> CONFIG_MEMCG_SWAP_ENABLED CONFIG_CGROUP_MEM_RES_CTLR_KMEM -> CONFIG_MEMCG_KMEM [mhocko@suse.cz: fix missed bits] Cc: Glauber Costa Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Hugh Dickins Cc: Tejun Heo Cc: Aneesh Kumar K.V Cc: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index dd88540..672676a 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -187,12 +187,12 @@ the cgroup that brought it in -- this will happen on memory pressure). But see section 8.2: when moving a task to another cgroup, its pages may be recharged to the new cgroup, if move_charge_at_immigrate has been chosen. -Exception: If CONFIG_CGROUP_CGROUP_MEM_RES_CTLR_SWAP is not used. +Exception: If CONFIG_CGROUP_CGROUP_MEMCG_SWAP is not used. When you do swapoff and make swapped-out pages of shmem(tmpfs) to be backed into memory in force, charges for pages are accounted against the caller of swapoff rather than the users of shmem. -2.4 Swap Extension (CONFIG_CGROUP_MEM_RES_CTLR_SWAP) +2.4 Swap Extension (CONFIG_MEMCG_SWAP) Swap Extension allows you to record charge for swap. A swapped-in page is charged back to original page allocator if possible. @@ -259,7 +259,7 @@ When oom event notifier is registered, event will be delivered. per-zone-per-cgroup LRU (cgroup's private LRU) is just guarded by zone->lru_lock, it has no lock of its own. -2.7 Kernel Memory Extension (CONFIG_CGROUP_MEM_RES_CTLR_KMEM) +2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) With the Kernel memory extension, the Memory Controller is able to limit the amount of kernel memory used by the system. Kernel memory is fundamentally @@ -286,8 +286,8 @@ per cgroup, instead of globally. a. Enable CONFIG_CGROUPS b. Enable CONFIG_RESOURCE_COUNTERS -c. Enable CONFIG_CGROUP_MEM_RES_CTLR -d. Enable CONFIG_CGROUP_MEM_RES_CTLR_SWAP (to use swap extension) +c. Enable CONFIG_MEMCG +d. Enable CONFIG_MEMCG_SWAP (to use swap extension) 1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) 
# mount -t tmpfs none /sys/fs/cgroup diff --git a/arch/powerpc/configs/chroma_defconfig b/arch/powerpc/configs/chroma_defconfig index b1f9597..29bb11e 100644 --- a/arch/powerpc/configs/chroma_defconfig +++ b/arch/powerpc/configs/chroma_defconfig @@ -21,8 +21,8 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y -CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y +CONFIG_CGROUP_MEMCG=y +CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_NAMESPACES=y CONFIG_RELAY=y CONFIG_BLK_DEV_INITRD=y diff --git a/arch/s390/defconfig b/arch/s390/defconfig index 37d2bf2..de57702 100644 --- a/arch/s390/defconfig +++ b/arch/s390/defconfig @@ -13,7 +13,7 @@ CONFIG_CGROUPS=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y +CONFIG_CGROUP_MEMCG=y CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y diff --git a/arch/sh/configs/apsh4ad0a_defconfig b/arch/sh/configs/apsh4ad0a_defconfig index e758348..95ae23f 100644 --- a/arch/sh/configs/apsh4ad0a_defconfig +++ b/arch/sh/configs/apsh4ad0a_defconfig @@ -11,7 +11,7 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y +CONFIG_CGROUP_MEMCG=y CONFIG_BLK_CGROUP=y CONFIG_NAMESPACES=y CONFIG_BLK_DEV_INITRD=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index 8a7dd7b..76a76a2 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -18,8 +18,8 @@ CONFIG_CPUSETS=y # CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y -CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y +CONFIG_CGROUP_MEMCG=y +CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_CGROUP=y diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index 72c3fad..6bc30ab 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -11,7 +11,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y +CONFIG_CGROUP_MEMCG=y CONFIG_RELAY=y CONFIG_NAMESPACES=y CONFIG_UTS_NS=y diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig index 6bb4130..cd6c519 100644 --- a/arch/sh/configs/shx3_defconfig +++ b/arch/sh/configs/shx3_defconfig @@ -13,7 +13,7 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y +CONFIG_CGROUP_MEMCG=y CONFIG_RELAY=y CONFIG_NAMESPACES=y CONFIG_UTS_NS=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index 8bfa4d0..d7f89be 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -15,8 +15,8 @@ CONFIG_CPUSETS=y # CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y -CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y +CONFIG_CGROUP_MEMCG=y +CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig index b8d99ac..0270620 100644 --- a/arch/tile/configs/tilegx_defconfig +++ b/arch/tile/configs/tilegx_defconfig @@ -18,8 +18,8 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y -CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y +CONFIG_CGROUP_MEMCG=y 
+CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_CGROUP=y diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig index 2b1fd31..c11de27 100644 --- a/arch/tile/configs/tilepro_defconfig +++ b/arch/tile/configs/tilepro_defconfig @@ -17,8 +17,8 @@ CONFIG_CGROUP_DEVICE=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y -CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y +CONFIG_CGROUP_MEMCG=y +CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_CGROUP=y diff --git a/arch/um/defconfig b/arch/um/defconfig index 7823ab1..fec0d5d 100644 --- a/arch/um/defconfig +++ b/arch/um/defconfig @@ -155,10 +155,10 @@ CONFIG_CPUSETS=y CONFIG_PROC_PID_CPUSET=y CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y -CONFIG_CGROUP_MEM_RES_CTLR=y -CONFIG_CGROUP_MEM_RES_CTLR_SWAP=y -# CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED is not set -# CONFIG_CGROUP_MEM_RES_CTLR_KMEM is not set +CONFIG_CGROUP_MEMCG=y +CONFIG_CGROUP_MEMCG_SWAP=y +# CONFIG_CGROUP_MEMCG_SWAP_ENABLED is not set +# CONFIG_CGROUP_MEMCG_KMEM is not set CONFIG_CGROUP_SCHED=y CONFIG_FAIR_GROUP_SCHED=y # CONFIG_CFS_BANDWIDTH is not set diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 5b41ce0..dfae957 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -31,7 +31,7 @@ SUBSYS(cpuacct) /* */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG SUBSYS(mem_cgroup) #endif diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 83e7ba9..1700762 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -38,7 +38,7 @@ struct mem_cgroup_reclaim_cookie { unsigned int generation; }; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG /* * All "charge" functions with gfp_mask should use GFP_KERNEL or * (gfp_mask & GFP_RECLAIM_MASK). 
In current implementatin, memcg doesn't @@ -124,7 +124,7 @@ extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, extern void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; #endif @@ -193,7 +193,7 @@ void mem_cgroup_split_huge_fixup(struct page *head); bool mem_cgroup_bad_page_check(struct page *page); void mem_cgroup_print_bad_page(struct page *page); #endif -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ +#else /* CONFIG_MEMCG */ struct mem_cgroup; static inline int mem_cgroup_newpage_charge(struct page *page, @@ -384,9 +384,9 @@ static inline void mem_cgroup_replace_page_cache(struct page *oldpage, struct page *newpage) { } -#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ +#endif /* CONFIG_MEMCG */ -#if !defined(CONFIG_CGROUP_MEM_RES_CTLR) || !defined(CONFIG_DEBUG_VM) +#if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM) static inline bool mem_cgroup_bad_page_check(struct page *page) { @@ -406,7 +406,7 @@ enum { }; struct sock; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM void sock_update_memcg(struct sock *sk); void sock_release_memcg(struct sock *sk); #else @@ -416,6 +416,6 @@ static inline void sock_update_memcg(struct sock *sk) static inline void sock_release_memcg(struct sock *sk) { } -#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ +#endif /* CONFIG_MEMCG_KMEM */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 458988b..3bdfa15 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -201,7 +201,7 @@ struct zone_reclaim_stat { struct lruvec { struct list_head lists[NR_LRU_LISTS]; struct zone_reclaim_stat reclaim_stat; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG struct zone *zone; #endif }; @@ -671,7 +671,7 @@ typedef struct pglist_data { int nr_zones; #ifdef CONFIG_FLAT_NODE_MEM_MAP /* means !SPARSEMEM */ struct page *node_mem_map; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG struct page_cgroup *node_page_cgroup; #endif #endif @@ -736,7 +736,7 @@ extern void lruvec_init(struct lruvec *lruvec, struct zone *zone); static inline struct zone *lruvec_zone(struct lruvec *lruvec) { -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG return lruvec->zone; #else return container_of(lruvec, struct zone, lruvec); @@ -1052,7 +1052,7 @@ struct mem_section { /* See declaration of similar field in struct zone */ unsigned long *pageblock_flags; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG /* * If !SPARSEMEM, pgdat doesn't have page_cgroup pointer. We use * section. (see memcontrol.h/page_cgroup.h about this.) 
diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index a88cdba..777a524 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -12,7 +12,7 @@ enum { #ifndef __GENERATING_BOUNDS_H #include -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG #include /* @@ -82,7 +82,7 @@ static inline void unlock_page_cgroup(struct page_cgroup *pc) bit_spin_unlock(PCG_LOCK, &pc->flags); } -#else /* CONFIG_CGROUP_MEM_RES_CTLR */ +#else /* CONFIG_MEMCG */ struct page_cgroup; static inline void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) @@ -102,11 +102,11 @@ static inline void __init page_cgroup_init_flatmem(void) { } -#endif /* CONFIG_CGROUP_MEM_RES_CTLR */ +#endif /* CONFIG_MEMCG */ #include -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); extern unsigned short swap_cgroup_record(swp_entry_t ent, unsigned short id); @@ -138,7 +138,7 @@ static inline void swap_cgroup_swapoff(int type) return; } -#endif /* CONFIG_CGROUP_MEM_RES_CTLR_SWAP */ +#endif /* CONFIG_MEMCG_SWAP */ #endif /* !__GENERATING_BOUNDS_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 68dcffa..865725a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1584,7 +1584,7 @@ struct task_struct { /* bitmask and counter of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ +#ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ struct memcg_batch_info { int do_batch; /* incremented when batch uncharge started */ struct mem_cgroup *memcg; /* target memcg of uncharge */ diff --git a/include/linux/swap.h b/include/linux/swap.h index c84ec68..9a16bb1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -301,7 +301,7 @@ static inline void scan_unevictable_unregister_node(struct node *node) extern int kswapd_run(int nid); extern void kswapd_stop(int nid); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG extern int mem_cgroup_swappiness(struct mem_cgroup *mem); #else static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) @@ -309,7 +309,7 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem) return vm_swappiness; } #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP extern void mem_cgroup_uncharge_swap(swp_entry_t ent); #else static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) @@ -360,7 +360,7 @@ extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG extern void mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout); #else diff --git a/include/net/sock.h b/include/net/sock.h index e067f8c..cee528c 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -913,7 +913,7 @@ struct proto { #ifdef SOCK_REFCNT_DEBUG atomic_t socks; #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM /* * cgroup specific init/deinit functions. Called once for all * protocols that implement it, from cgroups populate function. 
@@ -994,7 +994,7 @@ inline void sk_refcnt_debug_release(const struct sock *sk) #define sk_refcnt_debug_release(sk) do { } while (0) #endif /* SOCK_REFCNT_DEBUG */ -#if defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) && defined(CONFIG_NET) +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_NET) extern struct static_key memcg_socket_limit_enabled; static inline struct cg_proto *parent_cg_proto(struct proto *proto, struct cg_proto *cg_proto) diff --git a/init/Kconfig b/init/Kconfig index 7243776..af6c7f8 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -686,7 +686,7 @@ config RESOURCE_COUNTERS This option enables controller independent resource accounting infrastructure that works with cgroups. -config CGROUP_MEM_RES_CTLR +config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS select MM_OWNER @@ -709,9 +709,9 @@ config CGROUP_MEM_RES_CTLR This config option also selects MM_OWNER config option, which could in turn add some fork/exit overhead. -config CGROUP_MEM_RES_CTLR_SWAP +config MEMCG_SWAP bool "Memory Resource Controller Swap Extension" - depends on CGROUP_MEM_RES_CTLR && SWAP + depends on MEMCG && SWAP help Add swap management feature to memory resource controller. When you enable this, you can limit mem+swap usage per cgroup. In other words, @@ -726,9 +726,9 @@ config CGROUP_MEM_RES_CTLR_SWAP if boot option "swapaccount=0" is set, swap will not be accounted. Now, memory usage of swap_cgroup is 2 bytes per entry. If swap page size is 4096bytes, 512k per 1Gbytes of swap. -config CGROUP_MEM_RES_CTLR_SWAP_ENABLED +config MEMCG_SWAP_ENABLED bool "Memory Resource Controller Swap Extension enabled by default" - depends on CGROUP_MEM_RES_CTLR_SWAP + depends on MEMCG_SWAP default y help Memory Resource Controller Swap Extension comes with its price in @@ -739,9 +739,9 @@ config CGROUP_MEM_RES_CTLR_SWAP_ENABLED For those who want to have the feature enabled by default should select this option (if, for some reason, they need to disable it then swapaccount=0 does the trick). 
-config CGROUP_MEM_RES_CTLR_KMEM +config MEMCG_KMEM bool "Memory Resource Controller Kernel Memory accounting (EXPERIMENTAL)" - depends on CGROUP_MEM_RES_CTLR && EXPERIMENTAL + depends on MEMCG && EXPERIMENTAL default n help The Kernel Memory extension for Memory Resource Controller can limit diff --git a/kernel/fork.c b/kernel/fork.c index aaa8813..3bd2280 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1306,7 +1306,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG p->memcg_batch.do_batch = 0; p->memcg_batch.memcg = NULL; #endif diff --git a/mm/Makefile b/mm/Makefile index fd6fc1c..290bbfe 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -49,7 +49,7 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o +obj-$(CONFIG_MEMCG) += memcontrol.o page_cgroup.o obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index cc448bb..3a61efc 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -123,7 +123,7 @@ static int pfn_inject_init(void) if (!dentry) goto fail; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP dentry = debugfs_create_u64("corrupt-filter-memcg", 0600, hwpoison_dir, &hwpoison_filter_memcg); if (!dentry) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a2677e0..55a85e1 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -61,12 +61,12 @@ struct cgroup_subsys mem_cgroup_subsys __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 static struct mem_cgroup *root_mem_cgroup __read_mostly; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP /* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */ int do_swap_account __read_mostly; /* for remember boot option*/ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP_ENABLED +#ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; #else static int really_do_swap_account __initdata = 0; @@ -407,7 +407,7 @@ static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); /* Writing them here to avoid exposing memcg's inner layout */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM #include #include @@ -466,9 +466,9 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(tcp_proto_cgroup); #endif /* CONFIG_INET */ -#endif /* CONFIG_CGROUP_MEM_RES_CTLR_KMEM */ +#endif /* CONFIG_MEMCG_KMEM */ -#if defined(CONFIG_INET) && defined(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) +#if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) static void disarm_sock_keys(struct mem_cgroup *memcg) { if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto)) @@ -3085,7 +3085,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) } #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP /* * called from swap_entry_free(). remove record in swap_cgroup and * uncharge "memsw" account. 
@@ -4518,7 +4518,7 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp, return 0; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { return mem_cgroup_sockets_init(memcg, ss); @@ -4608,7 +4608,7 @@ static struct cftype mem_cgroup_files[] = { .read_seq_string = mem_control_numa_stat_show, }, #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP { .name = "memsw.usage_in_bytes", .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), @@ -4795,7 +4795,7 @@ struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg) } EXPORT_SYMBOL(parent_mem_cgroup); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP static void __init enable_swap_cgroup(void) { if (!mem_cgroup_disabled() && really_do_swap_account) @@ -5526,7 +5526,7 @@ struct cgroup_subsys mem_cgroup_subsys = { .__DEPRECATED_clear_css_refs = true, }; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index b04ff2d..a6e2141 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -128,7 +128,7 @@ static int hwpoison_filter_flags(struct page *p) * can only guarantee that the page either belongs to the memcg tasks, or is * a freed page. */ -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP u64 hwpoison_filter_memcg; EXPORT_SYMBOL_GPL(hwpoison_filter_memcg); static int hwpoison_filter_task(struct page *p) diff --git a/mm/mmzone.c b/mm/mmzone.c index 6830eab..3cef80f 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -96,7 +96,7 @@ void lruvec_init(struct lruvec *lruvec, struct zone *zone) for_each_lru(lru) INIT_LIST_HEAD(&lruvec->lists[lru]); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG lruvec->zone = zone; #endif } diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c82ede6..e6c1064 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -541,7 +541,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, sysctl_panic_on_oom == 2 ? 
"compulsory" : "system-wide"); } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index eb750f8..5ddad0c 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -317,7 +317,7 @@ void __meminit pgdat_page_cgroup_init(struct pglist_data *pgdat) #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP +#ifdef CONFIG_MEMCG_SWAP static DEFINE_MUTEX(swap_cgroup_mutex); struct swap_cgroup_ctrl { diff --git a/mm/vmscan.c b/mm/vmscan.c index 347b3ff..6b1f89a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -133,7 +133,7 @@ long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG static bool global_reclaim(struct scan_control *sc) { return !sc->target_mem_cgroup; @@ -2142,7 +2142,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, return nr_reclaimed; } -#ifdef CONFIG_CGROUP_MEM_RES_CTLR +#ifdef CONFIG_MEMCG unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, diff --git a/net/core/sock.c b/net/core/sock.c index 2676a88..a67b062 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -142,7 +142,7 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss) { struct proto *proto; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index ae2ccf2..15ca63e 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -49,7 +49,7 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR_KMEM) += tcp_memcontrol.o +obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o obj-$(CONFIG_NETLABEL) += cipso_ipv4.o obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \ diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5840c32..ed7db3f 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -184,7 +184,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, int ret; unsigned long vec[3]; struct net *net = current->nsproxy->net_ns; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM struct mem_cgroup *memcg; #endif @@ -203,7 +203,7 @@ static int ipv4_tcp_mem(ctl_table *ctl, int write, if (ret) return ret; -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM rcu_read_lock(); memcg = mem_cgroup_from_task(current); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2fbd992..4bc8f67 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2633,7 +2633,7 @@ struct proto tcp_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM .init_cgroup = tcp_init_cgroup, .destroy_cgroup = tcp_destroy_cgroup, .proto_cgroup = tcp_proto_cgroup, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 221224e..61c7b6d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2015,7 +2015,7 @@ struct proto tcpv6_prot = { .compat_setsockopt = compat_tcp_setsockopt, .compat_getsockopt = compat_tcp_getsockopt, #endif -#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM +#ifdef CONFIG_MEMCG_KMEM 
.proto_cgroup = tcp_proto_cgroup, #endif }; -- cgit v0.10.2 From ca28ddc908fcfef0e5c1b6e5df632db7fc26de10 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:43:04 -0700 Subject: mm: remove unused LRU_ALL_EVICTABLE Signed-off-by: Wanpeng Li Acked-by: KAMEZAWA Hiroyuki Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3bdfa15..1495d95 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -209,7 +209,6 @@ struct lruvec { /* Mask used at gathering information at once (see memcontrol.c) */ #define LRU_ALL_FILE (BIT(LRU_INACTIVE_FILE) | BIT(LRU_ACTIVE_FILE)) #define LRU_ALL_ANON (BIT(LRU_INACTIVE_ANON) | BIT(LRU_ACTIVE_ANON)) -#define LRU_ALL_EVICTABLE (LRU_ALL_FILE | LRU_ALL_ANON) #define LRU_ALL ((1 << NR_LRU_LISTS) - 1) /* Isolate clean file */ -- cgit v0.10.2 From 567fb435bb7a37afda35902b884562c40756dc45 Mon Sep 17 00:00:00 2001 From: Glauber Costa Date: Tue, 31 Jul 2012 16:43:07 -0700 Subject: memcg: fix bad behavior in use_hierarchy file I have an application that does the following: * copy the state of all controllers attached to a hierarchy * replicate it as a child of the current level. I would expect writes to the files to mostly succeed, since they are inheriting sane values from parents. But that is not the case for use_hierarchy. If it is set to 0, we succeed ok. If we're set to 1, the value of the file is automatically set to 1 in the children, but if userspace tries to write the very same 1, it will fail. That same situation happens if we set use_hierarchy, create a child, and then try to write 1 again. Now, there is no reason whatsoever for failing to write a value that is already there. It doesn't even match the comments, that states: /* If parent's use_hierarchy is set, we can't make any modifications * in the child subtrees... since we are not changing anything. So test the new value against the one we're storing, and automatically return 0 if we're not proposing a change. Signed-off-by: Glauber Costa Cc: Dhaval Giani Acked-by: Michal Hocko Cc: Kamezawa Hiroyuki Acked-by: Johannes Weiner Cc: Ying Han Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 55a85e1..6d3dd54 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3764,6 +3764,10 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, parent_memcg = mem_cgroup_from_cont(parent); cgroup_lock(); + + if (memcg->use_hierarchy == val) + goto out; + /* * If parent's use_hierarchy is set, we can't make any modifications * in the child subtrees. 
If it is unset, then the change can @@ -3780,6 +3784,8 @@ static int mem_cgroup_hierarchy_write(struct cgroup *cont, struct cftype *cft, retval = -EBUSY; } else retval = -EINVAL; + +out: cgroup_unlock(); return retval; -- cgit v0.10.2 From ab2158848775c7918288f2c423d3e4dbbc7d34eb Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:43:09 -0700 Subject: memcg: rename mem_control_xxx to memcg_xxx Replace memory_cgroup_xxx() with memcg_xxx() Signed-off-by: Wanpeng Li Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6d3dd54..b11fb2f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4006,7 +4006,7 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp, #endif #ifdef CONFIG_NUMA -static int mem_control_numa_stat_show(struct cgroup *cont, struct cftype *cft, +static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft, struct seq_file *m) { int nid; @@ -4065,7 +4065,7 @@ static inline void mem_cgroup_lru_names_not_uptodate(void) BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); } -static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, +static int memcg_stat_show(struct cgroup *cont, struct cftype *cft, struct seq_file *m) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); @@ -4579,7 +4579,7 @@ static struct cftype mem_cgroup_files[] = { }, { .name = "stat", - .read_seq_string = mem_control_stat_show, + .read_seq_string = memcg_stat_show, }, { .name = "force_empty", @@ -4611,7 +4611,7 @@ static struct cftype mem_cgroup_files[] = { #ifdef CONFIG_NUMA { .name = "numa_stat", - .read_seq_string = mem_control_numa_stat_show, + .read_seq_string = memcg_numa_stat_show, }, #endif #ifdef CONFIG_MEMCG_SWAP -- cgit v0.10.2 From 7db8889ab05b57200158432755af318fb68854a2 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 31 Jul 2012 16:43:12 -0700 Subject: mm: have order > 0 compaction start off where it left Order > 0 compaction stops when enough free pages of the correct page order have been coalesced. When doing subsequent higher order allocations, it is possible for compaction to be invoked many times. However, the compaction code always starts out looking for things to compact at the start of the zone, and for free pages to compact things to at the end of the zone. This can cause quadratic behaviour, with isolate_freepages starting at the end of the zone each time, even though previous invocations of the compaction code already filled up all free memory on that end of the zone. This can cause isolate_freepages to take enormous amounts of CPU with certain workloads on larger memory systems. The obvious solution is to have isolate_freepages remember where it left off last time, and continue at that point the next time it gets invoked for an order > 0 compaction. This could cause compaction to fail if cc->free_pfn and cc->migrate_pfn are close together initially, in that case we restart from the end of the zone and try once more. Forced full (order == -1) compactions are left alone. 
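To make the incremental free scanner easier to picture, here is a small stand-alone C model of the idea only. Every name in it (toy_zone, partial_scan_start and so on) is invented for illustration; the real patch keeps the cached position in zone->compact_cached_free_pfn and tracks the restart with cc->start_free_pfn and cc->wrapped, as the diff below shows.

#include <stdbool.h>
#include <stdio.h>

/*
 * Toy model of "remember where the free scanner stopped".  All names are
 * invented; the kernel keeps the cached position in
 * zone->compact_cached_free_pfn.
 */
struct toy_zone {
	unsigned long start_pfn;       /* first pfn of the zone */
	unsigned long end_pfn;         /* one past the last pfn */
	unsigned long cached_free_pfn; /* where the last partial run stopped */
};

/* A full (order == -1) run always starts the free scanner at the zone end. */
static unsigned long full_scan_start(const struct toy_zone *z)
{
	return z->end_pfn;
}

/*
 * A partial (order > 0) run resumes from the cached position so repeated
 * invocations do not rescan memory that was already compacted.  If the
 * cached position has already met the migrate scanner, wrap around once:
 * restart from the zone end and remember that we wrapped, so the run can
 * finish when it gets back to where it started.
 */
static unsigned long partial_scan_start(struct toy_zone *z, int order,
					unsigned long migrate_pfn,
					bool *wrapped)
{
	if (order <= 0)
		return full_scan_start(z);

	if (z->cached_free_pfn <= migrate_pfn && !*wrapped) {
		z->cached_free_pfn = full_scan_start(z);
		*wrapped = true;
	}
	return z->cached_free_pfn;
}

int main(void)
{
	struct toy_zone z = { .start_pfn = 0, .end_pfn = 1UL << 18 };
	bool wrapped = false;

	z.cached_free_pfn = z.end_pfn; /* as at boot */
	printf("first partial run starts at %lu\n",
	       partial_scan_start(&z, 2, 0, &wrapped));

	z.cached_free_pfn = 1UL << 16; /* an earlier run left off here */
	printf("next partial run resumes at %lu\n",
	       partial_scan_start(&z, 2, 0, &wrapped));

	/* scanners met: wrap once and restart from the zone end */
	printf("after meeting the migrate scanner: %lu (wrapped=%d)\n",
	       partial_scan_start(&z, 2, 1UL << 16, &wrapped), wrapped);
	return 0;
}

Resuming from the cached pfn is what turns the repeated end-of-zone rescans into roughly linear work across successive higher-order allocations.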
[akpm@linux-foundation.org: checkpatch fixes] [akpm@linux-foundation.org: s/laste/last/, use 80 cols] Signed-off-by: Rik van Riel Reported-by: Jim Schutt Tested-by: Jim Schutt Cc: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1495d95..1aeadce 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -368,6 +368,10 @@ struct zone { */ spinlock_t lock; int all_unreclaimable; /* All pages pinned */ +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + /* pfn where the last incremental compaction isolated free pages */ + unsigned long compact_cached_free_pfn; +#endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ seqlock_t span_seqlock; diff --git a/mm/compaction.c b/mm/compaction.c index 2f42d95..e78cb96 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -422,6 +422,17 @@ static void isolate_freepages(struct zone *zone, pfn -= pageblock_nr_pages) { unsigned long isolated; + /* + * Skip ahead if another thread is compacting in the area + * simultaneously. If we wrapped around, we can only skip + * ahead if zone->compact_cached_free_pfn also wrapped to + * above our starting point. + */ + if (cc->order > 0 && (!cc->wrapped || + zone->compact_cached_free_pfn > + cc->start_free_pfn)) + pfn = min(pfn, zone->compact_cached_free_pfn); + if (!pfn_valid(pfn)) continue; @@ -461,8 +472,11 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) + if (isolated) { high_pfn = max(high_pfn, pfn); + if (cc->order > 0) + zone->compact_cached_free_pfn = high_pfn; + } } /* split_free_page does not map the pages */ @@ -556,6 +570,20 @@ static isolate_migrate_t isolate_migratepages(struct zone *zone, return ISOLATE_SUCCESS; } +/* + * Returns the start pfn of the last page block in a zone. This is the starting + * point for full compaction of a zone. Compaction searches for free pages from + * the end of each zone, while isolate_freepages_block scans forward inside each + * page block. + */ +static unsigned long start_free_pfn(struct zone *zone) +{ + unsigned long free_pfn; + free_pfn = zone->zone_start_pfn + zone->spanned_pages; + free_pfn &= ~(pageblock_nr_pages-1); + return free_pfn; +} + static int compact_finished(struct zone *zone, struct compact_control *cc) { @@ -565,8 +593,26 @@ static int compact_finished(struct zone *zone, if (fatal_signal_pending(current)) return COMPACT_PARTIAL; - /* Compaction run completes if the migrate and free scanner meet */ - if (cc->free_pfn <= cc->migrate_pfn) + /* + * A full (order == -1) compaction run starts at the beginning and + * end of a zone; it completes when the migrate and free scanner meet. + * A partial (order > 0) compaction can start with the free scanner + * at a random point in the zone, and may have to restart. + */ + if (cc->free_pfn <= cc->migrate_pfn) { + if (cc->order > 0 && !cc->wrapped) { + /* We started partway through; restart at the end. */ + unsigned long free_pfn = start_free_pfn(zone); + zone->compact_cached_free_pfn = free_pfn; + cc->free_pfn = free_pfn; + cc->wrapped = 1; + return COMPACT_CONTINUE; + } + return COMPACT_COMPLETE; + } + + /* We wrapped around and ended up where we started. 
*/ + if (cc->wrapped && cc->free_pfn <= cc->start_free_pfn) return COMPACT_COMPLETE; /* @@ -664,8 +710,15 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; - cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; - cc->free_pfn &= ~(pageblock_nr_pages-1); + + if (cc->order > 0) { + /* Incremental compaction. Start where the last one stopped. */ + cc->free_pfn = zone->compact_cached_free_pfn; + cc->start_free_pfn = cc->free_pfn; + } else { + /* Order == -1 starts at the end of the zone. */ + cc->free_pfn = start_free_pfn(zone); + } migrate_prep_local(); diff --git a/mm/internal.h b/mm/internal.h index 2ba87fb..da6b9b2 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -118,8 +118,14 @@ struct compact_control { unsigned long nr_freepages; /* Number of isolated free pages */ unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ + unsigned long start_free_pfn; /* where we started the search */ unsigned long migrate_pfn; /* isolate_migratepages search base */ bool sync; /* Synchronous migration */ + bool wrapped; /* Order > 0 compactions are + incremental, once free_pfn + and migrate_pfn meet, we restart + from the top of the zone; + remember we wrapped around. */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fba2a12..94fc475 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4397,6 +4397,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->spanned_pages = size; zone->present_pages = realsize; +#if defined CONFIG_COMPACTION || defined CONFIG_CMA + zone->compact_cached_free_pfn = zone->zone_start_pfn + + zone->spanned_pages; + zone->compact_cached_free_pfn &= ~(pageblock_nr_pages-1); +#endif #ifdef CONFIG_NUMA zone->node = nid; zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio) -- cgit v0.10.2 From fe03025db3f4ade1f231b174938e0fe224722759 Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 31 Jul 2012 16:43:14 -0700 Subject: mm: CONFIG_HAVE_MEMBLOCK_NODE -> CONFIG_HAVE_MEMBLOCK_NODE_MAP 0ee332c14518699 ("memblock: Kill early_node_map[]") wanted to replace CONFIG_ARCH_POPULATES_NODE_MAP with CONFIG_HAVE_MEMBLOCK_NODE_MAP but ended up replacing one occurence with a reference to the non-existent symbol CONFIG_HAVE_MEMBLOCK_NODE. The resulting omission of code would probably have been causing problems to 32-bit machines with memory hotplug. Signed-off-by: Rabin Vincent Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1aeadce..f64afa5 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -776,7 +776,7 @@ extern int movable_zone; static inline int zone_movable_is_highmem(void) { -#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE) +#if defined(CONFIG_HIGHMEM) && defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) return movable_zone == ZONE_HIGHMEM; #else return 0; -- cgit v0.10.2 From 8e125cd85517c9716695b0abfabc0a4a3fcb94f3 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:43:16 -0700 Subject: vmscan: remove obsolete shrink_control comment 09f363c7 ("vmscan: fix shrinker callback bug in fs/super.c") fixed a shrinker callback which was returning -1 when nr_to_scan is zero, which caused excessive slab scanning. 
But 635697c6 ("vmscan: fix initial shrinker size handling") fixed the problem again, so we can freely return -1 although nr_to_scan is zero. So let's revert 09f363c7 because the comment added in 09f363c7 made an unnecessary rule. Signed-off-by: Minchan Kim Cc: Al Viro Cc: Mikulas Patocka Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/super.c b/fs/super.c index 4c5d82f..4bf7144 100644 --- a/fs/super.c +++ b/fs/super.c @@ -62,7 +62,7 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc) return -1; if (!grab_super_passive(sb)) - return !sc->nr_to_scan ? 0 : -1; + return -1; if (sb->s_op && sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb); diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h index 07ceb97..ac6b8ee 100644 --- a/include/linux/shrinker.h +++ b/include/linux/shrinker.h @@ -20,7 +20,6 @@ struct shrink_control { * 'nr_to_scan' entries and attempt to free them up. It should return * the number of objects which remain in the cache. If it returns -1, it means * it cannot do any scanning at this time (eg. there is a risk of deadlock). - * The callback must not return -1 if nr_to_scan is zero. * * The 'gfpmask' refers to the allocation we are currently trying to * fulfil. -- cgit v0.10.2 From 51a07e50b230d14e1b8bef50d66655d003fa006c Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 31 Jul 2012 16:43:18 -0700 Subject: mm/memory.c:print_vma_addr(): call up_read(&mm->mmap_sem) directly Call up_read(&mm->mmap_sem) directly since we have already got mm via current->mm at the beginning of print_vma_addr(). Signed-off-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index 59e5beb..ec72a61 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3941,7 +3941,7 @@ void print_vma_addr(char *prefix, unsigned long ip) free_page((unsigned long)buf); } } - up_read(&current->mm->mmap_sem); + up_read(&mm->mmap_sem); } #ifdef CONFIG_PROVE_LOCKING -- cgit v0.10.2 From ca57df79d4f64e1a4886606af4289d40636189c5 Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 31 Jul 2012 16:43:19 -0700 Subject: mm: setup pageblock_order before it's used by sparsemem On architectures with CONFIG_HUGETLB_PAGE_SIZE_VARIABLE set, such as Itanium, pageblock_order is a variable with default value of 0. It's set to the right value by set_pageblock_order() in function free_area_init_core(). But pageblock_order may be used by sparse_init() before free_area_init_core() is called along path: sparse_init() ->sparse_early_usemaps_alloc_node() ->usemap_size() ->SECTION_BLOCKFLAGS_BITS ->((1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS) The uninitialized pageblock_order will cause memory to be wasted because usemap_size() returns a much bigger value than is really needed. For example, on an Itanium platform, sparse_init() pageblock_order=0 usemap_size=24576 free_area_init_core() before pageblock_order=0, usemap_size=24576 free_area_init_core() after pageblock_order=12, usemap_size=8 That means 24K of memory has been wasted for each section, so fix it by calling set_pageblock_order() from sparse_init().
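The numbers quoted above can be reproduced with a short stand-alone calculation. The formula comes from the SECTION_BLOCKFLAGS_BITS expression in the call chain quoted above; the constants PFN_SECTION_SHIFT = 16 and NR_PAGEBLOCK_BITS = 3 are assumptions about the reporter's Itanium configuration of that era, so treat the sketch as illustrative rather than exact for every platform.

#include <stdio.h>

/* Assumed values for the reporting Itanium config; treat as illustrative. */
#define PFN_SECTION_SHIFT	16
#define NR_PAGEBLOCK_BITS	3

static unsigned long roundup_ul(unsigned long x, unsigned long to)
{
	return ((x + to - 1) / to) * to;
}

/* Mirrors usemap_size(): bits for one section, rounded up to whole longs. */
static unsigned long usemap_bytes(unsigned int pageblock_order)
{
	unsigned long bits, bytes;

	bits = (1UL << (PFN_SECTION_SHIFT - pageblock_order)) * NR_PAGEBLOCK_BITS;
	bytes = roundup_ul(bits, 8) / 8;
	return roundup_ul(bytes, sizeof(unsigned long));
}

int main(void)
{
	printf("pageblock_order=0  -> %lu bytes per section\n", usemap_bytes(0));
	printf("pageblock_order=12 -> %lu bytes per section\n", usemap_bytes(12));
	return 0;
}

With those assumed constants the program prints 24576 bytes per section for pageblock_order=0 and 8 bytes for pageblock_order=12, matching the figures in the report.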
Signed-off-by: Xishi Qiu Signed-off-by: Jiang Liu Cc: Tony Luck Cc: Yinghai Lu Cc: KAMEZAWA Hiroyuki Cc: Benjamin Herrenschmidt Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index da6b9b2..3314f79 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -353,3 +353,5 @@ extern u32 hwpoison_filter_enable; extern unsigned long vm_mmap_pgoff(struct file *, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); + +extern void set_pageblock_order(void); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 94fc475..6c7e3bd 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4304,7 +4304,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -static inline void __init set_pageblock_order(void) +void __init set_pageblock_order(void) { unsigned int order; @@ -4332,7 +4332,7 @@ static inline void __init set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -static inline void set_pageblock_order(void) +void __init set_pageblock_order(void) { } diff --git a/mm/sparse.c b/mm/sparse.c index c7bb952..950981f 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -493,6 +493,9 @@ void __init sparse_init(void) struct page **map_map; #endif + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ + set_pageblock_order(); + /* * map is using big page (aka 2M in x86 64 bit) * usemap is less one page (aka 24 bytes) -- cgit v0.10.2 From 05a73ed29a3aef4cf8d095ec5b08afecb959fa24 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:43:21 -0700 Subject: mm/memcg: complete documentation for tcp memcg files Signed-off-by: Wanpeng Li Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 672676a..4372e6b 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -73,6 +73,8 @@ Brief summary of control files. memory.kmem.tcp.limit_in_bytes # set/show hard limit for tcp buf memory memory.kmem.tcp.usage_in_bytes # show current tcp buf memory allocation + memory.kmem.tcp.failcnt # show the number of tcp buf memory usage hits limits + memory.kmem.tcp.max_usage_in_bytes # show max tcp buf memory usage recorded 1. History -- cgit v0.10.2 From aaad153e3408a4b8784de4c8446a40e70d57481f Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:43:23 -0700 Subject: mm/memcg: mem_cgroup_relize_xxx_limit can guarantee memcg->res.limit <= memcg->memsw.limit Signed-off-by: Wanpeng Li Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b11fb2f..ac87e79 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3416,7 +3416,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, /* * Rather than hide all in some function, I do this in * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit < memcg->memsw.limit. + * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 
*/ mutex_lock(&set_limit_mutex); memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); @@ -3477,7 +3477,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, /* * Rather than hide all in some function, I do this in * open coded manner. You see what this really does. - * We have to guarantee memcg->res.limit < memcg->memsw.limit. + * We have to guarantee memcg->res.limit <= memcg->memsw.limit. */ mutex_lock(&set_limit_mutex); memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); -- cgit v0.10.2 From da92c47d069890106484cb6605df701a54d24499 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:43:26 -0700 Subject: mm/memcg: replace inexistence move_lock_page_cgroup() by move_lock_mem_cgroup() in comment Signed-off-by: Wanpeng Li Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac87e79..4f73c82 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1898,7 +1898,7 @@ again: return; /* * If this memory cgroup is not under account moving, we don't - * need to take move_lock_page_cgroup(). Because we already hold + * need to take move_lock_mem_cgroup(). Because we already hold * rcu_read_lock(), any calls to move_account will be delayed until * rcu_read_unlock() if mem_cgroup_stolen() == true. */ @@ -1920,7 +1920,7 @@ void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) /* * It's guaranteed that pc->mem_cgroup never changes while * lock is held because a routine modifies pc->mem_cgroup - * should take move_lock_page_cgroup(). + * should take move_lock_mem_cgroup(). */ move_unlock_mem_cgroup(pc->mem_cgroup, flags); } -- cgit v0.10.2 From 9adb62a5df9c0fbef7b4665919329f73a34651ed Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 31 Jul 2012 16:43:28 -0700 Subject: mm/hotplug: correctly setup fallback zonelists when creating new pgdat When hotadd_new_pgdat() is called to create new pgdat for a new node, a fallback zonelist should be created for the new node. There's code to try to achieve that in hotadd_new_pgdat() as below: /* * The node we allocated has no zone fallback lists. For avoiding * to access not-initialized zonelist, build here. */ mutex_lock(&zonelists_mutex); build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); But it doesn't work as expected. When hotadd_new_pgdat() is called, the new node is still in offline state because node_set_online(nid) hasn't been called yet. And build_all_zonelists() only builds zonelists for online nodes as: for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); build_zonelists(pgdat); build_zonelist_cache(pgdat); } Though we hope to create zonelist for the new pgdat, but it doesn't. So add a new parameter "pgdat" the build_all_zonelists() to build pgdat for the new pgdat too. 
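The ordering problem is easy to see in a stripped-down model: the builder walks only nodes already marked online, and the new node is set online only after hotadd_new_pgdat() returns. Everything below is an illustrative toy, not kernel code; the actual fix is simply the extra pgdat ("self") argument visible in the diff that follows.

#include <stdbool.h>
#include <stdio.h>

#define MAX_NODES 4

/* Toy stand-ins for per-node online state and per-node zonelists. */
static bool node_is_online[MAX_NODES];
static bool has_zonelist[MAX_NODES];

static void build_zonelists_for(int nid)
{
	has_zonelist[nid] = true;
}

/*
 * Toy build_all_zonelists(): like the kernel loop, it only walks nodes
 * that are already online, so it must be told explicitly about a pgdat
 * that is being created but is not yet online ("self").
 */
static void build_all_zonelists_toy(int self)
{
	if (self >= 0 && !node_is_online[self])
		build_zonelists_for(self);	/* the fix */

	for (int nid = 0; nid < MAX_NODES; nid++)
		if (node_is_online[nid])
			build_zonelists_for(nid);
}

int main(void)
{
	int new_node = 2;

	node_is_online[0] = true;		/* boot node */

	/* Old behaviour: no self passed in, new node still offline. */
	build_all_zonelists_toy(-1);
	printf("without self: node %d has zonelist? %d\n",
	       new_node, has_zonelist[new_node]);

	/* Fixed behaviour: the hotadd path passes the new pgdat explicitly. */
	build_all_zonelists_toy(new_node);
	printf("with self:    node %d has zonelist? %d\n",
	       new_node, has_zonelist[new_node]);

	node_is_online[new_node] = true;	/* node_set_online() happens later */
	return 0;
}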
Signed-off-by: Jiang Liu Signed-off-by: Xishi Qiu Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Rusty Russell Cc: Yinghai Lu Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index f64afa5..98f079b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -721,7 +721,7 @@ typedef struct pglist_data { #include extern struct mutex zonelists_mutex; -void build_all_zonelists(void *data); +void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); diff --git a/init/main.c b/init/main.c index 95316a1..e60679d 100644 --- a/init/main.c +++ b/init/main.c @@ -506,7 +506,7 @@ asmlinkage void __init start_kernel(void) setup_per_cpu_areas(); smp_prepare_boot_cpu(); /* arch-specific boot-cpu hooks */ - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); page_alloc_init(); printk(KERN_NOTICE "Kernel command line: %s\n", boot_command_line); diff --git a/kernel/cpu.c b/kernel/cpu.c index a4eb522..14d3258 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -416,7 +416,7 @@ int __cpuinit cpu_up(unsigned int cpu) if (pgdat->node_zonelists->_zonerefs->zone == NULL) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } #endif diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 427bb29..b873104 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -513,7 +513,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; if (need_zonelists_rebuild) - build_all_zonelists(zone); + build_all_zonelists(NULL, zone); else zone_pcp_update(zone); @@ -562,7 +562,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) * to access not-initialized zonelist, build here. */ mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(pgdat, NULL); mutex_unlock(&zonelists_mutex); return pgdat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6c7e3bd..9ad6866 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3032,7 +3032,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { mutex_lock(&zonelists_mutex); - build_all_zonelists(NULL); + build_all_zonelists(NULL, NULL); mutex_unlock(&zonelists_mutex); } } @@ -3415,10 +3415,17 @@ static __init_refok int __build_all_zonelists(void *data) { int nid; int cpu; + pg_data_t *self = data; #ifdef CONFIG_NUMA memset(node_load, 0, sizeof(node_load)); #endif + + if (self && !node_online(self->node_id)) { + build_zonelists(self); + build_zonelist_cache(self); + } + for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -3463,7 +3470,7 @@ static __init_refok int __build_all_zonelists(void *data) * Called with zonelists_mutex held always * unless system_state == SYSTEM_BOOTING. 
*/ -void __ref build_all_zonelists(void *data) +void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) { set_zonelist_order(); @@ -3475,10 +3482,10 @@ void __ref build_all_zonelists(void *data) /* we have to stop all cpus to guarantee there is no user of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG - if (data) - setup_zone_pageset((struct zone *)data); + if (zone) + setup_zone_pageset(zone); #endif - stop_machine(__build_all_zonelists, NULL, NULL); + stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } vm_total_pages = nr_free_pagecache_pages(); -- cgit v0.10.2 From 08dff7b7d629807dbb1f398c68dd9cd58dd657a1 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 31 Jul 2012 16:43:30 -0700 Subject: mm/hotplug: correctly add new zone to all other nodes' zone lists When online_pages() is called to add new memory to an empty zone, it rebuilds all zone lists by calling build_all_zonelists(). But there's a bug which prevents the new zone to be added to other nodes' zone lists. online_pages() { build_all_zonelists() ..... node_set_state(zone_to_nid(zone), N_HIGH_MEMORY) } Here the node of the zone is put into N_HIGH_MEMORY state after calling build_all_zonelists(), but build_all_zonelists() only adds zones from nodes in N_HIGH_MEMORY state to the fallback zone lists. build_all_zonelists() ->__build_all_zonelists() ->build_zonelists() ->find_next_best_node() ->for_each_node_state(n, N_HIGH_MEMORY) So memory in the new zone will never be used by other nodes, and it may cause strange behavor when system is under memory pressure. So put node into N_HIGH_MEMORY state before calling build_all_zonelists(). Signed-off-by: Jianguo Wu Signed-off-by: Jiang Liu Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Rusty Russell Cc: Yinghai Lu Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index b873104..597d371 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -512,19 +512,20 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; - if (need_zonelists_rebuild) - build_all_zonelists(NULL, zone); - else - zone_pcp_update(zone); + if (onlined_pages) { + node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); + if (need_zonelists_rebuild) + build_all_zonelists(NULL, zone); + else + zone_pcp_update(zone); + } mutex_unlock(&zonelists_mutex); init_per_zone_wmark_min(); - if (onlined_pages) { + if (onlined_pages) kswapd_run(zone_to_nid(zone)); - node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); - } vm_total_pages = nr_free_pagecache_pages(); -- cgit v0.10.2 From 340175b7d14d5617559d0c1a54fa0ea204d9edcd Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 31 Jul 2012 16:43:32 -0700 Subject: mm/hotplug: free zone->pageset when a zone becomes empty When a zone becomes empty after memory offlining, free zone->pageset. Otherwise it will cause memory leak when adding memory to the empty zone again because build_all_zonelists() will allocate zone->pageset for an empty zone. 
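Viewed as a pairing problem, the bug is that onlining an empty zone allocates a per-cpu pageset while offlining the zone back to empty never freed one, so every online/offline cycle leaves one allocation behind. The toy accounting model below only illustrates that pairing under this assumption; the real fix is the zone_pcp_reset() introduced in the diff that follows.

#include <stdio.h>

/* Toy model: count per-cpu pageset allocations for one zone. */
static int pagesets_outstanding;
static int zone_pages;

static void online_pages_toy(int nr)
{
	if (zone_pages == 0)
		pagesets_outstanding++;	/* build_all_zonelists() allocates */
	zone_pages += nr;
}

static void offline_pages_toy(int nr, int do_reset)
{
	zone_pages -= nr;
	if (zone_pages == 0 && do_reset)
		pagesets_outstanding--;	/* zone_pcp_reset() frees */
}

int main(void)
{
	/* Without the fix, each cycle allocates but never frees. */
	for (int i = 0; i < 3; i++) {
		online_pages_toy(128);
		offline_pages_toy(128, 0);
	}
	printf("pagesets allocated but never freed (no reset): %d\n",
	       pagesets_outstanding);

	pagesets_outstanding = 0;
	for (int i = 0; i < 3; i++) {
		online_pages_toy(128);
		offline_pages_toy(128, 1);
	}
	printf("pagesets allocated but never freed (with reset): %d\n",
	       pagesets_outstanding);
	return 0;
}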
Signed-off-by: Jiang Liu Signed-off-by: Wei Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Rusty Russell Cc: Yinghai Lu Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 3955bed..7c6dfd2 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1331,6 +1331,7 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...); extern void setup_per_cpu_pageset(void); extern void zone_pcp_update(struct zone *zone); +extern void zone_pcp_reset(struct zone *zone); /* nommu.c */ extern atomic_long_t mmap_pages_allocated; diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 597d371..3ad25f9 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -966,6 +966,9 @@ repeat: init_per_zone_wmark_min(); + if (!populated_zone(zone)) + zone_pcp_reset(zone); + if (!node_present_pages(node)) { node_clear_state(node, N_HIGH_MEMORY); kswapd_stop(node); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9ad6866..9c9a316 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5894,6 +5894,19 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) #endif #ifdef CONFIG_MEMORY_HOTREMOVE +void zone_pcp_reset(struct zone *zone) +{ + unsigned long flags; + + /* avoid races with drain_pages() */ + local_irq_save(flags); + if (zone->pageset != &boot_pageset) { + free_percpu(zone->pageset); + zone->pageset = &boot_pageset; + } + local_irq_restore(flags); +} + /* * All pages in the range must be isolated before calling this. */ -- cgit v0.10.2 From 4ed7e02222aba062bd0ed3ab12dfc8e9fc0467b5 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Tue, 31 Jul 2012 16:43:35 -0700 Subject: mm/hotplug: mark memory hotplug code in page_alloc.c as __meminit Mark functions used by both boot and memory hotplug as __meminit to reduce memory footprint when memory hotplug is disabled. Alos guard zone_pcp_update() with CONFIG_MEMORY_HOTPLUG because it's only used by memory hotplug code. 
Signed-off-by: Jiang Liu Cc: Wei Wang Cc: Mel Gorman Cc: Michal Hocko Cc: Minchan Kim Cc: Rusty Russell Cc: Yinghai Lu Cc: Tony Luck Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: David Rientjes Cc: Keping Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9c9a316..667338e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3411,7 +3411,7 @@ static void setup_zone_pageset(struct zone *zone); DEFINE_MUTEX(zonelists_mutex); /* return values int ....just for stop_machine() */ -static __init_refok int __build_all_zonelists(void *data) +static int __build_all_zonelists(void *data) { int nid; int cpu; @@ -3755,7 +3755,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY) #endif -static int zone_batchsize(struct zone *zone) +static int __meminit zone_batchsize(struct zone *zone) { #ifdef CONFIG_MMU int batch; @@ -3837,7 +3837,7 @@ static void setup_pagelist_highmark(struct per_cpu_pageset *p, pcp->batch = PAGE_SHIFT * 8; } -static void setup_zone_pageset(struct zone *zone) +static void __meminit setup_zone_pageset(struct zone *zone) { int cpu; @@ -3910,33 +3910,6 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) return 0; } -static int __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - -void zone_pcp_update(struct zone *zone) -{ - stop_machine(__zone_pcp_update, zone, NULL); -} - static __meminit void zone_pcp_init(struct zone *zone) { /* @@ -3952,7 +3925,7 @@ static __meminit void zone_pcp_init(struct zone *zone) zone_batchsize(zone)); } -__meminit int init_currently_empty_zone(struct zone *zone, +int __meminit init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size, enum memmap_context context) @@ -4765,7 +4738,7 @@ out: } /* Any regular memory on that node ? 
*/ -static void check_for_regular_memory(pg_data_t *pgdat) +static void __init check_for_regular_memory(pg_data_t *pgdat) { #ifdef CONFIG_HIGHMEM enum zone_type zone_type; @@ -5893,6 +5866,35 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) } #endif +#ifdef CONFIG_MEMORY_HOTPLUG +static int __meminit __zone_pcp_update(void *data) +{ + struct zone *zone = data; + int cpu; + unsigned long batch = zone_batchsize(zone), flags; + + for_each_possible_cpu(cpu) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + pset = per_cpu_ptr(zone->pageset, cpu); + pcp = &pset->pcp; + + local_irq_save(flags); + if (pcp->count > 0) + free_pcppages_bulk(zone, pcp->count, pcp); + setup_pageset(pset, batch); + local_irq_restore(flags); + } + return 0; +} + +void __meminit zone_pcp_update(struct zone *zone) +{ + stop_machine(__zone_pcp_update, zone, NULL); +} +#endif + #ifdef CONFIG_MEMORY_HOTREMOVE void zone_pcp_reset(struct zone *zone) { -- cgit v0.10.2 From 62ce1c706f817cb9defef3ac2dfdd815149f2968 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:43:39 -0700 Subject: mm, oom: move declaration for mem_cgroup_out_of_memory to oom.h mem_cgroup_out_of_memory() is defined in mm/oom_kill.c, so declare it in linux/oom.h rather than linux/memcontrol.h. Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Acked-by: Michal Hocko Signed-off-by: David Rientjes Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1700762..c0bff89 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -72,8 +72,6 @@ extern void mem_cgroup_uncharge_end(void); extern void mem_cgroup_uncharge_page(struct page *page); extern void mem_cgroup_uncharge_cache_page(struct page *page); -extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order); bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, struct mem_cgroup *memcg); int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg); diff --git a/include/linux/oom.h b/include/linux/oom.h index e4c29bc..eb9dc14 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -49,6 +49,8 @@ extern unsigned long oom_badness(struct task_struct *p, extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order); extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); -- cgit v0.10.2 From 462607ecc519b197f7b5cc6b024a1c26fa6fc0ac Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:43:40 -0700 Subject: mm, oom: introduce helper function to process threads during scan This patch introduces a helper function to process each thread during the iteration over the tasklist. A new return type, enum oom_scan_t, is defined to determine the future behavior of the iteration: - OOM_SCAN_OK: continue scanning the thread and find its badness, - OOM_SCAN_CONTINUE: do not consider this thread for oom kill, it's ineligible, - OOM_SCAN_ABORT: abort the iteration and return, or - OOM_SCAN_SELECT: always select this thread with the highest badness possible. There is no functional change with this patch. 
This new helper function will be used in the next patch in the memory controller. Reviewed-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Reviewed-by: Michal Hocko Signed-off-by: David Rientjes Cc: Oleg Nesterov Reviewed-by: Sha Zhengju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e6c1064..f8eba96 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -288,6 +288,59 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, } #endif +enum oom_scan_t { + OOM_SCAN_OK, /* scan thread and find its badness */ + OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */ + OOM_SCAN_ABORT, /* abort the iteration and return */ + OOM_SCAN_SELECT, /* always select this thread first */ +}; + +static enum oom_scan_t oom_scan_process_thread(struct task_struct *task, + struct mem_cgroup *memcg, unsigned long totalpages, + const nodemask_t *nodemask, bool force_kill) +{ + if (task->exit_state) + return OOM_SCAN_CONTINUE; + if (oom_unkillable_task(task, memcg, nodemask)) + return OOM_SCAN_CONTINUE; + + /* + * This task already has access to memory reserves and is being killed. + * Don't allow any other task to have access to the reserves. + */ + if (test_tsk_thread_flag(task, TIF_MEMDIE)) { + if (unlikely(frozen(task))) + __thaw_task(task); + if (!force_kill) + return OOM_SCAN_ABORT; + } + if (!task->mm) + return OOM_SCAN_CONTINUE; + + if (task->flags & PF_EXITING) { + /* + * If task is current and is in the process of releasing memory, + * allow the "kill" to set TIF_MEMDIE, which will allow it to + * access memory reserves. Otherwise, it may stall forever. + * + * The iteration isn't broken here, however, in case other + * threads are found to have already been oom killed. + */ + if (task == current) + return OOM_SCAN_SELECT; + else if (!force_kill) { + /* + * If this task is not being ptraced on exit, then wait + * for it to finish before killing some other task + * unnecessarily. + */ + if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) + return OOM_SCAN_ABORT; + } + } + return OOM_SCAN_OK; +} + /* * Simple selection loop. We chose the process with the highest * number of 'points'. We expect the caller will lock the tasklist. @@ -305,53 +358,19 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, do_each_thread(g, p) { unsigned int points; - if (p->exit_state) - continue; - if (oom_unkillable_task(p, memcg, nodemask)) - continue; - - /* - * This task already has access to memory reserves and is - * being killed. Don't allow any other task access to the - * memory reserve. - * - * Note: this may have a chance of deadlock if it gets - * blocked waiting for another task which itself is waiting - * for memory. Is there a better alternative? - */ - if (test_tsk_thread_flag(p, TIF_MEMDIE)) { - if (unlikely(frozen(p))) - __thaw_task(p); - if (!force_kill) - return ERR_PTR(-1UL); - } - if (!p->mm) + switch (oom_scan_process_thread(p, memcg, totalpages, nodemask, + force_kill)) { + case OOM_SCAN_SELECT: + chosen = p; + chosen_points = ULONG_MAX; + /* fall through */ + case OOM_SCAN_CONTINUE: continue; - - if (p->flags & PF_EXITING) { - /* - * If p is the current task and is in the process of - * releasing memory, we allow the "kill" to set - * TIF_MEMDIE, which will allow it to gain access to - * memory reserves. Otherwise, it may stall forever. - * - * The loop isn't broken here, however, in case other - * threads are found to have already been oom killed. 
- */ - if (p == current) { - chosen = p; - chosen_points = ULONG_MAX; - } else if (!force_kill) { - /* - * If this task is not being ptraced on exit, - * then wait for it to finish before killing - * some other task unnecessarily. - */ - if (!(p->group_leader->ptrace & PT_TRACE_EXIT)) - return ERR_PTR(-1UL); - } - } - + case OOM_SCAN_ABORT: + return ERR_PTR(-1UL); + case OOM_SCAN_OK: + break; + }; points = oom_badness(p, memcg, nodemask, totalpages); if (points > chosen_points) { chosen = p; -- cgit v0.10.2 From 9cbb78bb314360a860a8b23723971cb6fcb54176 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:43:44 -0700 Subject: mm, memcg: introduce own oom handler to iterate only over its own threads The global oom killer is serialized by the per-zonelist try_set_zonelist_oom() which is used in the page allocator. Concurrent oom kills are thus a rare event and only occur in systems using mempolicies and with a large number of nodes. Memory controller oom kills, however, can frequently be concurrent since there is no serialization once the oom killer is called for oom conditions in several different memcgs in parallel. This creates a massive contention on tasklist_lock since the oom killer requires the readside for the tasklist iteration. If several memcgs are calling the oom killer, this lock can be held for a substantial amount of time, especially if threads continue to enter it as other threads are exiting. Since the exit path grabs the writeside of the lock with irqs disabled in a few different places, this can cause a soft lockup on cpus as a result of tasklist_lock starvation. The kernel lacks unfair writelocks, and successful calls to the oom killer usually result in at least one thread entering the exit path, so an alternative solution is needed. This patch introduces a seperate oom handler for memcgs so that they do not require tasklist_lock for as much time. Instead, it iterates only over the threads attached to the oom memcg and grabs a reference to the selected thread before calling oom_kill_process() to ensure it doesn't prematurely exit. This still requires tasklist_lock for the tasklist dump, iterating children of the selected process, and killing all other threads on the system sharing the same memory as the selected victim. So while this isn't a complete solution to tasklist_lock starvation, it significantly reduces the amount of time that it is held. 
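In outline, the memcg handler introduced here walks only the tasks attached to the oom memcg's hierarchy and pins the current candidate with a task reference instead of holding tasklist_lock across the scan. The sketch below is condensed from the __mem_cgroup_out_of_memory() hunk in the diff and omits the full switch on oom_scan_process_thread():

	for_each_mem_cgroup_tree(iter, memcg) {
		struct cgroup *cgroup = iter->css.cgroup;
		struct cgroup_iter it;
		struct task_struct *task;

		cgroup_iter_start(cgroup, &it);
		while ((task = cgroup_iter_next(cgroup, &it))) {
			points = oom_badness(task, memcg, NULL, totalpages);
			if (points > chosen_points) {
				if (chosen)
					put_task_struct(chosen);
				chosen = task;
				chosen_points = points;
				get_task_struct(chosen);
			}
		}
		cgroup_iter_end(cgroup, &it);
	}

tasklist_lock is then taken only around dumping and killing the chosen task, not around the whole selection loop.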
Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: David Rientjes Cc: Oleg Nesterov Cc: KOSAKI Motohiro Reviewed-by: Sha Zhengju Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index c0bff89..2a80544 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -180,7 +180,8 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, unsigned long *total_scanned); -u64 mem_cgroup_get_limit(struct mem_cgroup *memcg); +extern void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -364,12 +365,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, return 0; } -static inline -u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) -{ - return 0; -} - static inline void mem_cgroup_split_huge_fixup(struct page *head) { } diff --git a/include/linux/oom.h b/include/linux/oom.h index eb9dc14..5dc0e38 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -40,17 +40,33 @@ enum oom_constraint { CONSTRAINT_MEMCG, }; +enum oom_scan_t { + OOM_SCAN_OK, /* scan thread and find its badness */ + OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */ + OOM_SCAN_ABORT, /* abort the iteration and return */ + OOM_SCAN_SELECT, /* always select this thread first */ +}; + extern void compare_swap_oom_score_adj(int old_val, int new_val); extern int test_set_oom_score_adj(int new_val); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, unsigned long totalpages); +extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned int points, unsigned long totalpages, + struct mem_cgroup *memcg, nodemask_t *nodemask, + const char *message); + extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, + unsigned long totalpages, const nodemask_t *nodemask, + bool force_kill); extern void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order); + extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order, nodemask_t *mask, bool force_kill); extern int register_oom_notifier(struct notifier_block *nb); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4f73c82..b78972e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1453,7 +1453,7 @@ static int mem_cgroup_count_children(struct mem_cgroup *memcg) /* * Return the memory (and swap, if configured) limit for a memcg. */ -u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) +static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) { u64 limit; u64 memsw; @@ -1469,6 +1469,65 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return min(limit, memsw); } +void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order) +{ + struct mem_cgroup *iter; + unsigned long chosen_points = 0; + unsigned long totalpages; + unsigned int points = 0; + struct task_struct *chosen = NULL; + + totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? 
: 1; + for_each_mem_cgroup_tree(iter, memcg) { + struct cgroup *cgroup = iter->css.cgroup; + struct cgroup_iter it; + struct task_struct *task; + + cgroup_iter_start(cgroup, &it); + while ((task = cgroup_iter_next(cgroup, &it))) { + switch (oom_scan_process_thread(task, totalpages, NULL, + false)) { + case OOM_SCAN_SELECT: + if (chosen) + put_task_struct(chosen); + chosen = task; + chosen_points = ULONG_MAX; + get_task_struct(chosen); + /* fall through */ + case OOM_SCAN_CONTINUE: + continue; + case OOM_SCAN_ABORT: + cgroup_iter_end(cgroup, &it); + mem_cgroup_iter_break(memcg, iter); + if (chosen) + put_task_struct(chosen); + return; + case OOM_SCAN_OK: + break; + }; + points = oom_badness(task, memcg, NULL, totalpages); + if (points > chosen_points) { + if (chosen) + put_task_struct(chosen); + chosen = task; + chosen_points = points; + get_task_struct(chosen); + } + } + cgroup_iter_end(cgroup, &it); + } + + if (!chosen) + return; + points = chosen_points * 1000 / totalpages; + read_lock(&tasklist_lock); + oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, + NULL, "Memory cgroup out of memory"); + read_unlock(&tasklist_lock); + put_task_struct(chosen); +} + static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned long flags) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index f8eba96..c0c97ae 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -288,20 +288,13 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist, } #endif -enum oom_scan_t { - OOM_SCAN_OK, /* scan thread and find its badness */ - OOM_SCAN_CONTINUE, /* do not consider thread for oom kill */ - OOM_SCAN_ABORT, /* abort the iteration and return */ - OOM_SCAN_SELECT, /* always select this thread first */ -}; - -static enum oom_scan_t oom_scan_process_thread(struct task_struct *task, - struct mem_cgroup *memcg, unsigned long totalpages, - const nodemask_t *nodemask, bool force_kill) +enum oom_scan_t oom_scan_process_thread(struct task_struct *task, + unsigned long totalpages, const nodemask_t *nodemask, + bool force_kill) { if (task->exit_state) return OOM_SCAN_CONTINUE; - if (oom_unkillable_task(task, memcg, nodemask)) + if (oom_unkillable_task(task, NULL, nodemask)) return OOM_SCAN_CONTINUE; /* @@ -348,8 +341,8 @@ static enum oom_scan_t oom_scan_process_thread(struct task_struct *task, * (not docbooked, we don't want this one cluttering up the manual) */ static struct task_struct *select_bad_process(unsigned int *ppoints, - unsigned long totalpages, struct mem_cgroup *memcg, - const nodemask_t *nodemask, bool force_kill) + unsigned long totalpages, const nodemask_t *nodemask, + bool force_kill) { struct task_struct *g, *p; struct task_struct *chosen = NULL; @@ -358,7 +351,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, do_each_thread(g, p) { unsigned int points; - switch (oom_scan_process_thread(p, memcg, totalpages, nodemask, + switch (oom_scan_process_thread(p, totalpages, nodemask, force_kill)) { case OOM_SCAN_SELECT: chosen = p; @@ -371,7 +364,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, case OOM_SCAN_OK: break; }; - points = oom_badness(p, memcg, nodemask, totalpages); + points = oom_badness(p, NULL, nodemask, totalpages); if (points > chosen_points) { chosen = p; chosen_points = points; @@ -443,10 +436,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } #define K(x) ((x) << (PAGE_SHIFT-10)) -static void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, - 
unsigned int points, unsigned long totalpages, - struct mem_cgroup *memcg, nodemask_t *nodemask, - const char *message) +void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned int points, unsigned long totalpages, + struct mem_cgroup *memcg, nodemask_t *nodemask, + const char *message) { struct task_struct *victim = p; struct task_struct *child; @@ -564,10 +557,6 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, int order) { - unsigned long limit; - unsigned int points = 0; - struct task_struct *p; - /* * If current has a pending SIGKILL, then automatically select it. The * goal is to allow it to allocate so that it may quickly exit and free @@ -579,13 +568,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, } check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - limit = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; - read_lock(&tasklist_lock); - p = select_bad_process(&points, limit, memcg, NULL, false); - if (p && PTR_ERR(p) != -1UL) - oom_kill_process(p, gfp_mask, order, points, limit, memcg, NULL, - "Memory cgroup out of memory"); - read_unlock(&tasklist_lock); + __mem_cgroup_out_of_memory(memcg, gfp_mask, order); } #endif @@ -710,7 +693,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, struct task_struct *p; unsigned long totalpages; unsigned long freed = 0; - unsigned int points; + unsigned int uninitialized_var(points); enum oom_constraint constraint = CONSTRAINT_NONE; int killed = 0; @@ -748,8 +731,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, goto out; } - p = select_bad_process(&points, totalpages, NULL, mpol_mask, - force_kill); + p = select_bad_process(&points, totalpages, mpol_mask, force_kill); /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { dump_header(NULL, gfp_mask, order, NULL, mpol_mask); -- cgit v0.10.2 From 6b0c81b3be114a93f79bd4c5639ade5107d77c21 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:43:45 -0700 Subject: mm, oom: reduce dependency on tasklist_lock Since exiting tasks require write_lock_irq(&tasklist_lock) several times, try to reduce the amount of time the readside is held for oom kills. This makes the interface with the memcg oom handler more consistent since it now never needs to take tasklist_lock unnecessarily. The only time the oom killer now takes tasklist_lock is when iterating the children of the selected task, everything else is protected by rcu_read_lock(). This requires that a reference to the selected process, p, is grabbed before calling oom_kill_process(). It may release it and grab a reference on another one of p's threads if !p->mm, but it also guarantees that it will release the reference before returning. 
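The resulting reference contract, condensed from the hunks below (not a complete listing): select_bad_process() returns a real task with a reference already held, and oom_kill_process() drops whatever reference it ends up holding on every return path, so the global path becomes roughly:

	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
	/* a real task is returned with a task reference held */
	if (!p)
		panic("Out of memory and no killable processes...\n");
	if (PTR_ERR(p) != -1UL) {
		oom_kill_process(p, gfp_mask, order, points, totalpages,
				 NULL, nodemask, "Out of memory");
		/* the reference on p (or on the thread/child actually
		 * killed) is released inside oom_kill_process() */
	}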
[hughd@google.com: fix duplicate put_task_struct()] Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Reviewed-by: Michal Hocko Cc: Oleg Nesterov Cc: KOSAKI Motohiro Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b78972e..77a29ce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1521,11 +1521,8 @@ void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, if (!chosen) return; points = chosen_points * 1000 / totalpages; - read_lock(&tasklist_lock); oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, NULL, "Memory cgroup out of memory"); - read_unlock(&tasklist_lock); - put_task_struct(chosen); } static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c0c97ae..a3a32ae 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -336,7 +336,7 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, /* * Simple selection loop. We chose the process with the highest - * number of 'points'. We expect the caller will lock the tasklist. + * number of 'points'. * * (not docbooked, we don't want this one cluttering up the manual) */ @@ -348,6 +348,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, struct task_struct *chosen = NULL; unsigned long chosen_points = 0; + rcu_read_lock(); do_each_thread(g, p) { unsigned int points; @@ -360,6 +361,7 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, case OOM_SCAN_CONTINUE: continue; case OOM_SCAN_ABORT: + rcu_read_unlock(); return ERR_PTR(-1UL); case OOM_SCAN_OK: break; @@ -370,6 +372,9 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, chosen_points = points; } } while_each_thread(g, p); + if (chosen) + get_task_struct(chosen); + rcu_read_unlock(); *ppoints = chosen_points * 1000 / totalpages; return chosen; @@ -385,8 +390,6 @@ static struct task_struct *select_bad_process(unsigned int *ppoints, * are not shown. * State information includes task's pid, uid, tgid, vm size, rss, nr_ptes, * swapents, oom_score_adj value, and name. - * - * Call with tasklist_lock read-locked. */ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemask) { @@ -394,6 +397,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas struct task_struct *task; pr_info("[ pid ] uid tgid total_vm rss nr_ptes swapents oom_score_adj name\n"); + rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) continue; @@ -416,6 +420,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas task->signal->oom_score_adj, task->comm); task_unlock(task); } + rcu_read_unlock(); } static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, @@ -436,6 +441,10 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, } #define K(x) ((x) << (PAGE_SHIFT-10)) +/* + * Must be called while holding a reference to p, which will be released upon + * returning. + */ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, unsigned int points, unsigned long totalpages, struct mem_cgroup *memcg, nodemask_t *nodemask, @@ -455,6 +464,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, */ if (p->flags & PF_EXITING) { set_tsk_thread_flag(p, TIF_MEMDIE); + put_task_struct(p); return; } @@ -472,6 +482,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, * parent. 
This attempts to lose the minimal amount of work done while * still freeing memory. */ + read_lock(&tasklist_lock); do { list_for_each_entry(child, &t->children, sibling) { unsigned int child_points; @@ -484,15 +495,26 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, child_points = oom_badness(child, memcg, nodemask, totalpages); if (child_points > victim_points) { + put_task_struct(victim); victim = child; victim_points = child_points; + get_task_struct(victim); } } } while_each_thread(p, t); + read_unlock(&tasklist_lock); - victim = find_lock_task_mm(victim); - if (!victim) + rcu_read_lock(); + p = find_lock_task_mm(victim); + if (!p) { + rcu_read_unlock(); + put_task_struct(victim); return; + } else if (victim != p) { + get_task_struct(p); + put_task_struct(victim); + victim = p; + } /* mm cannot safely be dereferenced after task_unlock(victim) */ mm = victim->mm; @@ -523,9 +545,11 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, task_unlock(p); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } + rcu_read_unlock(); set_tsk_thread_flag(victim, TIF_MEMDIE); do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true); + put_task_struct(victim); } #undef K @@ -546,9 +570,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, if (constraint != CONSTRAINT_NONE) return; } - read_lock(&tasklist_lock); dump_header(NULL, gfp_mask, order, NULL, nodemask); - read_unlock(&tasklist_lock); panic("Out of memory: %s panic_on_oom is enabled\n", sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } @@ -721,10 +743,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL; check_panic_on_oom(constraint, gfp_mask, order, mpol_mask); - read_lock(&tasklist_lock); if (sysctl_oom_kill_allocating_task && current->mm && !oom_unkillable_task(current, NULL, nodemask) && current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + get_task_struct(current); oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL, nodemask, "Out of memory (oom_kill_allocating_task)"); @@ -735,7 +757,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { dump_header(NULL, gfp_mask, order, NULL, mpol_mask); - read_unlock(&tasklist_lock); panic("Out of memory and no killable processes...\n"); } if (PTR_ERR(p) != -1UL) { @@ -744,8 +765,6 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, killed = 1; } out: - read_unlock(&tasklist_lock); - /* * Give the killed threads a good chance of exiting before trying to * allocate memory again. -- cgit v0.10.2 From 876aafbfd9ba5bb352f1b14622c27f3fe9a99013 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 31 Jul 2012 16:43:48 -0700 Subject: mm, memcg: move all oom handling to memcontrol.c By globally defining check_panic_on_oom(), the memcg oom handler can be moved entirely to mm/memcontrol.c. This removes the ugly #ifdef in the oom killer and cleans up the code. 
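After the move, the entry point in mm/memcontrol.c takes the shape sketched below (condensed from the diff that follows), with the pending-SIGKILL shortcut and the panic_on_oom check folded in ahead of the memcg-local victim selection added earlier in the series:

	void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
				      int order)
	{
		/* a task that is already dying just gets TIF_MEMDIE */
		if (fatal_signal_pending(current)) {
			set_thread_flag(TIF_MEMDIE);
			return;
		}

		check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL);
		/* ... memcg-local scan and oom_kill_process() as before ... */
	}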
Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Oleg Nesterov Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2a80544..5a3ee64 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -180,8 +180,6 @@ static inline void mem_cgroup_dec_page_stat(struct page *page, unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask, unsigned long *total_scanned); -extern void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order); void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx); #ifdef CONFIG_TRANSPARENT_HUGEPAGE diff --git a/include/linux/oom.h b/include/linux/oom.h index 5dc0e38..49a3031 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -61,6 +61,9 @@ extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags); +extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, + int order, const nodemask_t *nodemask); + extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task, unsigned long totalpages, const nodemask_t *nodemask, bool force_kill); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 77a29ce..0f692a2d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1469,8 +1469,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return min(limit, memsw); } -void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order) +void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order) { struct mem_cgroup *iter; unsigned long chosen_points = 0; @@ -1478,6 +1478,17 @@ void __mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int points = 0; struct task_struct *chosen = NULL; + /* + * If current has a pending SIGKILL, then automatically select it. The + * goal is to allow it to allocate so that it may quickly exit and free + * its memory. + */ + if (fatal_signal_pending(current)) { + set_thread_flag(TIF_MEMDIE); + return; + } + + check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; for_each_mem_cgroup_tree(iter, memcg) { struct cgroup *cgroup = iter->css.cgroup; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a3a32ae..1986008 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -556,8 +556,8 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, - int order, const nodemask_t *nodemask) +void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, + int order, const nodemask_t *nodemask) { if (likely(!sysctl_panic_on_oom)) return; @@ -575,25 +575,6 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask, sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide"); } -#ifdef CONFIG_MEMCG -void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order) -{ - /* - * If current has a pending SIGKILL, then automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. 
- */ - if (fatal_signal_pending(current)) { - set_thread_flag(TIF_MEMDIE); - return; - } - - check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); - __mem_cgroup_out_of_memory(memcg, gfp_mask, order); -} -#endif - static BLOCKING_NOTIFIER_HEAD(oom_notify_list); int register_oom_notifier(struct notifier_block *nb) -- cgit v0.10.2 From ee6f509c3274014d1f52e7a7a10aee9f85393c5e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:43:50 -0700 Subject: mm: factor out memory isolate functions mm/page_alloc.c has some memory isolation functions but they are used only when we enable CONFIG_{CMA|MEMORY_HOTPLUG|MEMORY_FAILURE}. So let's make it configurable by new CONFIG_MEMORY_ISOLATION so that it can reduce binary size and we can check it simple by CONFIG_MEMORY_ISOLATION, not if defined CONFIG_{CMA|MEMORY_HOTPLUG|MEMORY_FAILURE}. Signed-off-by: Minchan Kim Cc: Andi Kleen Cc: Marek Szyprowski Acked-by: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 9b21469..08b4c52 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -196,6 +196,7 @@ config CMA bool "Contiguous Memory Allocator (EXPERIMENTAL)" depends on HAVE_DMA_CONTIGUOUS && HAVE_MEMBLOCK && EXPERIMENTAL select MIGRATION + select MEMORY_ISOLATION help This enables the Contiguous Memory Allocator which allows drivers to allocate big physically-contiguous blocks of memory for use with diff --git a/include/linux/page-isolation.h b/include/linux/page-isolation.h index 3bdcab3..105077a 100644 --- a/include/linux/page-isolation.h +++ b/include/linux/page-isolation.h @@ -1,6 +1,11 @@ #ifndef __LINUX_PAGEISOLATION_H #define __LINUX_PAGEISOLATION_H + +bool has_unmovable_pages(struct zone *zone, struct page *page, int count); +void set_pageblock_migratetype(struct page *page, int migratetype); +int move_freepages_block(struct zone *zone, struct page *page, + int migratetype); /* * Changes migrate type in [start_pfn, end_pfn) to be MIGRATE_ISOLATE. * If specified range includes migrate types other than MOVABLE or CMA, @@ -10,7 +15,7 @@ * free all pages in the range. test_page_isolated() can be used for * test it. */ -extern int +int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype); @@ -18,7 +23,7 @@ start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Changes MIGRATE_ISOLATE to MIGRATE_MOVABLE. * target range is [start_pfn, end_pfn) */ -extern int +int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned migratetype); @@ -30,8 +35,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn); /* * Internal functions. Changes pageblock's migrate type. 
*/ -extern int set_migratetype_isolate(struct page *page); -extern void unset_migratetype_isolate(struct page *page, unsigned migratetype); +int set_migratetype_isolate(struct page *page); +void unset_migratetype_isolate(struct page *page, unsigned migratetype); #endif diff --git a/mm/Kconfig b/mm/Kconfig index 82fed4e..d5c8019 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -140,9 +140,13 @@ config ARCH_DISCARD_MEMBLOCK config NO_BOOTMEM boolean +config MEMORY_ISOLATION + boolean + # eventually, we can have this option just 'select SPARSEMEM' config MEMORY_HOTPLUG bool "Allow for memory hot-add" + select MEMORY_ISOLATION depends on SPARSEMEM || X86_64_ACPI_NUMA depends on HOTPLUG && ARCH_ENABLE_MEMORY_HOTPLUG depends on (IA64 || X86 || PPC_BOOK3S_64 || SUPERH || S390) @@ -272,6 +276,7 @@ config MEMORY_FAILURE depends on MMU depends on ARCH_SUPPORTS_MEMORY_FAILURE bool "Enable recovery from hardware memory errors" + select MEMORY_ISOLATION help Enables code to recover from some memory failures on systems with MCA recovery. This allows a system to continue running diff --git a/mm/Makefile b/mm/Makefile index 290bbfe..92753e2 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -15,8 +15,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \ - page_isolation.o mm_init.o mmu_context.o percpu.o \ - compaction.o slab_common.o $(mmu-y) + mm_init.o mmu_context.o percpu.o slab_common.o \ + compaction.o $(mmu-y) obj-y += init-mm.o @@ -56,3 +56,4 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o +obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 667338e..2281947 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -51,7 +51,6 @@ #include #include #include -#include #include #include #include @@ -219,7 +218,7 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -static void set_pageblock_migratetype(struct page *page, int migratetype) +void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled)) @@ -954,7 +953,7 @@ static int move_freepages(struct zone *zone, return pages_moved; } -static int move_freepages_block(struct zone *zone, struct page *page, +int move_freepages_block(struct zone *zone, struct page *page, int migratetype) { unsigned long start_pfn, end_pfn; @@ -5463,8 +5462,7 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * MIGRATE_MOVABLE block might include unmovable pages. It means you can't * expect this function should be exact. 
*/ -static bool -__has_unmovable_pages(struct zone *zone, struct page *page, int count) +bool has_unmovable_pages(struct zone *zone, struct page *page, int count) { unsigned long pfn, iter, found; int mt; @@ -5541,77 +5539,7 @@ bool is_pageblock_removable_nolock(struct page *page) zone->zone_start_pfn + zone->spanned_pages <= pfn) return false; - return !__has_unmovable_pages(zone, page, 0); -} - -int set_migratetype_isolate(struct page *page) -{ - struct zone *zone; - unsigned long flags, pfn; - struct memory_isolate_notify arg; - int notifier_ret; - int ret = -EBUSY; - - zone = page_zone(page); - - spin_lock_irqsave(&zone->lock, flags); - - pfn = page_to_pfn(page); - arg.start_pfn = pfn; - arg.nr_pages = pageblock_nr_pages; - arg.pages_found = 0; - - /* - * It may be possible to isolate a pageblock even if the - * migratetype is not MIGRATE_MOVABLE. The memory isolation - * notifier chain is used by balloon drivers to return the - * number of pages in a range that are held by the balloon - * driver to shrink memory. If all the pages are accounted for - * by balloons, are free, or on the LRU, isolation can continue. - * Later, for example, when memory hotplug notifier runs, these - * pages reported as "can be isolated" should be isolated(freed) - * by the balloon driver through the memory notifier chain. - */ - notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); - notifier_ret = notifier_to_errno(notifier_ret); - if (notifier_ret) - goto out; - /* - * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. - * We just check MOVABLE pages. - */ - if (!__has_unmovable_pages(zone, page, arg.pages_found)) - ret = 0; - /* - * Unmovable means "not-on-lru" pages. If Unmovable pages are - * larger than removable-by-driver pages reported by notifier, - * we'll fail. - */ - -out: - if (!ret) { - set_pageblock_migratetype(page, MIGRATE_ISOLATE); - move_freepages_block(zone, page, MIGRATE_ISOLATE); - } - - spin_unlock_irqrestore(&zone->lock, flags); - if (!ret) - drain_all_pages(); - return ret; -} - -void unset_migratetype_isolate(struct page *page, unsigned migratetype) -{ - struct zone *zone; - unsigned long flags; - zone = page_zone(page); - spin_lock_irqsave(&zone->lock, flags); - if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) - goto out; - set_pageblock_migratetype(page, migratetype); - move_freepages_block(zone, page, migratetype); -out: - spin_unlock_irqrestore(&zone->lock, flags); + return !has_unmovable_pages(zone, page, 0); } #ifdef CONFIG_CMA diff --git a/mm/page_isolation.c b/mm/page_isolation.c index c9f0477..fb482cf 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -5,8 +5,79 @@ #include #include #include +#include #include "internal.h" +int set_migratetype_isolate(struct page *page) +{ + struct zone *zone; + unsigned long flags, pfn; + struct memory_isolate_notify arg; + int notifier_ret; + int ret = -EBUSY; + + zone = page_zone(page); + + spin_lock_irqsave(&zone->lock, flags); + + pfn = page_to_pfn(page); + arg.start_pfn = pfn; + arg.nr_pages = pageblock_nr_pages; + arg.pages_found = 0; + + /* + * It may be possible to isolate a pageblock even if the + * migratetype is not MIGRATE_MOVABLE. The memory isolation + * notifier chain is used by balloon drivers to return the + * number of pages in a range that are held by the balloon + * driver to shrink memory. If all the pages are accounted for + * by balloons, are free, or on the LRU, isolation can continue. 
+ * Later, for example, when memory hotplug notifier runs, these + * pages reported as "can be isolated" should be isolated(freed) + * by the balloon driver through the memory notifier chain. + */ + notifier_ret = memory_isolate_notify(MEM_ISOLATE_COUNT, &arg); + notifier_ret = notifier_to_errno(notifier_ret); + if (notifier_ret) + goto out; + /* + * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. + * We just check MOVABLE pages. + */ + if (!has_unmovable_pages(zone, page, arg.pages_found)) + ret = 0; + + /* + * immobile means "not-on-lru" paes. If immobile is larger than + * removable-by-driver pages reported by notifier, we'll fail. + */ + +out: + if (!ret) { + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + move_freepages_block(zone, page, MIGRATE_ISOLATE); + } + + spin_unlock_irqrestore(&zone->lock, flags); + if (!ret) + drain_all_pages(); + return ret; +} + +void unset_migratetype_isolate(struct page *page, unsigned migratetype) +{ + struct zone *zone; + unsigned long flags; + zone = page_zone(page); + spin_lock_irqsave(&zone->lock, flags); + if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) + goto out; + set_pageblock_migratetype(page, migratetype); + move_freepages_block(zone, page, migratetype); +out: + spin_unlock_irqrestore(&zone->lock, flags); +} + static inline struct page * __first_valid_page(unsigned long pfn, unsigned long nr_pages) { -- cgit v0.10.2 From 2cfed0752808625d30aca7fc9f383af386fd8a13 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:43:53 -0700 Subject: mm: fix free page check in zone_watermark_ok() __zone_watermark_ok currently compares free_pages which is a signed type with z->lowmem_reserve[classzone_idx] which is unsigned which might lead to sign overflow if free_pages doesn't satisfy the given order (or it came as negative already) and then we rely on the following order loop to fix it (which doesn't work for order-0). Let's fix the type conversion and do not rely on the given value of free_pages or follow up fixups. This patch fixes it because "memory-hotplug: fix kswapd looping forever problem" depends on this. As benefit of this patch, it doesn't rely on the loop to exit __zone_watermark_ok in case of high order check and make the first test effective.(ie, if (free_pages <= min + lowmem_reserve)) Aaditya reported this problem when he test my hotplug patch. 
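The underlying C pitfall can be shown in isolation. The following stand-alone snippet uses made-up numbers and is not kernel code; it only demonstrates why the signed free_pages value must not be compared directly against an unsigned long reserve:

	#include <stdio.h>

	int main(void)
	{
		long free_pages = -1;               /* can go negative after the order adjustment */
		unsigned long lowmem_reserve = 100; /* z->lowmem_reserve[] is unsigned long */
		long min = 50;

		/* old check: free_pages is converted to unsigned long, so -1
		 * becomes a huge value and the zone looks like it has room */
		if (free_pages <= min + lowmem_reserve)
			printf("old check: below watermark\n");
		else
			printf("old check: watermark OK (sign-conversion bug)\n");

		/* patched check: copy the reserve into a signed long first */
		long reserve = (long)lowmem_reserve;
		if (free_pages <= min + reserve)
			printf("new check: below watermark (correct)\n");

		return 0;
	}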
Reported-off-by: Aaditya Kumar Tested-by: Aaditya Kumar Signed-off-by: Aaditya Kumar Signed-off-by: Minchan Kim Cc: KOSAKI Motohiro Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Reviewed-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2281947..2e66359 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1595,6 +1595,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, { /* free_pages my go negative - that's OK */ long min = mark; + long lowmem_reserve = z->lowmem_reserve[classzone_idx]; int o; free_pages -= (1 << order) - 1; @@ -1603,7 +1604,7 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, if (alloc_flags & ALLOC_HARDER) min -= min / 4; - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) + if (free_pages <= min + lowmem_reserve) return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ -- cgit v0.10.2 From 702d1a6e0766d45642c934444fd41f658d251305 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:43:56 -0700 Subject: memory-hotplug: fix kswapd looping forever problem When hotplug offlining happens on zone A, it starts to mark freed page as MIGRATE_ISOLATE type in buddy for preventing further allocation. (MIGRATE_ISOLATE is very irony type because it's apparently on buddy but we can't allocate them). When the memory shortage happens during hotplug offlining, current task starts to reclaim, then wake up kswapd. Kswapd checks watermark, then go sleep because current zone_watermark_ok_safe doesn't consider MIGRATE_ISOLATE freed page count. Current task continue to reclaim in direct reclaim path without kswapd's helping. The problem is that zone->all_unreclaimable is set by only kswapd so that current task would be looping forever like below. __alloc_pages_slowpath restart: wake_all_kswapd rebalance: __alloc_pages_direct_reclaim do_try_to_free_pages if global_reclaim && !all_unreclaimable return 1; /* It means we did did_some_progress */ skip __alloc_pages_may_oom should_alloc_retry goto rebalance; If we apply KOSAKI's patch[1] which doesn't depends on kswapd about setting zone->all_unreclaimable, we can solve this problem by killing some task in direct reclaim path. But it doesn't wake up kswapd, still. It could be a problem still if other subsystem needs GFP_ATOMIC request. So kswapd should consider MIGRATE_ISOLATE when it calculate free pages BEFORE going sleep. This patch counts the number of MIGRATE_ISOLATE page block and zone_watermark_ok_safe will consider it if the system has such blocks (fortunately, it's very rare so no problem in POV overhead and kswapd is never hotpath). Copy/modify from Mel's quote " Ideal solution would be "allocating" the pageblock. It would keep the free space accounting as it is but historically, memory hotplug didn't allocate pages because it would be difficult to detect if a pageblock was isolated or if part of some balloon. Allocating just full pageblocks would work around this, However, it would play very badly with CMA. 
" [1] http://lkml.org/lkml/2012/6/14/74 [akpm@linux-foundation.org: simplify nr_zone_isolate_freepages(), rework zone_watermark_ok_safe() comment, simplify set_pageblock_isolate() and restore_pageblock_isolate()] [akpm@linux-foundation.org: fix CONFIG_MEMORY_ISOLATION=n build] Signed-off-by: Minchan Kim Suggested-by: KOSAKI Motohiro Tested-by: Aaditya Kumar Cc: KAMEZAWA Hiroyuki Cc: Mel Gorman Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 98f079b..64b2c3a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -478,6 +478,14 @@ struct zone { * rarely used fields: */ const char *name; +#ifdef CONFIG_MEMORY_ISOLATION + /* + * the number of MIGRATE_ISOLATE *pageblock*. + * We need this for free page counting. Look at zone_watermark_ok_safe. + * It's protected by zone->lock + */ + int nr_pageblock_isolate; +#endif } ____cacheline_internodealigned_in_smp; typedef enum { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2e66359..6a29ed8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -218,6 +218,11 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; +/* + * NOTE: + * Don't use set_pageblock_migratetype(page, MIGRATE_ISOLATE) directly. + * Instead, use {un}set_pageblock_isolate. + */ void set_pageblock_migratetype(struct page *page, int migratetype) { @@ -1619,6 +1624,20 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } +#ifdef CONFIG_MEMORY_ISOLATION +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) +{ + if (unlikely(zone->nr_pageblock_isolate)) + return zone->nr_pageblock_isolate * pageblock_nr_pages; + return 0; +} +#else +static inline unsigned long nr_zone_isolate_freepages(struct zone *zone) +{ + return 0; +} +#endif + bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags) { @@ -1634,6 +1653,14 @@ bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + /* + * If the zone has MIGRATE_ISOLATE type free pages, we should consider + * it. nr_zone_isolate_freepages is never accurate so kswapd might not + * sleep although it could do so. But this is more desirable for memory + * hotplug than sleeping which can cause a livelock in the direct + * reclaim path. 
+ */ + free_pages -= nr_zone_isolate_freepages(z); return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, free_pages); } @@ -4398,6 +4425,9 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, lruvec_init(&zone->lruvec, zone); zap_zone_vm_stats(zone); zone->flags = 0; +#ifdef CONFIG_MEMORY_ISOLATION + zone->nr_pageblock_isolate = 0; +#endif if (!size) continue; diff --git a/mm/page_isolation.c b/mm/page_isolation.c index fb482cf..247d1f1 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -8,6 +8,28 @@ #include #include "internal.h" +/* called while holding zone->lock */ +static void set_pageblock_isolate(struct page *page) +{ + if (get_pageblock_migratetype(page) == MIGRATE_ISOLATE) + return; + + set_pageblock_migratetype(page, MIGRATE_ISOLATE); + page_zone(page)->nr_pageblock_isolate++; +} + +/* called while holding zone->lock */ +static void restore_pageblock_isolate(struct page *page, int migratetype) +{ + struct zone *zone = page_zone(page); + if (WARN_ON(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) + return; + + BUG_ON(zone->nr_pageblock_isolate <= 0); + set_pageblock_migratetype(page, migratetype); + zone->nr_pageblock_isolate--; +} + int set_migratetype_isolate(struct page *page) { struct zone *zone; @@ -54,7 +76,7 @@ int set_migratetype_isolate(struct page *page) out: if (!ret) { - set_pageblock_migratetype(page, MIGRATE_ISOLATE); + set_pageblock_isolate(page); move_freepages_block(zone, page, MIGRATE_ISOLATE); } @@ -72,8 +94,8 @@ void unset_migratetype_isolate(struct page *page, unsigned migratetype) spin_lock_irqsave(&zone->lock, flags); if (get_pageblock_migratetype(page) != MIGRATE_ISOLATE) goto out; - set_pageblock_migratetype(page, migratetype); move_freepages_block(zone, page, migratetype); + restore_pageblock_isolate(page, migratetype); out: spin_unlock_irqrestore(&zone->lock, flags); } -- cgit v0.10.2 From 072bb0aa5e062902968c5c1007bba332c7820cf4 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:43:58 -0700 Subject: mm: sl[au]b: add knowledge of PFMEMALLOC reserve pages When a user or administrator requires swap for their application, they create a swap partition and file, format it with mkswap and activate it with swapon. Swap over the network is considered as an option in diskless systems. The two likely scenarios are when blade servers are used as part of a cluster where the form factor or maintenance costs do not allow the use of disks and thin clients. The Linux Terminal Server Project recommends the use of the Network Block Device (NBD) for swap according to the manual at https://sourceforge.net/projects/ltsp/files/Docs-Admin-Guide/LTSPManual.pdf/download There is also documentation and tutorials on how to setup swap over NBD at places like https://help.ubuntu.com/community/UbuntuLTSP/EnableNBDSWAP The nbd-client also documents the use of NBD as swap. Despite this, the fact is that a machine using NBD for swap can deadlock within minutes if swap is used intensively. This patch series addresses the problem. The core issue is that network block devices do not use mempools like normal block devices do. As the host cannot control where they receive packets from, they cannot reliably work out in advance how much memory they might need. Some years ago, Peter Zijlstra developed a series of patches that supported swap over an NFS that at least one distribution is carrying within their kernels. 
This patch series borrows very heavily from Peter's work to support swapping over NBD as a pre-requisite to supporting swap-over-NFS. The bulk of the complexity is concerned with preserving memory that is allocated from the PFMEMALLOC reserves for use by the network layer which is needed for both NBD and NFS. Patch 1 adds knowledge of the PFMEMALLOC reserves to SLAB and SLUB to preserve access to pages allocated under low memory situations to callers that are freeing memory. Patch 2 optimises the SLUB fast path to avoid pfmemalloc checks Patch 3 introduces __GFP_MEMALLOC to allow access to the PFMEMALLOC reserves without setting PFMEMALLOC. Patch 4 opens the possibility for softirqs to use PFMEMALLOC reserves for later use by network packet processing. Patch 5 only sets page->pfmemalloc when ALLOC_NO_WATERMARKS was required Patch 6 ignores memory policies when ALLOC_NO_WATERMARKS is set. Patches 7-12 allows network processing to use PFMEMALLOC reserves when the socket has been marked as being used by the VM to clean pages. If packets are received and stored in pages that were allocated under low-memory situations and are unrelated to the VM, the packets are dropped. Patch 11 reintroduces __skb_alloc_page which the networking folk may object to but is needed in some cases to propogate pfmemalloc from a newly allocated page to an skb. If there is a strong objection, this patch can be dropped with the impact being that swap-over-network will be slower in some cases but it should not fail. Patch 13 is a micro-optimisation to avoid a function call in the common case. Patch 14 tags NBD sockets as being SOCK_MEMALLOC so they can use PFMEMALLOC if necessary. Patch 15 notes that it is still possible for the PFMEMALLOC reserve to be depleted. To prevent this, direct reclaimers get throttled on a waitqueue if 50% of the PFMEMALLOC reserves are depleted. It is expected that kswapd and the direct reclaimers already running will clean enough pages for the low watermark to be reached and the throttled processes are woken up. Patch 16 adds a statistic to track how often processes get throttled Some basic performance testing was run using kernel builds, netperf on loopback for UDP and TCP, hackbench (pipes and sockets), iozone and sysbench. Each of them were expected to use the sl*b allocators reasonably heavily but there did not appear to be significant performance variances. For testing swap-over-NBD, a machine was booted with 2G of RAM with a swapfile backed by NBD. 8*NUM_CPU processes were started that create anonymous memory mappings and read them linearly in a loop. The total size of the mappings were 4*PHYSICAL_MEMORY to use swap heavily under memory pressure. Without the patches and using SLUB, the machine locks up within minutes and runs to completion with them applied. With SLAB, the story is different as an unpatched kernel run to completion. However, the patched kernel completed the test 45% faster. MICRO 3.5.0-rc2 3.5.0-rc2 vanilla swapnbd Unrecognised test vmscan-anon-mmap-write MMTests Statistics: duration Sys Time Running Test (seconds) 197.80 173.07 User+Sys Time Running Test (seconds) 206.96 182.03 Total Elapsed Time (seconds) 3240.70 1762.09 This patch: mm: sl[au]b: add knowledge of PFMEMALLOC reserve pages Allocations of pages below the min watermark run a risk of the machine hanging due to a lack of memory. To prevent this, only callers who have PF_MEMALLOC or TIF_MEMDIE set and are not processing an interrupt are allowed to allocate with ALLOC_NO_WATERMARKS. 
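That gate is visible in the gfp_to_alloc_flags() hunk further down; condensed, the page allocator now records the condition in a new ALLOC_PFMEMALLOC flag and grants ALLOC_NO_WATERMARKS on top of it only when the caller is in process context and has not passed __GFP_NOMEMALLOC:

	if ((current->flags & PF_MEMALLOC) ||
	    unlikely(test_thread_flag(TIF_MEMDIE))) {
		alloc_flags |= ALLOC_PFMEMALLOC;

		if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt())
			alloc_flags |= ALLOC_NO_WATERMARKS;
	}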
Once they are allocated to a slab though, nothing prevents other callers consuming free objects within those slabs. This patch limits access to slab pages that were alloced from the PFMEMALLOC reserves. When this patch is applied, pages allocated from below the low watermark are returned with page->pfmemalloc set and it is up to the caller to determine how the page should be protected. SLAB restricts access to any page with page->pfmemalloc set to callers which are known to able to access the PFMEMALLOC reserve. If one is not available, an attempt is made to allocate a new page rather than use a reserve. SLUB is a bit more relaxed in that it only records if the current per-CPU page was allocated from PFMEMALLOC reserve and uses another partial slab if the caller does not have the necessary GFP or process flags. This was found to be sufficient in tests to avoid hangs due to SLUB generally maintaining smaller lists than SLAB. In low-memory conditions it does mean that !PFMEMALLOC allocators can fail a slab allocation even though free objects are available because they are being preserved for callers that are freeing pages. [a.p.zijlstra@chello.nl: Original implementation] [sebastian@breakpoint.cc: Correct order of page flag clearing] Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 074eb98..375e79e 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -54,6 +54,15 @@ struct page { union { pgoff_t index; /* Our offset within mapping. */ void *freelist; /* slub/slob first free object */ + bool pfmemalloc; /* If set by the page allocator, + * ALLOC_PFMEMALLOC was set + * and the low watermark was not + * met implying that the system + * is under some pressure. The + * caller should try ensure + * this page is only used to + * free other pages. + */ }; union { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index c88d2a9..b5d1384 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -7,6 +7,7 @@ #include #include +#include #ifndef __GENERATING_BOUNDS_H #include #include @@ -453,6 +454,34 @@ static inline int PageTransTail(struct page *page) } #endif +/* + * If network-based swap is enabled, sl*b must keep track of whether pages + * were allocated from pfmemalloc reserves. 
+ */ +static inline int PageSlabPfmemalloc(struct page *page) +{ + VM_BUG_ON(!PageSlab(page)); + return PageActive(page); +} + +static inline void SetPageSlabPfmemalloc(struct page *page) +{ + VM_BUG_ON(!PageSlab(page)); + SetPageActive(page); +} + +static inline void __ClearPageSlabPfmemalloc(struct page *page) +{ + VM_BUG_ON(!PageSlab(page)); + __ClearPageActive(page); +} + +static inline void ClearPageSlabPfmemalloc(struct page *page) +{ + VM_BUG_ON(!PageSlab(page)); + ClearPageActive(page); +} + #ifdef CONFIG_MMU #define __PG_MLOCKED (1 << PG_mlocked) #else diff --git a/mm/internal.h b/mm/internal.h index 3314f79..eb76b67 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -279,6 +279,9 @@ static inline struct page *mem_map_next(struct page *iter, #define __paginginit __init #endif +/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); + /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6a29ed8..38e5be6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1513,6 +1513,7 @@ failed: #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_PFMEMALLOC 0x80 /* Caller has PF_MEMALLOC set */ #ifdef CONFIG_FAIL_PAGE_ALLOC @@ -2293,16 +2294,22 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { - if (!in_interrupt() && - ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE)))) + if ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE))) { + alloc_flags |= ALLOC_PFMEMALLOC; + + if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt()) alloc_flags |= ALLOC_NO_WATERMARKS; } return alloc_flags; } +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) +{ + return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC); +} + static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, @@ -2490,10 +2497,18 @@ nopage: warn_alloc_failed(gfp_mask, order, NULL); return page; got_pg: + /* + * page->pfmemalloc is set when the caller had PFMEMALLOC set or is + * been OOM killed. The expectation is that the caller is taking + * steps that will free more memory. The caller should avoid the + * page being used for !PFMEMALLOC purposes. + */ + page->pfmemalloc = !!(alloc_flags & ALLOC_PFMEMALLOC); + if (kmemcheck_enabled) kmemcheck_pagealloc_alloc(page, order, gfp_mask); - return page; + return page; } /* @@ -2544,6 +2559,8 @@ retry_cpuset: page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); + else + page->pfmemalloc = false; trace_mm_page_alloc(page, order, gfp_mask, migratetype); diff --git a/mm/slab.c b/mm/slab.c index 1fcf3ac..55d84a2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -124,6 +124,8 @@ #include +#include "internal.h" + /* * DEBUG - 1 for kmem_cache_create() to honour; SLAB_RED_ZONE & SLAB_POISON. * 0 for faster, smaller code (especially in the critical paths). @@ -152,6 +154,12 @@ #define ARCH_KMALLOC_FLAGS SLAB_HWCACHE_ALIGN #endif +/* + * true if a page was allocated from pfmemalloc reserves for network-based + * swap + */ +static bool pfmemalloc_active __read_mostly; + /* Legal flag mask for kmem_cache_create(). 
*/ #if DEBUG # define CREATE_MASK (SLAB_RED_ZONE | \ @@ -257,9 +265,30 @@ struct array_cache { * Must have this definition in here for the proper * alignment of array_cache. Also simplifies accessing * the entries. + * + * Entries should not be directly dereferenced as + * entries belonging to slabs marked pfmemalloc will + * have the lower bits set SLAB_OBJ_PFMEMALLOC */ }; +#define SLAB_OBJ_PFMEMALLOC 1 +static inline bool is_obj_pfmemalloc(void *objp) +{ + return (unsigned long)objp & SLAB_OBJ_PFMEMALLOC; +} + +static inline void set_obj_pfmemalloc(void **objp) +{ + *objp = (void *)((unsigned long)*objp | SLAB_OBJ_PFMEMALLOC); + return; +} + +static inline void clear_obj_pfmemalloc(void **objp) +{ + *objp = (void *)((unsigned long)*objp & ~SLAB_OBJ_PFMEMALLOC); +} + /* * bootstrap: The caches do not work without cpuarrays anymore, but the * cpuarrays are allocated from the generic caches... @@ -900,6 +929,102 @@ static struct array_cache *alloc_arraycache(int node, int entries, return nc; } +static inline bool is_slab_pfmemalloc(struct slab *slabp) +{ + struct page *page = virt_to_page(slabp->s_mem); + + return PageSlabPfmemalloc(page); +} + +/* Clears pfmemalloc_active if no slabs have pfmalloc set */ +static void recheck_pfmemalloc_active(struct kmem_cache *cachep, + struct array_cache *ac) +{ + struct kmem_list3 *l3 = cachep->nodelists[numa_mem_id()]; + struct slab *slabp; + unsigned long flags; + + if (!pfmemalloc_active) + return; + + spin_lock_irqsave(&l3->list_lock, flags); + list_for_each_entry(slabp, &l3->slabs_full, list) + if (is_slab_pfmemalloc(slabp)) + goto out; + + list_for_each_entry(slabp, &l3->slabs_partial, list) + if (is_slab_pfmemalloc(slabp)) + goto out; + + list_for_each_entry(slabp, &l3->slabs_free, list) + if (is_slab_pfmemalloc(slabp)) + goto out; + + pfmemalloc_active = false; +out: + spin_unlock_irqrestore(&l3->list_lock, flags); +} + +static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, + gfp_t flags, bool force_refill) +{ + int i; + void *objp = ac->entry[--ac->avail]; + + /* Ensure the caller is allowed to use objects from PFMEMALLOC slab */ + if (unlikely(is_obj_pfmemalloc(objp))) { + struct kmem_list3 *l3; + + if (gfp_pfmemalloc_allowed(flags)) { + clear_obj_pfmemalloc(&objp); + return objp; + } + + /* The caller cannot use PFMEMALLOC objects, find another one */ + for (i = 1; i < ac->avail; i++) { + /* If a !PFMEMALLOC object is found, swap them */ + if (!is_obj_pfmemalloc(ac->entry[i])) { + objp = ac->entry[i]; + ac->entry[i] = ac->entry[ac->avail]; + ac->entry[ac->avail] = objp; + return objp; + } + } + + /* + * If there are empty slabs on the slabs_free list and we are + * being forced to refill the cache, mark this one !pfmemalloc. + */ + l3 = cachep->nodelists[numa_mem_id()]; + if (!list_empty(&l3->slabs_free) && force_refill) { + struct slab *slabp = virt_to_slab(objp); + ClearPageSlabPfmemalloc(virt_to_page(slabp->s_mem)); + clear_obj_pfmemalloc(&objp); + recheck_pfmemalloc_active(cachep, ac); + return objp; + } + + /* No !PFMEMALLOC objects available */ + ac->avail++; + objp = NULL; + } + + return objp; +} + +static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, + void *objp) +{ + if (unlikely(pfmemalloc_active)) { + /* Some pfmemalloc slabs exist, check if this is one */ + struct page *page = virt_to_page(objp); + if (PageSlabPfmemalloc(page)) + set_obj_pfmemalloc(&objp); + } + + ac->entry[ac->avail++] = objp; +} + /* * Transfer objects in one arraycache to another. 
* Locking must be handled by the caller. @@ -1076,7 +1201,7 @@ static inline int cache_free_alien(struct kmem_cache *cachep, void *objp) STATS_INC_ACOVERFLOW(cachep); __drain_alien_cache(cachep, alien, nodeid); } - alien->entry[alien->avail++] = objp; + ac_put_obj(cachep, alien, objp); spin_unlock(&alien->lock); } else { spin_lock(&(cachep->nodelists[nodeid])->list_lock); @@ -1759,6 +1884,10 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) return NULL; } + /* Record if ALLOC_PFMEMALLOC was set when allocating the slab */ + if (unlikely(page->pfmemalloc)) + pfmemalloc_active = true; + nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) add_zone_page_state(page_zone(page), @@ -1766,9 +1895,13 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) else add_zone_page_state(page_zone(page), NR_SLAB_UNRECLAIMABLE, nr_pages); - for (i = 0; i < nr_pages; i++) + for (i = 0; i < nr_pages; i++) { __SetPageSlab(page + i); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page + i); + } + if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1800,6 +1933,7 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr) NR_SLAB_UNRECLAIMABLE, nr_freed); while (i--) { BUG_ON(!PageSlab(page)); + __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); page++; } @@ -3015,16 +3149,19 @@ bad: #define check_slabp(x,y) do { } while(0) #endif -static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) +static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags, + bool force_refill) { int batchcount; struct kmem_list3 *l3; struct array_cache *ac; int node; -retry: check_irq_off(); node = numa_mem_id(); + if (unlikely(force_refill)) + goto force_grow; +retry: ac = cpu_cache_get(cachep); batchcount = ac->batchcount; if (!ac->touched && batchcount > BATCHREFILL_LIMIT) { @@ -3074,8 +3211,8 @@ retry: STATS_INC_ACTIVE(cachep); STATS_SET_HIGH(cachep); - ac->entry[ac->avail++] = slab_get_obj(cachep, slabp, - node); + ac_put_obj(cachep, ac, slab_get_obj(cachep, slabp, + node)); } check_slabp(cachep, slabp); @@ -3094,18 +3231,22 @@ alloc_done: if (unlikely(!ac->avail)) { int x; +force_grow: x = cache_grow(cachep, flags | GFP_THISNODE, node, NULL); /* cache_grow can reenable interrupts, then ac could change. */ ac = cpu_cache_get(cachep); - if (!x && ac->avail == 0) /* no objects in sight? abort */ + + /* no objects in sight? abort */ + if (!x && (ac->avail == 0 || force_refill)) return NULL; if (!ac->avail) /* objects refilled by interrupt? */ goto retry; } ac->touched = 1; - return ac->entry[--ac->avail]; + + return ac_get_obj(cachep, ac, flags, force_refill); } static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, @@ -3187,23 +3328,35 @@ static inline void *____cache_alloc(struct kmem_cache *cachep, gfp_t flags) { void *objp; struct array_cache *ac; + bool force_refill = false; check_irq_off(); ac = cpu_cache_get(cachep); if (likely(ac->avail)) { - STATS_INC_ALLOCHIT(cachep); ac->touched = 1; - objp = ac->entry[--ac->avail]; - } else { - STATS_INC_ALLOCMISS(cachep); - objp = cache_alloc_refill(cachep, flags); + objp = ac_get_obj(cachep, ac, flags, false); + /* - * the 'ac' may be updated by cache_alloc_refill(), - * and kmemleak_erase() requires its correct value. 
+ * Allow for the possibility all avail objects are not allowed + * by the current flags */ - ac = cpu_cache_get(cachep); + if (objp) { + STATS_INC_ALLOCHIT(cachep); + goto out; + } + force_refill = true; } + + STATS_INC_ALLOCMISS(cachep); + objp = cache_alloc_refill(cachep, flags, force_refill); + /* + * the 'ac' may be updated by cache_alloc_refill(), + * and kmemleak_erase() requires its correct value. + */ + ac = cpu_cache_get(cachep); + +out: /* * To avoid a false negative, if an object that is in one of the * per-CPU caches is leaked, we need to make sure kmemleak doesn't @@ -3525,9 +3678,12 @@ static void free_block(struct kmem_cache *cachep, void **objpp, int nr_objects, struct kmem_list3 *l3; for (i = 0; i < nr_objects; i++) { - void *objp = objpp[i]; + void *objp; struct slab *slabp; + clear_obj_pfmemalloc(&objpp[i]); + objp = objpp[i]; + slabp = virt_to_slab(objp); l3 = cachep->nodelists[node]; list_del(&slabp->list); @@ -3645,7 +3801,7 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, cache_flusharray(cachep, ac); } - ac->entry[ac->avail++] = objp; + ac_put_obj(cachep, ac, objp); } /** diff --git a/mm/slub.c b/mm/slub.c index e517d43..c3f05e1 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -34,6 +34,8 @@ #include +#include "internal.h" + /* * Lock order: * 1. slab_mutex (Global Mutex) @@ -1354,6 +1356,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) inc_slabs_node(s, page_to_nid(page), page->objects); page->slab = s; __SetPageSlab(page); + if (page->pfmemalloc) + SetPageSlabPfmemalloc(page); start = page_address(page); @@ -1397,6 +1401,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE, -pages); + __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); reset_page_mapcount(page); if (current->reclaim_state) @@ -2126,6 +2131,14 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, return freelist; } +static inline bool pfmemalloc_match(struct page *page, gfp_t gfpflags) +{ + if (unlikely(PageSlabPfmemalloc(page))) + return gfp_pfmemalloc_allowed(gfpflags); + + return true; +} + /* * Check the page->freelist of a page and either transfer the freelist to the per cpu freelist * or deactivate the page. @@ -2206,6 +2219,18 @@ redo: goto new_slab; } + /* + * By rights, we should be searching for a slab page that was + * PFMEMALLOC but right now, we are losing the pfmemalloc + * information when the page leaves the per-cpu allocator + */ + if (unlikely(!pfmemalloc_match(page, gfpflags))) { + deactivate_slab(s, page, c->freelist); + c->page = NULL; + c->freelist = NULL; + goto new_slab; + } + /* must check again c->freelist in case of cpu migration or IRQ */ freelist = c->freelist; if (freelist) @@ -2312,8 +2337,8 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) - + if (unlikely(!object || !node_match(page, node) || + !pfmemalloc_match(page, gfpflags))) object = __slab_alloc(s, gfpflags, node, addr, c); else { -- cgit v0.10.2 From 5091b74a95d447e34530e713a8971450a45498b3 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Tue, 31 Jul 2012 16:44:00 -0700 Subject: mm: slub: optimise the SLUB fast path to avoid pfmemalloc checks This patch removes the check for pfmemalloc from the alloc hotpath and puts the logic after the election of a new per cpu slab. For a pfmemalloc page we do not use the fast path but force the use of the slow path which is also used for the debug case. 
This has the side-effect of weakening pfmemalloc processing in the following way: 1. A process that is allocating for network swap calls __slab_alloc. pfmemalloc_match is true so the freelist is loaded and c->freelist is now pointing to a pfmemalloc page. 2. A process that is attempting normal allocations calls slab_alloc, finds the pfmemalloc page on the freelist and uses it because it did not check pfmemalloc_match(). The patch allows non-pfmemalloc allocations to use pfmemalloc pages with the kmalloc slabs being the most vulnerable caches on the grounds they are most likely to have a mix of pfmemalloc and !pfmemalloc requests. A later patch will still protect the system as processes will get throttled if the pfmemalloc reserves get depleted but performance will not degrade as smoothly. [mgorman@suse.de: Expanded changelog] Signed-off-by: Christoph Lameter Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index c3f05e1..8f78e25 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2281,11 +2281,11 @@ new_slab: } page = c->page; - if (likely(!kmem_cache_debug(s))) + if (likely(!kmem_cache_debug(s) && pfmemalloc_match(page, gfpflags))) goto load_freelist; /* Only entered in the debug case */ - if (!alloc_debug_processing(s, page, freelist, addr)) + if (kmem_cache_debug(s) && !alloc_debug_processing(s, page, freelist, addr)) goto new_slab; /* Slab failed checks. Next slab needed */ deactivate_slab(s, page, get_freepointer(s, freelist)); @@ -2337,8 +2337,7 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node) || - !pfmemalloc_match(page, gfpflags))) + if (unlikely(!object || !node_match(page, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { -- cgit v0.10.2 From b37f1dd0f543d9714f96c2f9b9f74f7bdfdfdf31 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:03 -0700 Subject: mm: introduce __GFP_MEMALLOC to allow access to emergency reserves __GFP_MEMALLOC will allow the allocation to disregard the watermarks, much like PF_MEMALLOC. It allows one to pass along the memalloc state in object related allocation flags as opposed to task related flags, such as sk->sk_allocation. This removes the need for ALLOC_PFMEMALLOC as callers using __GFP_MEMALLOC can get the ALLOC_NO_WATERMARK flag which is now enough to identify allocations related to page reclaim.
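As a minimal, standalone sketch of the precedence rule (not the kernel's gfp_to_alloc_flags(); the reduced flag set and the helper name below are invented for illustration), __GFP_NOMEMALLOC always wins and only then does __GFP_MEMALLOC grant access to the reserves:

/* Illustrative model only, using the ___GFP_* bit values added in the
 * gfp.h hunk below; gfp_may_use_reserves() is not a real kernel function.
 */
#define SKETCH_GFP_MEMALLOC     0x2000u         /* ___GFP_MEMALLOC */
#define SKETCH_GFP_NOMEMALLOC   0x10000u        /* ___GFP_NOMEMALLOC */

static int gfp_may_use_reserves(unsigned int gfp_mask)
{
        /* __GFP_NOMEMALLOC takes precedence when both flags are set */
        if (gfp_mask & SKETCH_GFP_NOMEMALLOC)
                return 0;

        return !!(gfp_mask & SKETCH_GFP_MEMALLOC);
}

A caller that must never touch the emergency reserves therefore passes __GFP_NOMEMALLOC, regardless of whatever other flags are present in the mask.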
Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 1e49be4..cbd7400 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -23,6 +23,7 @@ struct vm_area_struct; #define ___GFP_REPEAT 0x400u #define ___GFP_NOFAIL 0x800u #define ___GFP_NORETRY 0x1000u +#define ___GFP_MEMALLOC 0x2000u #define ___GFP_COMP 0x4000u #define ___GFP_ZERO 0x8000u #define ___GFP_NOMEMALLOC 0x10000u @@ -76,9 +77,14 @@ struct vm_area_struct; #define __GFP_REPEAT ((__force gfp_t)___GFP_REPEAT) /* See above */ #define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) /* See above */ #define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) /* See above */ +#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC)/* Allow access to emergency reserves */ #define __GFP_COMP ((__force gfp_t)___GFP_COMP) /* Add compound page metadata */ #define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) /* Return zeroed page on success */ -#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves */ +#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) /* Don't use emergency reserves. + * This takes precedence over the + * __GFP_MEMALLOC flag if both are + * set + */ #define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */ #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ @@ -129,7 +135,7 @@ struct vm_area_struct; /* Control page allocator reclaim behavior */ #define GFP_RECLAIM_MASK (__GFP_WAIT|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_REPEAT|__GFP_NOFAIL|\ - __GFP_NORETRY|__GFP_NOMEMALLOC) + __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC) /* Control slab gfp mask during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_WAIT|__GFP_IO|__GFP_FS)) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 375e79e..bf78672 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -55,7 +55,7 @@ struct page { pgoff_t index; /* Our offset within mapping. */ void *freelist; /* slub/slob first free object */ bool pfmemalloc; /* If set by the page allocator, - * ALLOC_PFMEMALLOC was set + * ALLOC_NO_WATERMARKS was set * and the low watermark was not * met implying that the system * is under some pressure. 
The diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 9fe3a36..d6fd8e5 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -30,6 +30,7 @@ {(unsigned long)__GFP_COMP, "GFP_COMP"}, \ {(unsigned long)__GFP_ZERO, "GFP_ZERO"}, \ {(unsigned long)__GFP_NOMEMALLOC, "GFP_NOMEMALLOC"}, \ + {(unsigned long)__GFP_MEMALLOC, "GFP_MEMALLOC"}, \ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 38e5be6..8f65abe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1513,7 +1513,6 @@ failed: #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ -#define ALLOC_PFMEMALLOC 0x80 /* Caller has PF_MEMALLOC set */ #ifdef CONFIG_FAIL_PAGE_ALLOC @@ -2294,11 +2293,10 @@ gfp_to_alloc_flags(gfp_t gfp_mask) } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; - if ((current->flags & PF_MEMALLOC) || - unlikely(test_thread_flag(TIF_MEMDIE))) { - alloc_flags |= ALLOC_PFMEMALLOC; - - if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt()) + if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { + if (gfp_mask & __GFP_MEMALLOC) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt()) alloc_flags |= ALLOC_NO_WATERMARKS; } @@ -2307,7 +2305,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask) bool gfp_pfmemalloc_allowed(gfp_t gfp_mask) { - return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_PFMEMALLOC); + return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS); } static inline struct page * @@ -2498,12 +2496,12 @@ nopage: return page; got_pg: /* - * page->pfmemalloc is set when the caller had PFMEMALLOC set or is - * been OOM killed. The expectation is that the caller is taking - * steps that will free more memory. The caller should avoid the - * page being used for !PFMEMALLOC purposes. + * page->pfmemalloc is set when the caller had PFMEMALLOC set, is + * been OOM killed or specified __GFP_MEMALLOC. The expectation is + * that the caller is taking steps that will free more memory. The + * caller should avoid the page being used for !PFMEMALLOC purposes. */ - page->pfmemalloc = !!(alloc_flags & ALLOC_PFMEMALLOC); + page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); if (kmemcheck_enabled) kmemcheck_pagealloc_alloc(page, order, gfp_mask); diff --git a/mm/slab.c b/mm/slab.c index 55d84a2..77be18d 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1884,7 +1884,7 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) return NULL; } - /* Record if ALLOC_PFMEMALLOC was set when allocating the slab */ + /* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */ if (unlikely(page->pfmemalloc)) pfmemalloc_active = true; -- cgit v0.10.2 From 907aed48f65efeecf91575397e3d79335d93a466 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:07 -0700 Subject: mm: allow PF_MEMALLOC from softirq context This is needed to allow network softirq packet processing to make use of PF_MEMALLOC. Currently softirq context cannot use PF_MEMALLOC due to it not being associated with a task, and therefore not having task flags to fiddle with - thus the gfp to alloc flag mapping ignores the task flags when in interrupts (hard or soft) context. 
Allowing softirqs to make use of PF_MEMALLOC therefore requires some trickery. This patch borrows the task flags from whatever process happens to be preempted by the softirq. It then modifies the gfp to alloc flags mapping to not exclude task flags in softirq context, and modify the softirq code to save, clear and restore the PF_MEMALLOC flag. The save and clear, ensures the preempted task's PF_MEMALLOC flag doesn't leak into the softirq. The restore ensures a softirq's PF_MEMALLOC flag cannot leak back into the preempted process. This should be safe due to the following reasons Softirqs can run on multiple CPUs sure but the same task should not be executing the same softirq code. Neither should the softirq handler be preempted by any other softirq handler so the flags should not leak to an unrelated softirq. Softirqs re-enable hardware interrupts in __do_softirq() so can be preempted by hardware interrupts so PF_MEMALLOC is inherited by the hard IRQ. However, this is similar to a process in reclaim being preempted by a hardirq. While PF_MEMALLOC is set, gfp_to_alloc_flags() distinguishes between hard and soft irqs and avoids giving a hardirq the ALLOC_NO_WATERMARKS flag. If the softirq is deferred to ksoftirq then its flags may be used instead of a normal tasks but as the softirq cannot be preempted, the PF_MEMALLOC flag does not leak to other code by accident. [davem@davemloft.net: Document why PF_MEMALLOC is safe] Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/sched.h b/include/linux/sched.h index 865725a..c147e70 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1894,6 +1894,13 @@ static inline void rcu_copy_process(struct task_struct *p) #endif +static inline void tsk_restore_flags(struct task_struct *task, + unsigned long orig_flags, unsigned long flags) +{ + task->flags &= ~flags; + task->flags |= orig_flags & flags; +} + #ifdef CONFIG_SMP extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); diff --git a/kernel/softirq.c b/kernel/softirq.c index 671f959..b73e681 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -210,6 +210,14 @@ asmlinkage void __do_softirq(void) __u32 pending; int max_restart = MAX_SOFTIRQ_RESTART; int cpu; + unsigned long old_flags = current->flags; + + /* + * Mask out PF_MEMALLOC s current task context is borrowed for the + * softirq. 
A softirq handled such as network RX might set PF_MEMALLOC + * again if the socket is related to swap + */ + current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); account_system_vtime(current); @@ -265,6 +273,7 @@ restart: account_system_vtime(current); __local_bh_enable(SOFTIRQ_OFFSET); + tsk_restore_flags(current, old_flags, PF_MEMALLOC); } #ifndef __ARCH_HAS_DO_SOFTIRQ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f65abe..cd5390f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2296,7 +2296,11 @@ gfp_to_alloc_flags(gfp_t gfp_mask) if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (gfp_mask & __GFP_MEMALLOC) alloc_flags |= ALLOC_NO_WATERMARKS; - else if (likely(!(gfp_mask & __GFP_NOMEMALLOC)) && !in_interrupt()) + else if (in_serving_softirq() && (current->flags & PF_MEMALLOC)) + alloc_flags |= ALLOC_NO_WATERMARKS; + else if (!in_interrupt() && + ((current->flags & PF_MEMALLOC) || + unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } -- cgit v0.10.2 From cfd19c5a9ecf8e5e38de2603077c4330af21316e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:10 -0700 Subject: mm: only set page->pfmemalloc when ALLOC_NO_WATERMARKS was used __alloc_pages_slowpath() is called when the number of free pages is below the low watermark. If the caller is entitled to use ALLOC_NO_WATERMARKS then the page will be marked page->pfmemalloc. This protects more pages than are strictly necessary as we only need to protect pages allocated below the min watermark (the pfmemalloc reserves). This patch only sets page->pfmemalloc when ALLOC_NO_WATERMARKS was required to allocate the page. [rientjes@google.com: David noticed the problem during review] Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cd5390f..f9d9254 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2116,8 +2116,8 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, - alloc_flags, preferred_zone, - migratetype); + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); if (page) { preferred_zone->compact_considered = 0; preferred_zone->compact_defer_shift = 0; @@ -2209,8 +2209,8 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, - alloc_flags, preferred_zone, - migratetype); + alloc_flags & ~ALLOC_NO_WATERMARKS, + preferred_zone, migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2381,8 +2381,17 @@ rebalance: page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); - if (page) + if (page) { + /* + * page->pfmemalloc is set when ALLOC_NO_WATERMARKS was + * necessary to allocate the page. The expectation is + * that the caller is taking steps that will free more + * memory. The caller should avoid the page being used + * for !PFMEMALLOC purposes. 
+ */ + page->pfmemalloc = true; goto got_pg; + } } /* Atomic allocations - we can't balance anything */ @@ -2499,14 +2508,6 @@ nopage: warn_alloc_failed(gfp_mask, order, NULL); return page; got_pg: - /* - * page->pfmemalloc is set when the caller had PFMEMALLOC set, is - * been OOM killed or specified __GFP_MEMALLOC. The expectation is - * that the caller is taking steps that will free more memory. The - * caller should avoid the page being used for !PFMEMALLOC purposes. - */ - page->pfmemalloc = !!(alloc_flags & ALLOC_NO_WATERMARKS); - if (kmemcheck_enabled) kmemcheck_pagealloc_alloc(page, order, gfp_mask); -- cgit v0.10.2 From 183f6371aac2a5496a8ef2b0b0a68562652c3cdb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:12 -0700 Subject: mm: ignore mempolicies when using ALLOC_NO_WATERMARK The reserve is proportionally distributed over all !highmem zones in the system. So we need to allow an emergency allocation access to all zones. In order to do that we need to break out of any mempolicy boundaries we might have. In my opinion that does not break mempolicies as those are user oriented and not system oriented. That is, system allocations are not guaranteed to be within mempolicy boundaries. For instance IRQs do not even have a mempolicy. So breaking out of mempolicy boundaries for 'rare' emergency allocations, which are always system allocations (as opposed to user) is ok. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f9d9254..ef2d1e7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2378,6 +2378,13 @@ rebalance: /* Allocate without watermarks if the context allows */ if (alloc_flags & ALLOC_NO_WATERMARKS) { + /* + * Ignore mempolicies if ALLOC_NO_WATERMARKS on the grounds + * the allocation is high priority and these type of + * allocations are system rather than user orientated + */ + zonelist = node_zonelist(numa_node_id(), gfp_mask); + page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, migratetype); -- cgit v0.10.2 From 99a1dec70d5acbd8c6b3928cdebb4a2d1da676c8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:14 -0700 Subject: net: introduce sk_gfp_atomic() to allow addition of GFP flags depending on the individual socket Introduce sk_gfp_atomic(), this function allows to inject sock specific flags to each sock related allocation. It is only used on allocation paths that may be required for writing pages back to network storage. [davem@davemloft.net: Use sk_gfp_atomic only when necessary] Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Acked-by: David S. 
Miller Cc: Neil Brown Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/net/sock.h b/include/net/sock.h index cee528c..11ccde6 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -658,6 +658,11 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) return test_bit(flag, &sk->sk_flags); } +static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask) +{ + return GFP_ATOMIC; +} + static inline void sk_acceptq_removed(struct sock *sk) { sk->sk_ack_backlog--; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 33cd065..3f1bcff 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2045,7 +2045,8 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, if (unlikely(sk->sk_state == TCP_CLOSE)) return; - if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) + if (tcp_write_xmit(sk, cur_mss, nonagle, 0, + sk_gfp_atomic(sk, GFP_ATOMIC))) tcp_check_probe_timer(sk); } @@ -2666,7 +2667,8 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, if (cvp != NULL && cvp->s_data_constant && cvp->s_data_desired) s_data_desired = cvp->s_data_desired; - skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER + 15 + s_data_desired, + sk_gfp_atomic(sk, GFP_ATOMIC)); if (unlikely(!skb)) { dst_release(dst); return NULL; @@ -3064,7 +3066,7 @@ void tcp_send_ack(struct sock *sk) * tcp_transmit_skb() will set the ownership to this * sock. */ - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); if (buff == NULL) { inet_csk_schedule_ack(sk); inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; @@ -3079,7 +3081,7 @@ void tcp_send_ack(struct sock *sk) /* Send it off, this clears delayed acks for us. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); + tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); } /* This routine sends a packet with an out of date sequence @@ -3099,7 +3101,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) struct sk_buff *skb; /* We don't queue it, tcp_transmit_skb() sets ownership. */ - skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC)); if (skb == NULL) return -1; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 61c7b6d..c66b90f 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1299,7 +1299,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, /* Clone pktoptions received with SYN */ newnp->pktoptions = NULL; if (treq->pktopts != NULL) { - newnp->pktoptions = skb_clone(treq->pktopts, GFP_ATOMIC); + newnp->pktoptions = skb_clone(treq->pktopts, + sk_gfp_atomic(sk, GFP_ATOMIC)); consume_skb(treq->pktopts); treq->pktopts = NULL; if (newnp->pktoptions) @@ -1349,7 +1350,8 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, * across. Shucks. 
*/ tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newnp->daddr, - AF_INET6, key->key, key->keylen, GFP_ATOMIC); + AF_INET6, key->key, key->keylen, + sk_gfp_atomic(sk, GFP_ATOMIC)); } #endif @@ -1442,7 +1444,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) --ANK (980728) */ if (np->rxopt.all) - opt_skb = skb_clone(skb, GFP_ATOMIC); + opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC)); if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ sock_rps_save_rxhash(sk, skb); -- cgit v0.10.2 From 7cb0240492caea2f6467f827313478f41877e6ef Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:16 -0700 Subject: netvm: allow the use of __GFP_MEMALLOC by specific sockets Allow specific sockets to be tagged SOCK_MEMALLOC and use __GFP_MEMALLOC for their allocations. These sockets will be able to go below watermarks and allocate from the emergency reserve. Such sockets are to be used to service the VM (iow. to swap over). They must be handled kernel side, exposing such a socket to user-space is a bug. There is a risk that the reserves be depleted so for now, the administrator is responsible for increasing min_free_kbytes as necessary to prevent deadlock for their workloads. [a.p.zijlstra@chello.nl: Original patches] Signed-off-by: Mel Gorman Acked-by: David S. Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/net/sock.h b/include/net/sock.h index 11ccde6..bfa7d20 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -621,6 +621,7 @@ enum sock_flags { SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ + SOCK_MEMALLOC, /* VM depends on this socket for swapping */ SOCK_TIMESTAMPING_TX_HARDWARE, /* %SOF_TIMESTAMPING_TX_HARDWARE */ SOCK_TIMESTAMPING_TX_SOFTWARE, /* %SOF_TIMESTAMPING_TX_SOFTWARE */ SOCK_TIMESTAMPING_RX_HARDWARE, /* %SOF_TIMESTAMPING_RX_HARDWARE */ @@ -660,7 +661,7 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask) { - return GFP_ATOMIC; + return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC); } static inline void sk_acceptq_removed(struct sock *sk) @@ -803,6 +804,8 @@ extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); extern void sk_stream_wait_close(struct sock *sk, long timeo_p); extern int sk_stream_error(struct sock *sk, int flags, int err); extern void sk_stream_kill_queues(struct sock *sk); +extern void sk_set_memalloc(struct sock *sk); +extern void sk_clear_memalloc(struct sock *sk); extern int sk_wait_data(struct sock *sk, long *timeo); diff --git a/net/core/sock.c b/net/core/sock.c index a67b062..3617f65 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -271,6 +271,28 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); +/** + * sk_set_memalloc - sets %SOCK_MEMALLOC + * @sk: socket to set it on + * + * Set %SOCK_MEMALLOC on a socket for access to emergency reserves. 
+ * It's the responsibility of the admin to adjust min_free_kbytes + * to meet the requirements + */ +void sk_set_memalloc(struct sock *sk) +{ + sock_set_flag(sk, SOCK_MEMALLOC); + sk->sk_allocation |= __GFP_MEMALLOC; +} +EXPORT_SYMBOL_GPL(sk_set_memalloc); + +void sk_clear_memalloc(struct sock *sk) +{ + sock_reset_flag(sk, SOCK_MEMALLOC); + sk->sk_allocation &= ~__GFP_MEMALLOC; +} +EXPORT_SYMBOL_GPL(sk_clear_memalloc); + #if defined(CONFIG_CGROUPS) #if !defined(CONFIG_NET_CLS_CGROUP) int net_cls_subsys_id = -1; -- cgit v0.10.2 From c93bdd0e03e848555d144eb44a1f275b871a8dd5 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:19 -0700 Subject: netvm: allow skb allocation to use PFMEMALLOC reserves Change the skb allocation API to indicate RX usage and use this to fall back to the PFMEMALLOC reserve when needed. SKBs allocated from the reserve are tagged in skb->pfmemalloc. If an SKB is allocated from the reserve and the socket is later found to be unrelated to page reclaim, the packet is dropped so that the memory remains available for page reclaim. Network protocols are expected to recover from this packet loss. [a.p.zijlstra@chello.nl: Ideas taken from various patches] [davem@davemloft.net: Use static branches, coding style corrections] [sebastian@breakpoint.cc: Avoid unnecessary cast, fix !CONFIG_NET build] Signed-off-by: Mel Gorman Acked-by: David S. Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index cbd7400..4883f39 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -385,6 +385,9 @@ void drain_local_pages(void *dummy); */ extern gfp_t gfp_allowed_mask; +/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ +bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); + extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index d205c4b..0336f02 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -462,6 +462,7 @@ struct sk_buff { #ifdef CONFIG_IPV6_NDISC_NODETYPE __u8 ndisc_nodetype:2; #endif + __u8 pfmemalloc:1; __u8 ooo_okay:1; __u8 l4_rxhash:1; __u8 wifi_acked_valid:1; @@ -502,6 +503,15 @@ struct sk_buff { #include +#define SKB_ALLOC_FCLONE 0x01 +#define SKB_ALLOC_RX 0x02 + +/* Returns true if the skb was allocated from PFMEMALLOC reserves */ +static inline bool skb_pfmemalloc(const struct sk_buff *skb) +{ + return unlikely(skb->pfmemalloc); +} + /* * skb might have a dst pointer attached, refcounted or not. 
* _skb_refdst low order bit is set if refcount was _not_ taken @@ -565,7 +575,7 @@ extern bool skb_try_coalesce(struct sk_buff *to, struct sk_buff *from, bool *fragstolen, int *delta_truesize); extern struct sk_buff *__alloc_skb(unsigned int size, - gfp_t priority, int fclone, int node); + gfp_t priority, int flags, int node); extern struct sk_buff *build_skb(void *data, unsigned int frag_size); static inline struct sk_buff *alloc_skb(unsigned int size, gfp_t priority) @@ -576,7 +586,7 @@ static inline struct sk_buff *alloc_skb(unsigned int size, static inline struct sk_buff *alloc_skb_fclone(unsigned int size, gfp_t priority) { - return __alloc_skb(size, priority, 1, NUMA_NO_NODE); + return __alloc_skb(size, priority, SKB_ALLOC_FCLONE, NUMA_NO_NODE); } extern void skb_recycle(struct sk_buff *skb); diff --git a/include/net/sock.h b/include/net/sock.h index bfa7d20..8119863 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -659,6 +659,21 @@ static inline bool sock_flag(const struct sock *sk, enum sock_flags flag) return test_bit(flag, &sk->sk_flags); } +#ifdef CONFIG_NET +extern struct static_key memalloc_socks; +static inline int sk_memalloc_socks(void) +{ + return static_key_false(&memalloc_socks); +} +#else + +static inline int sk_memalloc_socks(void) +{ + return 0; +} + +#endif + static inline gfp_t sk_gfp_atomic(struct sock *sk, gfp_t gfp_mask) { return GFP_ATOMIC | (sk->sk_allocation & __GFP_MEMALLOC); diff --git a/mm/internal.h b/mm/internal.h index eb76b67..3314f79 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -279,9 +279,6 @@ static inline struct page *mem_map_next(struct page *iter, #define __paginginit __init #endif -/* Returns true if the gfp_mask allows use of ALLOC_NO_WATERMARK */ -bool gfp_pfmemalloc_allowed(gfp_t gfp_mask); - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, diff --git a/net/core/filter.c b/net/core/filter.c index d4ce2dc..907efd2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -83,6 +83,14 @@ int sk_filter(struct sock *sk, struct sk_buff *skb) int err; struct sk_filter *filter; + /* + * If the skb was allocated from pfmemalloc reserves, only + * allow SOCK_MEMALLOC sockets to use it as this socket is + * helping free memory + */ + if (skb_pfmemalloc(skb) && !sock_flag(sk, SOCK_MEMALLOC)) + return -ENOMEM; + err = security_sock_rcv_skb(sk, skb); if (err) return err; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 368f65c..fe00d12 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -145,6 +145,43 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) BUG(); } + +/* + * kmalloc_reserve is a wrapper around kmalloc_node_track_caller that tells + * the caller if emergency pfmemalloc reserves are being used. If it is and + * the socket is later found to be SOCK_MEMALLOC then PFMEMALLOC reserves + * may be used. Otherwise, the packet data may be discarded until enough + * memory is free + */ +#define kmalloc_reserve(size, gfp, node, pfmemalloc) \ + __kmalloc_reserve(size, gfp, node, _RET_IP_, pfmemalloc) +void *__kmalloc_reserve(size_t size, gfp_t flags, int node, unsigned long ip, + bool *pfmemalloc) +{ + void *obj; + bool ret_pfmemalloc = false; + + /* + * Try a regular allocation, when that fails and we're not entitled + * to the reserves, fail. 
+ */ + obj = kmalloc_node_track_caller(size, + flags | __GFP_NOMEMALLOC | __GFP_NOWARN, + node); + if (obj || !(gfp_pfmemalloc_allowed(flags))) + goto out; + + /* Try again but now we are using pfmemalloc reserves */ + ret_pfmemalloc = true; + obj = kmalloc_node_track_caller(size, flags, node); + +out: + if (pfmemalloc) + *pfmemalloc = ret_pfmemalloc; + + return obj; +} + /* Allocate a new skbuff. We do this ourselves so we can fill in a few * 'private' fields and also do memory statistics to find all the * [BEEP] leaks. @@ -155,8 +192,10 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) * __alloc_skb - allocate a network buffer * @size: size to allocate * @gfp_mask: allocation mask - * @fclone: allocate from fclone cache instead of head cache - * and allocate a cloned (child) skb + * @flags: If SKB_ALLOC_FCLONE is set, allocate from fclone cache + * instead of head cache and allocate a cloned (child) skb. + * If SKB_ALLOC_RX is set, __GFP_MEMALLOC will be used for + * allocations in case the data is required for writeback * @node: numa node to allocate memory on * * Allocate a new &sk_buff. The returned buffer has no headroom and a @@ -167,14 +206,19 @@ static void skb_under_panic(struct sk_buff *skb, int sz, void *here) * %GFP_ATOMIC. */ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, - int fclone, int node) + int flags, int node) { struct kmem_cache *cache; struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; + bool pfmemalloc; - cache = fclone ? skbuff_fclone_cache : skbuff_head_cache; + cache = (flags & SKB_ALLOC_FCLONE) + ? skbuff_fclone_cache : skbuff_head_cache; + + if (sk_memalloc_socks() && (flags & SKB_ALLOC_RX)) + gfp_mask |= __GFP_MEMALLOC; /* Get the HEAD */ skb = kmem_cache_alloc_node(cache, gfp_mask & ~__GFP_DMA, node); @@ -189,7 +233,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, */ size = SKB_DATA_ALIGN(size); size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - data = kmalloc_node_track_caller(size, gfp_mask, node); + data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); if (!data) goto nodata; /* kmalloc(size) might give us more room than requested. @@ -207,6 +251,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, memset(skb, 0, offsetof(struct sk_buff, tail)); /* Account for allocated memory : skb + skb->head */ skb->truesize = SKB_TRUESIZE(size); + skb->pfmemalloc = pfmemalloc; atomic_set(&skb->users, 1); skb->head = data; skb->data = data; @@ -222,7 +267,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_set(&shinfo->dataref, 1); kmemcheck_annotate_variable(shinfo->destructor_arg); - if (fclone) { + if (flags & SKB_ALLOC_FCLONE) { struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); @@ -232,6 +277,7 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, atomic_set(fclone_ref, 1); child->fclone = SKB_FCLONE_UNAVAILABLE; + child->pfmemalloc = pfmemalloc; } out: return skb; @@ -302,14 +348,7 @@ static DEFINE_PER_CPU(struct netdev_alloc_cache, netdev_alloc_cache); #define NETDEV_PAGECNT_BIAS (PAGE_SIZE / SMP_CACHE_BYTES) -/** - * netdev_alloc_frag - allocate a page fragment - * @fragsz: fragment size - * - * Allocates a frag from a page for receive buffer. - * Uses GFP_ATOMIC allocations. 
- */ -void *netdev_alloc_frag(unsigned int fragsz) +static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask) { struct netdev_alloc_cache *nc; void *data = NULL; @@ -319,7 +358,7 @@ void *netdev_alloc_frag(unsigned int fragsz) nc = &__get_cpu_var(netdev_alloc_cache); if (unlikely(!nc->page)) { refill: - nc->page = alloc_page(GFP_ATOMIC | __GFP_COLD); + nc->page = alloc_page(gfp_mask); if (unlikely(!nc->page)) goto end; recycle: @@ -343,6 +382,18 @@ end: local_irq_restore(flags); return data; } + +/** + * netdev_alloc_frag - allocate a page fragment + * @fragsz: fragment size + * + * Allocates a frag from a page for receive buffer. + * Uses GFP_ATOMIC allocations. + */ +void *netdev_alloc_frag(unsigned int fragsz) +{ + return __netdev_alloc_frag(fragsz, GFP_ATOMIC | __GFP_COLD); +} EXPORT_SYMBOL(netdev_alloc_frag); /** @@ -366,7 +417,12 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); if (fragsz <= PAGE_SIZE && !(gfp_mask & (__GFP_WAIT | GFP_DMA))) { - void *data = netdev_alloc_frag(fragsz); + void *data; + + if (sk_memalloc_socks()) + gfp_mask |= __GFP_MEMALLOC; + + data = __netdev_alloc_frag(fragsz, gfp_mask); if (likely(data)) { skb = build_skb(data, fragsz); @@ -374,7 +430,8 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, put_page(virt_to_head_page(data)); } } else { - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, + SKB_ALLOC_RX, NUMA_NO_NODE); } if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); @@ -656,6 +713,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old) #if IS_ENABLED(CONFIG_IP_VS) new->ipvs_property = old->ipvs_property; #endif + new->pfmemalloc = old->pfmemalloc; new->protocol = old->protocol; new->mark = old->mark; new->skb_iif = old->skb_iif; @@ -814,6 +872,9 @@ struct sk_buff *skb_clone(struct sk_buff *skb, gfp_t gfp_mask) n->fclone = SKB_FCLONE_CLONE; atomic_inc(fclone_ref); } else { + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + n = kmem_cache_alloc(skbuff_head_cache, gfp_mask); if (!n) return NULL; @@ -850,6 +911,13 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) skb_shinfo(new)->gso_type = skb_shinfo(old)->gso_type; } +static inline int skb_alloc_rx_flag(const struct sk_buff *skb) +{ + if (skb_pfmemalloc(skb)) + return SKB_ALLOC_RX; + return 0; +} + /** * skb_copy - create private copy of an sk_buff * @skb: buffer to copy @@ -871,7 +939,8 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { int headerlen = skb_headroom(skb); unsigned int size = skb_end_offset(skb) + skb->data_len; - struct sk_buff *n = alloc_skb(size, gfp_mask); + struct sk_buff *n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) return NULL; @@ -906,7 +975,8 @@ EXPORT_SYMBOL(skb_copy); struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask) { unsigned int size = skb_headlen(skb) + headroom; - struct sk_buff *n = alloc_skb(size, gfp_mask); + struct sk_buff *n = __alloc_skb(size, gfp_mask, + skb_alloc_rx_flag(skb), NUMA_NO_NODE); if (!n) goto out; @@ -979,8 +1049,10 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, size = SKB_DATA_ALIGN(size); - data = kmalloc(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask); + if (skb_pfmemalloc(skb)) + gfp_mask |= __GFP_MEMALLOC; + data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), + gfp_mask, 
NUMA_NO_NODE, NULL); if (!data) goto nodata; size = SKB_WITH_OVERHEAD(ksize(data)); @@ -1092,8 +1164,9 @@ struct sk_buff *skb_copy_expand(const struct sk_buff *skb, /* * Allocate the copy buffer */ - struct sk_buff *n = alloc_skb(newheadroom + skb->len + newtailroom, - gfp_mask); + struct sk_buff *n = __alloc_skb(newheadroom + skb->len + newtailroom, + gfp_mask, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); int oldheadroom = skb_headroom(skb); int head_copy_len, head_copy_off; int off; @@ -2775,8 +2848,9 @@ struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features) skb_release_head_state(nskb); __skb_push(nskb, doffset); } else { - nskb = alloc_skb(hsize + doffset + headroom, - GFP_ATOMIC); + nskb = __alloc_skb(hsize + doffset + headroom, + GFP_ATOMIC, skb_alloc_rx_flag(skb), + NUMA_NO_NODE); if (unlikely(!nskb)) goto err; diff --git a/net/core/sock.c b/net/core/sock.c index 3617f65..c8c5816 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -271,6 +271,9 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); EXPORT_SYMBOL(sysctl_optmem_max); +struct static_key memalloc_socks = STATIC_KEY_INIT_FALSE; +EXPORT_SYMBOL_GPL(memalloc_socks); + /** * sk_set_memalloc - sets %SOCK_MEMALLOC * @sk: socket to set it on @@ -283,6 +286,7 @@ void sk_set_memalloc(struct sock *sk) { sock_set_flag(sk, SOCK_MEMALLOC); sk->sk_allocation |= __GFP_MEMALLOC; + static_key_slow_inc(&memalloc_socks); } EXPORT_SYMBOL_GPL(sk_set_memalloc); @@ -290,6 +294,7 @@ void sk_clear_memalloc(struct sock *sk) { sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; + static_key_slow_dec(&memalloc_socks); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); -- cgit v0.10.2 From c48a11c7ad2623b99bbd6859b0b4234e7f11176f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:23 -0700 Subject: netvm: propagate page->pfmemalloc to skb The skb->pfmemalloc flag gets set to true iff during the slab allocation of data in __alloc_skb that the the PFMEMALLOC reserves were used. If the packet is fragmented, it is possible that pages will be allocated from the PFMEMALLOC reserve without propagating this information to the skb. This patch propagates page->pfmemalloc from pages allocated for fragments to the skb. Signed-off-by: Mel Gorman Acked-by: David S. Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 0336f02..b814bb8 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1247,6 +1247,17 @@ static inline void __skb_fill_page_desc(struct sk_buff *skb, int i, { skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; + /* + * Propagate page->pfmemalloc to the skb if we can. The problem is + * that not all callers have unique ownership of the page. If + * pfmemalloc is set, we check the mapping as a mapping implies + * page->index is set (index and pfmemalloc share space). + * If it's a valid mapping, we cannot use page->pfmemalloc but we + * do not lose pfmemalloc information as the pages would not be + * allocated using __GFP_MEMALLOC. 
+ */ + if (page->pfmemalloc && !page->mapping) + skb->pfmemalloc = true; frag->page.p = page; frag->page_offset = off; skb_frag_size_set(frag, size); -- cgit v0.10.2 From 0614002bb5f7411e61ffa0dfe5be1f2c84df3da3 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:24 -0700 Subject: netvm: propagate page->pfmemalloc from skb_alloc_page to skb The skb->pfmemalloc flag gets set to true iff during the slab allocation of data in __alloc_skb that the the PFMEMALLOC reserves were used. If page splitting is used, it is possible that pages will be allocated from the PFMEMALLOC reserve without propagating this information to the skb. This patch propagates page->pfmemalloc from pages allocated for fragments to the skb. It works by reintroducing and expanding the skb_alloc_page() API to take an skb. If the page was allocated from pfmemalloc reserves, it is automatically copied. If the driver allocates the page before the skb, it should call skb_propagate_pfmemalloc() after the skb is allocated to ensure the flag is copied properly. Failure to do so is not critical. The resulting driver may perform slower if it is used for swap-over-NBD or swap-over-NFS but it should not result in failure. [davem@davemloft.net: API rename and consistency] Signed-off-by: Mel Gorman Acked-by: David S. Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/net/ethernet/chelsio/cxgb4/sge.c b/drivers/net/ethernet/chelsio/cxgb4/sge.c index 8596aca..d49933e 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4/sge.c @@ -528,7 +528,7 @@ static unsigned int refill_fl(struct adapter *adap, struct sge_fl *q, int n, #endif while (n--) { - pg = alloc_page(gfp); + pg = __skb_alloc_page(gfp, NULL); if (unlikely(!pg)) { q->alloc_failed++; break; diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c index f2d1ecd..8877fbf 100644 --- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c @@ -653,7 +653,7 @@ static unsigned int refill_fl(struct adapter *adapter, struct sge_fl *fl, alloc_small_pages: while (n--) { - page = alloc_page(gfp | __GFP_NOWARN | __GFP_COLD); + page = __skb_alloc_page(gfp | __GFP_NOWARN, NULL); if (unlikely(!page)) { fl->alloc_failed++; break; diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 1050411..b7c2d50 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -6235,7 +6235,7 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring, return true; if (!page) { - page = alloc_page(GFP_ATOMIC | __GFP_COLD); + page = __skb_alloc_page(GFP_ATOMIC, bi->skb); bi->page = page; if (unlikely(!page)) { rx_ring->rx_stats.alloc_failed++; diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index c709eae..4326f74 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -1141,8 +1141,8 @@ static bool ixgbe_alloc_mapped_page(struct ixgbe_ring *rx_ring, /* alloc new page for storage */ if (likely(!page)) { - page = alloc_pages(GFP_ATOMIC | __GFP_COLD | __GFP_COMP, - ixgbe_rx_pg_order(rx_ring)); + page = __skb_alloc_pages(GFP_ATOMIC | __GFP_COLD | __GFP_COMP, + bi->skb, 
ixgbe_rx_pg_order(rx_ring)); if (unlikely(!page)) { rx_ring->rx_stats.alloc_rx_page_failed++; return false; diff --git a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c index 3f9841d..60ef645 100644 --- a/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c +++ b/drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c @@ -352,7 +352,6 @@ static void ixgbevf_alloc_rx_buffers(struct ixgbevf_adapter *adapter, adapter->alloc_rx_buff_failed++; goto no_buffers; } - bi->skb = skb; } if (!bi->dma) { diff --git a/drivers/net/usb/cdc-phonet.c b/drivers/net/usb/cdc-phonet.c index 187c144..6461004 100644 --- a/drivers/net/usb/cdc-phonet.c +++ b/drivers/net/usb/cdc-phonet.c @@ -130,7 +130,7 @@ static int rx_submit(struct usbpn_dev *pnd, struct urb *req, gfp_t gfp_flags) struct page *page; int err; - page = alloc_page(gfp_flags); + page = __skb_alloc_page(gfp_flags | __GFP_NOMEMALLOC, NULL); if (!page) return -ENOMEM; diff --git a/drivers/usb/gadget/f_phonet.c b/drivers/usb/gadget/f_phonet.c index 965a629..8ee9268 100644 --- a/drivers/usb/gadget/f_phonet.c +++ b/drivers/usb/gadget/f_phonet.c @@ -301,7 +301,7 @@ pn_rx_submit(struct f_phonet *fp, struct usb_request *req, gfp_t gfp_flags) struct page *page; int err; - page = alloc_page(gfp_flags); + page = __skb_alloc_page(gfp_flags | __GFP_NOMEMALLOC, NULL); if (!page) return -ENOMEM; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index b814bb8..7632c87 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1774,6 +1774,61 @@ static inline struct sk_buff *netdev_alloc_skb_ip_align(struct net_device *dev, return __netdev_alloc_skb_ip_align(dev, length, GFP_ATOMIC); } +/* + * __skb_alloc_page - allocate pages for ps-rx on a skb and preserve pfmemalloc data + * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX + * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used + * @order: size of the allocation + * + * Allocate a new page. + * + * %NULL is returned if there is no free memory. +*/ +static inline struct page *__skb_alloc_pages(gfp_t gfp_mask, + struct sk_buff *skb, + unsigned int order) +{ + struct page *page; + + gfp_mask |= __GFP_COLD; + + if (!(gfp_mask & __GFP_NOMEMALLOC)) + gfp_mask |= __GFP_MEMALLOC; + + page = alloc_pages_node(NUMA_NO_NODE, gfp_mask, order); + if (skb && page && page->pfmemalloc) + skb->pfmemalloc = true; + + return page; +} + +/** + * __skb_alloc_page - allocate a page for ps-rx for a given skb and preserve pfmemalloc data + * @gfp_mask: alloc_pages_node mask. Set __GFP_NOMEMALLOC if not for network packet RX + * @skb: skb to set pfmemalloc on if __GFP_MEMALLOC is used + * + * Allocate a new page. + * + * %NULL is returned if there is no free memory. 
+ */ +static inline struct page *__skb_alloc_page(gfp_t gfp_mask, + struct sk_buff *skb) +{ + return __skb_alloc_pages(gfp_mask, skb, 0); +} + +/** + * skb_propagate_pfmemalloc - Propagate pfmemalloc if skb is allocated after RX page + * @page: The page that was allocated from skb_alloc_page + * @skb: The skb that may need pfmemalloc set + */ +static inline void skb_propagate_pfmemalloc(struct page *page, + struct sk_buff *skb) +{ + if (page && page->pfmemalloc) + skb->pfmemalloc = true; +} + /** * skb_frag_page - retrieve the page refered to by a paged fragment * @frag: the paged fragment -- cgit v0.10.2 From b4b9e3558508980fc0cd161a545ffb55a1f13ee9 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:26 -0700 Subject: netvm: set PF_MEMALLOC as appropriate during SKB processing In order to make sure pfmemalloc packets receive all memory needed to proceed, ensure processing of pfmemalloc SKBs happens under PF_MEMALLOC. This is limited to a subset of protocols that are expected to be used for writing to swap. Taps are not allowed to use PF_MEMALLOC as these are expected to communicate with userspace processes which could be paged out. [a.p.zijlstra@chello.nl: Ideas taken from various patches] [jslaby@suse.cz: Lock imbalance fix] Signed-off-by: Mel Gorman Acked-by: David S. Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/net/sock.h b/include/net/sock.h index 8119863..43a470d 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -754,8 +754,13 @@ static inline __must_check int sk_add_backlog(struct sock *sk, struct sk_buff *s return 0; } +extern int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb); + static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) { + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) + return __sk_backlog_rcv(sk, skb); + return sk->sk_backlog_rcv(sk, skb); } diff --git a/net/core/dev.c b/net/core/dev.c index 0ebaea1..ce13244 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3155,6 +3155,23 @@ void netdev_rx_handler_unregister(struct net_device *dev) } EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); +/* + * Limit the use of PFMEMALLOC reserves to those protocols that implement + * the special handling of PFMEMALLOC skbs. + */ +static bool skb_pfmemalloc_protocol(struct sk_buff *skb) +{ + switch (skb->protocol) { + case __constant_htons(ETH_P_ARP): + case __constant_htons(ETH_P_IP): + case __constant_htons(ETH_P_IPV6): + case __constant_htons(ETH_P_8021Q): + return true; + default: + return false; + } +} + static int __netif_receive_skb(struct sk_buff *skb) { struct packet_type *ptype, *pt_prev; @@ -3164,14 +3181,27 @@ static int __netif_receive_skb(struct sk_buff *skb) bool deliver_exact = false; int ret = NET_RX_DROP; __be16 type; + unsigned long pflags = current->flags; net_timestamp_check(!netdev_tstamp_prequeue, skb); trace_netif_receive_skb(skb); + /* + * PFMEMALLOC skbs are special, they should + * - be delivered to SOCK_MEMALLOC sockets only + * - stay away from userspace + * - have bounded memory usage + * + * Use PF_MEMALLOC as this saves us from propagating the allocation + * context down to all allocation sites. 
+ */ + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) + current->flags |= PF_MEMALLOC; + /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) - return NET_RX_DROP; + goto out; orig_dev = skb->dev; @@ -3191,7 +3221,7 @@ another_round: if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) { skb = vlan_untag(skb); if (unlikely(!skb)) - goto out; + goto unlock; } #ifdef CONFIG_NET_CLS_ACT @@ -3201,6 +3231,9 @@ another_round: } #endif + if (sk_memalloc_socks() && skb_pfmemalloc(skb)) + goto skip_taps; + list_for_each_entry_rcu(ptype, &ptype_all, list) { if (!ptype->dev || ptype->dev == skb->dev) { if (pt_prev) @@ -3209,13 +3242,18 @@ another_round: } } +skip_taps: #ifdef CONFIG_NET_CLS_ACT skb = handle_ing(skb, &pt_prev, &ret, orig_dev); if (!skb) - goto out; + goto unlock; ncls: #endif + if (sk_memalloc_socks() && skb_pfmemalloc(skb) + && !skb_pfmemalloc_protocol(skb)) + goto drop; + rx_handler = rcu_dereference(skb->dev->rx_handler); if (vlan_tx_tag_present(skb)) { if (pt_prev) { @@ -3225,7 +3263,7 @@ ncls: if (vlan_do_receive(&skb, !rx_handler)) goto another_round; else if (unlikely(!skb)) - goto out; + goto unlock; } if (rx_handler) { @@ -3235,7 +3273,7 @@ ncls: } switch (rx_handler(&skb)) { case RX_HANDLER_CONSUMED: - goto out; + goto unlock; case RX_HANDLER_ANOTHER: goto another_round; case RX_HANDLER_EXACT: @@ -3268,6 +3306,7 @@ ncls: else ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { +drop: atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); /* Jamal, now you will not able to escape explaining @@ -3276,8 +3315,10 @@ ncls: ret = NET_RX_DROP; } -out: +unlock: rcu_read_unlock(); +out: + tsk_restore_flags(current, pflags, PF_MEMALLOC); return ret; } diff --git a/net/core/sock.c b/net/core/sock.c index c8c5816..32fdcd2 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -298,6 +298,22 @@ void sk_clear_memalloc(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_clear_memalloc); +int __sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + int ret; + unsigned long pflags = current->flags; + + /* these should have been dropped before queueing */ + BUG_ON(!sock_flag(sk, SOCK_MEMALLOC)); + + current->flags |= PF_MEMALLOC; + ret = sk->sk_backlog_rcv(sk, skb); + tsk_restore_flags(current, pflags, PF_MEMALLOC); + + return ret; +} +EXPORT_SYMBOL(__sk_backlog_rcv); + #if defined(CONFIG_CGROUPS) #if !defined(CONFIG_NET_CLS_CGROUP) int net_cls_subsys_id = -1; -- cgit v0.10.2 From 381760eadc393bcb1bb328510ad75cf13431806d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:30 -0700 Subject: mm: micro-optimise slab to avoid a function call Getting and putting objects in SLAB currently requires a function call but the bulk of the work is related to PFMEMALLOC reserves which are only consumed when network-backed storage is critical. Use an inline function to determine if the function call is required. 
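The inline test is nearly free because sk_memalloc_socks() is backed by a static key (see the memalloc_socks definition in the net/core/sock.c hunks earlier in the series): the branch is patched out while no SOCK_MEMALLOC socket exists and only becomes a real test after sk_set_memalloc() increments the key. A rough, kernel-style sketch of that general pattern, with invented names (rare_work(), enable_rare_path()) standing in for the real slab helpers:

/* Sketch of the static-key pattern behind sk_memalloc_socks(); the API
 * (struct static_key, static_key_false, static_key_slow_inc/dec) comes
 * from <linux/jump_label.h>, the function names here are placeholders.
 */
static struct static_key rare_path_key = STATIC_KEY_INIT_FALSE;

static void rare_work(void)
{
        /* out-of-line handling, e.g. the pfmemalloc checks */
}

static inline void hot_path(void)
{
        /* compiles to a patched no-op branch while the key is false */
        if (static_key_false(&rare_path_key))
                rare_work();
}

static void enable_rare_path(void)
{
        static_key_slow_inc(&rare_path_key);    /* e.g. from sk_set_memalloc() */
}

static void disable_rare_path(void)
{
        static_key_slow_dec(&rare_path_key);    /* e.g. from sk_clear_memalloc() */
}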
Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 77be18d..f8b0d53 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -118,6 +118,8 @@ #include #include +#include + #include #include #include @@ -965,7 +967,7 @@ out: spin_unlock_irqrestore(&l3->list_lock, flags); } -static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, +static void *__ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, gfp_t flags, bool force_refill) { int i; @@ -1012,7 +1014,20 @@ static void *ac_get_obj(struct kmem_cache *cachep, struct array_cache *ac, return objp; } -static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, +static inline void *ac_get_obj(struct kmem_cache *cachep, + struct array_cache *ac, gfp_t flags, bool force_refill) +{ + void *objp; + + if (unlikely(sk_memalloc_socks())) + objp = __ac_get_obj(cachep, ac, flags, force_refill); + else + objp = ac->entry[--ac->avail]; + + return objp; +} + +static void *__ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, void *objp) { if (unlikely(pfmemalloc_active)) { @@ -1022,6 +1037,15 @@ static void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, set_obj_pfmemalloc(&objp); } + return objp; +} + +static inline void ac_put_obj(struct kmem_cache *cachep, struct array_cache *ac, + void *objp) +{ + if (unlikely(sk_memalloc_socks())) + objp = __ac_put_obj(cachep, ac, objp); + ac->entry[ac->avail++] = objp; } -- cgit v0.10.2 From 7f338fe4540b1d0600b02314c7d885fd358e9eca Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:32 -0700 Subject: nbd: set SOCK_MEMALLOC for access to PFMEMALLOC reserves Set SOCK_MEMALLOC on the NBD socket to allow access to PFMEMALLOC reserves so pages backed by NBD, particularly if swap related, can be cleaned to prevent the machine being deadlocked. It is still possible that the PFMEMALLOC reserves get depleted resulting in deadlock but this can be resolved by the administrator by increasing min_free_kbytes. 
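Condensed, the nbd.c hunks that follow take three steps around the socket I/O path: tag the socket with SOCK_MEMALLOC, let its allocations dip into the reserves, and run the transmit path itself under PF_MEMALLOC. A rough sketch of that shape gathered into one function (kernel-style C, not a complete driver and not buildable outside a kernel tree; emergency_xmit is a made-up name, and the real patch calls sk_set_memalloc() once when the nbd server thread starts in nbd_do_it() rather than on every transmit):

    #include <linux/net.h>
    #include <linux/uio.h>
    #include <linux/sched.h>
    #include <net/sock.h>

    static int emergency_xmit(struct socket *sock, void *buf, size_t size)
    {
        struct msghdr msg = { .msg_flags = MSG_NOSIGNAL };
        struct kvec iov = { .iov_base = buf, .iov_len = size };
        unsigned long pflags = current->flags;
        int result;

        /* allow this socket and this task to use the PFMEMALLOC reserves */
        sk_set_memalloc(sock->sk);
        sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
        current->flags |= PF_MEMALLOC;

        result = kernel_sendmsg(sock, &msg, &iov, 1, size);

        /* drop PF_MEMALLOC again unless the caller already had it set */
        tsk_restore_flags(current, pflags, PF_MEMALLOC);

        return result;
    }

Everything else in the nbd diff is the existing signal handling; the memalloc-related additions are essentially these calls plus the flag restore.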
Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 061427a..76bc96f 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -154,6 +154,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, struct msghdr msg; struct kvec iov; sigset_t blocked, oldset; + unsigned long pflags = current->flags; if (unlikely(!sock)) { dev_err(disk_to_dev(nbd->disk), @@ -167,8 +168,9 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, siginitsetinv(&blocked, sigmask(SIGKILL)); sigprocmask(SIG_SETMASK, &blocked, &oldset); + current->flags |= PF_MEMALLOC; do { - sock->sk->sk_allocation = GFP_NOIO; + sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; iov.iov_base = buf; iov.iov_len = size; msg.msg_name = NULL; @@ -214,6 +216,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, } while (size > 0); sigprocmask(SIG_SETMASK, &oldset, NULL); + tsk_restore_flags(current, pflags, PF_MEMALLOC); return result; } @@ -405,6 +408,7 @@ static int nbd_do_it(struct nbd_device *nbd) BUG_ON(nbd->magic != NBD_MAGIC); + sk_set_memalloc(nbd->sock->sk); nbd->pid = task_pid_nr(current); ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { -- cgit v0.10.2 From 5515061d22f0f9976ae7815864bfd22042d36848 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:35 -0700 Subject: mm: throttle direct reclaimers if PF_MEMALLOC reserves are low and swap is backed by network storage If swap is backed by network storage such as NBD, there is a risk that a large number of reclaimers can hang the system by consuming all PF_MEMALLOC reserves. To avoid these hangs, the administrator must tune min_free_kbytes in advance which is a bit fragile. This patch throttles direct reclaimers if half the PF_MEMALLOC reserves are in use. If the system is routinely getting throttled the system administrator can increase min_free_kbytes so degradation is smoother but the system will keep running. 
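To put a number on "half the PF_MEMALLOC reserves": the reserve counted here is roughly min_free_kbytes worth of pages, summed over the zones up to ZONE_NORMAL, so with 4 KiB pages and, say, min_free_kbytes = 65536, direct reclaimers start sleeping on pfmemalloc_wait once free pages drop below about 8192 pages (32 MiB). A throwaway userspace calculation under those assumptions (65536 is only an example value, not a default this patch sets, and highmem zones are ignored):

    #include <stdio.h>

    int main(void)
    {
        unsigned long min_free_kbytes = 65536;  /* example tuning */
        unsigned long page_kb = 4;              /* 4 KiB pages */
        unsigned long reserve_pages = min_free_kbytes / page_kb;
        unsigned long throttle_below = reserve_pages / 2;

        printf("reserve ~%lu pages; throttle when free < ~%lu pages (~%lu MiB)\n",
               reserve_pages, throttle_below,
               throttle_below * page_kb / 1024);
        return 0;
    }

Raising min_free_kbytes raises that floor, which is exactly the knob the changelog points the administrator at.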
Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 64b2c3a..2daa54f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -705,6 +705,7 @@ typedef struct pglist_data { range, including holes */ int node_id; wait_queue_head_t kswapd_wait; + wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ int kswapd_max_order; enum zone_type classzone_idx; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef2d1e7..48aee0f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4389,6 +4389,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, pgdat_resize_init(pgdat); pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); + init_waitqueue_head(&pgdat->pfmemalloc_wait); pgdat->kswapd_max_order = 0; pgdat_page_cgroup_init(pgdat); diff --git a/mm/vmscan.c b/mm/vmscan.c index 6b1f89a..021a44a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2112,6 +2112,80 @@ out: return 0; } +static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) +{ + struct zone *zone; + unsigned long pfmemalloc_reserve = 0; + unsigned long free_pages = 0; + int i; + bool wmark_ok; + + for (i = 0; i <= ZONE_NORMAL; i++) { + zone = &pgdat->node_zones[i]; + pfmemalloc_reserve += min_wmark_pages(zone); + free_pages += zone_page_state(zone, NR_FREE_PAGES); + } + + wmark_ok = free_pages > pfmemalloc_reserve / 2; + + /* kswapd must be awake if processes are being throttled */ + if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) { + pgdat->classzone_idx = min(pgdat->classzone_idx, + (enum zone_type)ZONE_NORMAL); + wake_up_interruptible(&pgdat->kswapd_wait); + } + + return wmark_ok; +} + +/* + * Throttle direct reclaimers if backing storage is backed by the network + * and the PFMEMALLOC reserve for the preferred node is getting dangerously + * depleted. kswapd will continue to make progress and wake the processes + * when the low watermark is reached + */ +static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, + nodemask_t *nodemask) +{ + struct zone *zone; + int high_zoneidx = gfp_zone(gfp_mask); + pg_data_t *pgdat; + + /* + * Kernel threads should not be throttled as they may be indirectly + * responsible for cleaning pages necessary for reclaim to make forward + * progress. kjournald for example may enter direct reclaim while + * committing a transaction where throttling it could forcing other + * processes to block on log_wait_commit(). + */ + if (current->flags & PF_KTHREAD) + return; + + /* Check if the pfmemalloc reserves are ok */ + first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + return; + + /* + * If the caller cannot enter the filesystem, it's possible that it + * is due to the caller holding an FS lock or performing a journal + * transaction in the case of a filesystem like ext[3|4]. In this case, + * it is not safe to block on pfmemalloc_wait as kswapd could be + * blocked waiting on the same lock. Instead, throttle for up to a + * second before continuing. 
+ */ + if (!(gfp_mask & __GFP_FS)) { + wait_event_interruptible_timeout(pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat), HZ); + return; + } + + /* Throttle until kswapd wakes the process */ + wait_event_killable(zone->zone_pgdat->pfmemalloc_wait, + pfmemalloc_watermark_ok(pgdat)); +} + unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask) { @@ -2131,6 +2205,15 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, .gfp_mask = sc.gfp_mask, }; + throttle_direct_reclaim(gfp_mask, zonelist, nodemask); + + /* + * Do not enter reclaim if fatal signal is pending. 1 is returned so + * that the page allocator does not consider triggering OOM + */ + if (fatal_signal_pending(current)) + return 1; + trace_mm_vmscan_direct_reclaim_begin(order, sc.may_writepage, gfp_mask); @@ -2275,8 +2358,13 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, return balanced_pages >= (present_pages >> 2); } -/* is kswapd sleeping prematurely? */ -static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, +/* + * Prepare kswapd for sleeping. This verifies that there are no processes + * waiting in throttle_direct_reclaim() and that watermarks have been met. + * + * Returns true if kswapd is ready to sleep + */ +static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, int classzone_idx) { int i; @@ -2285,7 +2373,21 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) - return true; + return false; + + /* + * There is a potential race between when kswapd checks its watermarks + * and a process gets throttled. There is also a potential race if + * processes get throttled, kswapd wakes, a large process exits therby + * balancing the zones that causes kswapd to miss a wakeup. If kswapd + * is going to sleep, no process should be sleeping on pfmemalloc_wait + * so wake them now if necessary. If necessary, processes will wake + * kswapd and get throttled again + */ + if (waitqueue_active(&pgdat->pfmemalloc_wait)) { + wake_up(&pgdat->pfmemalloc_wait); + return false; + } /* Check the watermark levels */ for (i = 0; i <= classzone_idx; i++) { @@ -2318,9 +2420,9 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, * must be balanced */ if (order) - return !pgdat_balanced(pgdat, balanced, classzone_idx); + return pgdat_balanced(pgdat, balanced, classzone_idx); else - return !all_zones_ok; + return all_zones_ok; } /* @@ -2546,6 +2648,16 @@ loop_again: } } + + /* + * If the low watermark is met there is no need for processes + * to be throttled on pfmemalloc_wait as they should not be + * able to safely make forward progress. Wake them + */ + if (waitqueue_active(&pgdat->pfmemalloc_wait) && + pfmemalloc_watermark_ok(pgdat)) + wake_up(&pgdat->pfmemalloc_wait); + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* @@ -2647,7 +2759,7 @@ out: } /* - * Return the order we were reclaiming at so sleeping_prematurely() + * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. 
However, * if another caller entered the allocator slow path while kswapd * was awake, order will remain at the higher level @@ -2667,7 +2779,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -2677,7 +2789,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. */ - if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { + if (prepare_kswapd_sleep(pgdat, order, remaining, classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* -- cgit v0.10.2 From 68243e76ee343d63c6cf76978588a885951e2818 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:39 -0700 Subject: mm: account for the number of times direct reclaimers get throttled Under significant pressure when writing back to network-backed storage, direct reclaimers may get throttled. This is expected to be a short-lived event and the processes get woken up again but processes do get stalled. This patch counts how many times such stalling occurs. It's up to the administrator whether to reduce these stalls by increasing min_free_kbytes. Signed-off-by: Mel Gorman Cc: David Miller Cc: Neil Brown Cc: Peter Zijlstra Cc: Mike Christie Cc: Eric B Munson Cc: Eric Dumazet Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 06f8e38..57f7b10 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -30,6 +30,7 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, FOR_ALL_ZONES(PGSTEAL_DIRECT), FOR_ALL_ZONES(PGSCAN_KSWAPD), FOR_ALL_ZONES(PGSCAN_DIRECT), + PGSCAN_DIRECT_THROTTLE, #ifdef CONFIG_NUMA PGSCAN_ZONE_RECLAIM_FAILED, #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 021a44a..8880401 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2167,6 +2167,9 @@ static void throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (pfmemalloc_watermark_ok(pgdat)) return; + /* Account for the throttling */ + count_vm_event(PGSCAN_DIRECT_THROTTLE); + /* * If the caller cannot enter the filesystem, it's possible that it * is due to the caller holding an FS lock or performing a journal diff --git a/mm/vmstat.c b/mm/vmstat.c index 1bbbbd9..df7a674 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -745,6 +745,7 @@ const char * const vmstat_text[] = { TEXTS_FOR_ZONES("pgsteal_direct") TEXTS_FOR_ZONES("pgscan_kswapd") TEXTS_FOR_ZONES("pgscan_direct") + "pgscan_direct_throttle", #ifdef CONFIG_NUMA "zone_reclaim_failed", -- cgit v0.10.2 From c76562b6709fee5eff8a6a779be41c0bce661fd7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:41 -0700 Subject: netvm: prevent a stream-specific deadlock This patch series is based on top of "Swap-over-NBD without deadlocking v15" as it depends on the same reservation of PF_MEMALLOC reserves logic. 
When a user or administrator requires swap for their application, they create a swap partition or file, format it with mkswap and activate it with swapon. In diskless systems this is not an option, so if swap is required then swapping over the network is considered. The two likely scenarios are blade servers used as part of a cluster, where the form factor or maintenance costs do not allow the use of disks, and thin clients. The Linux Terminal Server Project recommends the use of the Network Block Device (NBD) for swap but this is not always an option. There is no guarantee that the network attached storage (NAS) device is running Linux or supports NBD. However, it is likely that it supports NFS, so there are users that want support for swapping over NFS despite any performance concern. Some distributions currently carry patches that support swapping over NFS but it would be preferable to support it in the mainline kernel.

Patch 1 avoids a stream-specific deadlock that potentially affects TCP.

Patch 2 is a small modification to SELinux to avoid using PFMEMALLOC reserves.

Patch 3 adds three helpers for filesystems to handle swap cache pages. For example, page_file_mapping() returns page->mapping for file-backed pages and the address_space of the underlying swap file for swap cache pages.

Patch 4 adds two address_space_operations to allow a filesystem to pin all metadata relevant to a swapfile in memory. Upon successful activation, the swapfile is marked SWP_FILE and the address space operation ->direct_IO is used for writing and ->readpage for reading in swap pages.

Patch 5 notes that patch 3 is bolting filesystem-specific-swapfile-support onto the side and that the default handlers have different information to what is available to the filesystem. This patch refactors the code so that there are generic handlers for each of the new address_space operations.

Patch 6 adds an API to allow a vector of kernel addresses to be translated to struct pages and pinned for IO.

Patch 7 adds support for using highmem pages for swap by kmapping the pages before calling the direct_IO handler.

Patch 8 updates NFS to use the helpers from patch 3 where necessary.

Patch 9 avoids setting PG_private on PG_swapcache pages within NFS.

Patch 10 implements the new swapfile-related address_space operations for NFS and teaches the direct IO handler how to manage kernel addresses.

Patch 11 prevents page allocator recursions in NFS by using GFP_NOIO where appropriate.

Patch 12 fixes a NULL pointer dereference that occurs when using swap-over-NFS.

With the patches applied, it is possible to mount a swapfile that is on an NFS filesystem. Swap performance is not great, with a swap stress test taking roughly twice as long to complete as it does when the swap device is backed by NBD.

This patch: netvm: prevent a stream-specific deadlock

It could happen that all !SOCK_MEMALLOC sockets have buffered so much data that we're over the global rmem limit. This will prevent SOCK_MEMALLOC buffers from receiving data, which will prevent userspace from running, which is needed to reduce the buffered data. Fix this by exempting the SOCK_MEMALLOC sockets from the rmem limit. Once this change is applied, it is important that sockets that set SOCK_MEMALLOC do not clear the flag until the socket is being torn down. If this happens, a warning is generated and the tokens reclaimed to avoid accounting errors until the bug is fixed. 
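In caller terms, the exemption means a protocol's receive path keeps charging receive memory as before, but an skb that came out of the reserves is never refused for accounting reasons. A simplified sketch of such a queueing function after this change (kernel-style C; queue_emergency_rcv is a placeholder name, and real callers such as sock_queue_rcv_skb() also apply the socket filter and an explicit sk_rcvbuf check omitted here):

    #include <linux/skbuff.h>
    #include <net/sock.h>

    static int queue_emergency_rcv(struct sock *sk, struct sk_buff *skb)
    {
        /*
         * Ordinary skbs must fit within the rmem budget; pfmemalloc skbs
         * are exempt so a SOCK_MEMALLOC socket that is backing swap can
         * always make forward progress.
         */
        if (!sk_rmem_schedule(sk, skb, skb->truesize)) {
            atomic_inc(&sk->sk_drops);
            return -ENOBUFS;
        }

        skb_set_owner_r(skb, sk);
        skb_queue_tail(&sk->sk_receive_queue, skb);
        sk->sk_data_ready(sk, skb->len);
        return 0;
    }

The diff below makes exactly this possible by passing the skb into sk_rmem_schedule() so it can test skb_pfmemalloc() alongside the normal forward-alloc accounting.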
[davem@davemloft.net: Warning about clearing SOCK_MEMALLOC] Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Acked-by: David S. Miller Acked-by: Rik van Riel Cc: Trond Myklebust Cc: Neil Brown Cc: Christoph Hellwig Cc: Mike Christie Cc: Eric B Munson Cc: Sebastian Andrzej Siewior Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/net/sock.h b/include/net/sock.h index 43a470d..b373023 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1329,12 +1329,14 @@ static inline bool sk_wmem_schedule(struct sock *sk, int size) __sk_mem_schedule(sk, size, SK_MEM_SEND); } -static inline bool sk_rmem_schedule(struct sock *sk, int size) +static inline bool +sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, unsigned int size) { if (!sk_has_account(sk)) return true; - return size <= sk->sk_forward_alloc || - __sk_mem_schedule(sk, size, SK_MEM_RECV); + return size<= sk->sk_forward_alloc || + __sk_mem_schedule(sk, size, SK_MEM_RECV) || + skb_pfmemalloc(skb); } static inline void sk_mem_reclaim(struct sock *sk) diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 78f1cda..095259f 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -141,7 +141,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) err = sk_filter(sk, skb); if (err) return err; - if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); net_dbg_ratelimited("sending flow OFF due to rmem_schedule\n"); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); diff --git a/net/core/sock.c b/net/core/sock.c index 32fdcd2..6b654b3 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -295,6 +295,18 @@ void sk_clear_memalloc(struct sock *sk) sock_reset_flag(sk, SOCK_MEMALLOC); sk->sk_allocation &= ~__GFP_MEMALLOC; static_key_slow_dec(&memalloc_socks); + + /* + * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward + * progress of swapping. However, if SOCK_MEMALLOC is cleared while + * it has rmem allocations there is a risk that the user of the + * socket cannot make forward progress due to exceeding the rmem + * limits. By rights, sk_clear_memalloc() should only be called + * on sockets being torn down but warn and reset the accounting if + * that assumption breaks. 
+ */ + if (WARN_ON(sk->sk_forward_alloc)) + sk_mem_reclaim(sk); } EXPORT_SYMBOL_GPL(sk_clear_memalloc); @@ -396,7 +408,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (err) return err; - if (!sk_rmem_schedule(sk, skb->truesize)) { + if (!sk_rmem_schedule(sk, skb, skb->truesize)) { atomic_inc(&sk->sk_drops); return -ENOBUFS; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index a356e1f..00b91b4 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4351,19 +4351,20 @@ static void tcp_ofo_queue(struct sock *sk) static bool tcp_prune_ofo_queue(struct sock *sk); static int tcp_prune_queue(struct sock *sk); -static int tcp_try_rmem_schedule(struct sock *sk, unsigned int size) +static int tcp_try_rmem_schedule(struct sock *sk, struct sk_buff *skb, + unsigned int size) { if (atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || - !sk_rmem_schedule(sk, size)) { + !sk_rmem_schedule(sk, skb, size)) { if (tcp_prune_queue(sk) < 0) return -1; - if (!sk_rmem_schedule(sk, size)) { + if (!sk_rmem_schedule(sk, skb, size)) { if (!tcp_prune_ofo_queue(sk)) return -1; - if (!sk_rmem_schedule(sk, size)) + if (!sk_rmem_schedule(sk, skb, size)) return -1; } } @@ -4418,7 +4419,7 @@ static void tcp_data_queue_ofo(struct sock *sk, struct sk_buff *skb) TCP_ECN_check_ce(tp, skb); - if (unlikely(tcp_try_rmem_schedule(sk, skb->truesize))) { + if (unlikely(tcp_try_rmem_schedule(sk, skb, skb->truesize))) { NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPOFODROP); __kfree_skb(skb); return; @@ -4552,17 +4553,17 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size) { - struct sk_buff *skb; + struct sk_buff *skb = NULL; struct tcphdr *th; bool fragstolen; - if (tcp_try_rmem_schedule(sk, size + sizeof(*th))) - goto err; - skb = alloc_skb(size + sizeof(*th), sk->sk_allocation); if (!skb) goto err; + if (tcp_try_rmem_schedule(sk, skb, size + sizeof(*th))) + goto err_free; + th = (struct tcphdr *)skb_put(skb, sizeof(*th)); skb_reset_transport_header(skb); memset(th, 0, sizeof(*th)); @@ -4633,7 +4634,7 @@ static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) if (eaten <= 0) { queue_and_out: if (eaten < 0 && - tcp_try_rmem_schedule(sk, skb->truesize)) + tcp_try_rmem_schedule(sk, skb, skb->truesize)) goto drop; eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen); diff --git a/net/sctp/ulpevent.c b/net/sctp/ulpevent.c index 33d8947..10c018a 100644 --- a/net/sctp/ulpevent.c +++ b/net/sctp/ulpevent.c @@ -702,7 +702,8 @@ struct sctp_ulpevent *sctp_ulpevent_make_rcvmsg(struct sctp_association *asoc, if (rx_count >= asoc->base.sk->sk_rcvbuf) { if ((asoc->base.sk->sk_userlocks & SOCK_RCVBUF_LOCK) || - (!sk_rmem_schedule(asoc->base.sk, chunk->skb->truesize))) + (!sk_rmem_schedule(asoc->base.sk, chunk->skb, + chunk->skb->truesize))) goto fail; } -- cgit v0.10.2 From 6290c2c439732727899405f39fb76c2f5585b707 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:44 -0700 Subject: selinux: tag avc cache alloc as non-critical Failing to allocate a cache entry will only harm performance not correctness. Do not consume valuable reserve pages for something like that. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Acked-by: Eric Paris Acked-by: Rik van Riel Cc: James Morris Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Eric B Munson Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 68d82da..4d3fab4 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -274,7 +274,7 @@ static struct avc_node *avc_alloc_node(void) { struct avc_node *node; - node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC); + node = kmem_cache_zalloc(avc_node_cachep, GFP_ATOMIC|__GFP_NOMEMALLOC); if (!node) goto out; -- cgit v0.10.2 From f981c5950fa85916ba49bea5d9a7a5078f47e569 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:47 -0700 Subject: mm: methods for teaching filesystems about PG_swapcache pages In order to teach filesystems to handle swap cache pages, three new page functions are introduced: pgoff_t page_file_index(struct page *); loff_t page_file_offset(struct page *); struct address_space *page_file_mapping(struct page *); page_file_index() - gives the offset of this page in the file in PAGE_CACHE_SIZE blocks. Like page->index is for mapped pages, this function also gives the correct index for PG_swapcache pages. page_file_offset() - uses page_file_index(), so that it will give the expected result, even for PG_swapcache pages. page_file_mapping() - gives the mapping backing the actual page; that is for swap cache pages it will give swap_file->f_mapping. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index 7c6dfd2..7cdac16 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -805,6 +805,17 @@ static inline void *page_rmapping(struct page *page) return (void *)((unsigned long)page->mapping & ~PAGE_MAPPING_FLAGS); } +extern struct address_space *__page_file_mapping(struct page *); + +static inline +struct address_space *page_file_mapping(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return __page_file_mapping(page); + + return page->mapping; +} + static inline int PageAnon(struct page *page) { return ((unsigned long)page->mapping & PAGE_MAPPING_ANON) != 0; @@ -821,6 +832,20 @@ static inline pgoff_t page_index(struct page *page) return page->index; } +extern pgoff_t __page_file_index(struct page *page); + +/* + * Return the file index of the page. Regular pagecache pages use ->index + * whereas swapcache pages use swp_offset(->private) + */ +static inline pgoff_t page_file_index(struct page *page) +{ + if (unlikely(PageSwapCache(page))) + return __page_file_index(page); + + return page->index; +} + /* * Return true if this page is mapped into pagetables. 
*/ diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 7cfad3b..e42c762 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -286,6 +286,11 @@ static inline loff_t page_offset(struct page *page) return ((loff_t)page->index) << PAGE_CACHE_SHIFT; } +static inline loff_t page_file_offset(struct page *page) +{ + return ((loff_t)page_file_index(page)) << PAGE_CACHE_SHIFT; +} + extern pgoff_t linear_hugepage_index(struct vm_area_struct *vma, unsigned long address); diff --git a/include/linux/swap.h b/include/linux/swap.h index 9a16bb1..e62425d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -356,6 +356,7 @@ extern unsigned int count_swap_pages(int, int); extern sector_t map_swap_page(struct page *, struct block_device **); extern sector_t swapdev_block(int, pgoff_t); extern int page_swapcount(struct page *); +extern struct swap_info_struct *page_swap_info(struct page *); extern int reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; diff --git a/mm/swapfile.c b/mm/swapfile.c index 71373d0..f89af5b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -2285,6 +2286,31 @@ int swapcache_prepare(swp_entry_t entry) return __swap_duplicate(entry, SWAP_HAS_CACHE); } +struct swap_info_struct *page_swap_info(struct page *page) +{ + swp_entry_t swap = { .val = page_private(page) }; + BUG_ON(!PageSwapCache(page)); + return swap_info[swp_type(swap)]; +} + +/* + * out-of-line __page_file_ methods to avoid include hell. + */ +struct address_space *__page_file_mapping(struct page *page) +{ + VM_BUG_ON(!PageSwapCache(page)); + return page_swap_info(page)->swap_file->f_mapping; +} +EXPORT_SYMBOL_GPL(__page_file_mapping); + +pgoff_t __page_file_index(struct page *page) +{ + swp_entry_t swap = { .val = page_private(page) }; + VM_BUG_ON(!PageSwapCache(page)); + return swp_offset(swap); +} +EXPORT_SYMBOL_GPL(__page_file_index); + /* * add_swap_count_continuation - called when a swap count is duplicated * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's -- cgit v0.10.2 From 18022c5d8627a7a9ba8097a0f238b513fae6f5b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:51 -0700 Subject: mm: add get_kernel_page[s] for pinning of kernel addresses for I/O This patch adds two new APIs get_kernel_pages() and get_kernel_page() that may be used to pin a vector of kernel addresses for IO. The initial user is expected to be NFS for allowing pages to be written to swap using aops->direct_IO(). Strictly speaking, swap-over-NFS only needs to pin one page for IO but it makes sense to express the API in terms of a vector and add a helper for pinning single pages. Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Cc: Mark Salter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0edb65d..7b7ac9c 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -160,6 +160,7 @@ enum rq_flag_bits { __REQ_FLUSH_SEQ, /* request for flush sequence */ __REQ_IO_STAT, /* account I/O stat */ __REQ_MIXED_MERGE, /* merge of different types, fail separately */ + __REQ_KERNEL, /* direct IO to kernel pages */ __REQ_NR_BITS, /* stops here */ }; @@ -201,5 +202,6 @@ enum rq_flag_bits { #define REQ_IO_STAT (1 << __REQ_IO_STAT) #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) #define REQ_SECURE (1 << __REQ_SECURE) +#define REQ_KERNEL (1 << __REQ_KERNEL) #endif /* __LINUX_BLK_TYPES_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 8fabb03..9d77309d 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -165,6 +165,8 @@ struct inodes_stat_t { #define READ 0 #define WRITE RW_MASK #define READA RWA_MASK +#define KERNEL_READ (READ|REQ_KERNEL) +#define KERNEL_WRITE (WRITE|REQ_KERNEL) #define READ_SYNC (READ | REQ_SYNC) #define WRITE_SYNC (WRITE | REQ_SYNC | REQ_NOIDLE) diff --git a/include/linux/mm.h b/include/linux/mm.h index 7cdac16..bd079a1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1019,6 +1019,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, struct page **pages, struct vm_area_struct **vmas); int get_user_pages_fast(unsigned long start, int nr_pages, int write, struct page **pages); +struct kvec; +int get_kernel_pages(const struct kvec *iov, int nr_pages, int write, + struct page **pages); +int get_kernel_page(unsigned long start, int write, struct page **pages); struct page *get_dump_page(unsigned long addr); extern int try_to_release_page(struct page * page, gfp_t gfp_mask); diff --git a/mm/swap.c b/mm/swap.c index 4e7e2ec..7d7f80c 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -236,6 +236,59 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); +/* + * get_kernel_pages() - pin kernel pages in memory + * @kiov: An array of struct kvec structures + * @nr_segs: number of segments to pin + * @write: pinning for read/write, currently ignored + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_segs long. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. + */ +int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, + struct page **pages) +{ + int seg; + + for (seg = 0; seg < nr_segs; seg++) { + if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) + return seg; + + /* virt_to_page sanity checks the PFN */ + pages[seg] = virt_to_page(kiov[seg].iov_base); + page_cache_get(pages[seg]); + } + + return seg; +} +EXPORT_SYMBOL_GPL(get_kernel_pages); + +/* + * get_kernel_page() - pin a kernel page in memory + * @start: starting kernel address + * @write: pinning for read/write, currently ignored + * @pages: array that receives pointer to the page pinned. + * Must be at least nr_segs long. + * + * Returns 1 if page is pinned. If the page was not pinned, returns + * -errno. 
The page returned must be released with a put_page() call + * when it is finished with. + */ +int get_kernel_page(unsigned long start, int write, struct page **pages) +{ + const struct kvec kiov = { + .iov_base = (void *)start, + .iov_len = PAGE_SIZE + }; + + return get_kernel_pages(&kiov, 1, write, pages); +} +EXPORT_SYMBOL_GPL(get_kernel_page); + static void pagevec_lru_move_fn(struct pagevec *pvec, void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg), void *arg) -- cgit v0.10.2 From 62c230bc1790923a1b35da03596a68a6c9b5b100 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:55 -0700 Subject: mm: add support for a filesystem to activate swap files and use direct_IO for writing swap pages Currently swapfiles are managed entirely by the core VM by using ->bmap to allocate space and write to the blocks directly. This effectively ensures that the underlying blocks are allocated and avoids the need for the swap subsystem to locate what physical blocks store offsets within a file. If the swap subsystem is to use the filesystem information to locate the blocks, it is critical that information such as block groups, block bitmaps and the block descriptor table that map the swap file were resident in memory. This patch adds address_space_operations that the VM can call when activating or deactivating swap backed by a file. int swap_activate(struct file *); int swap_deactivate(struct file *); The ->swap_activate() method is used to communicate to the file that the VM relies on it, and the address_space should take adequate measures such as reserving space in the underlying device, reserving memory for mempools and pinning information such as the block descriptor table in memory. The ->swap_deactivate() method is called on sys_swapoff() if ->swap_activate() returned success. After a successful swapfile ->swap_activate, the swapfile is marked SWP_FILE and swapper_space.a_ops will proxy to sis->swap_file->f_mappings->a_ops using ->direct_io to write swapcache pages and ->readpage to read. It is perfectly possible that direct_IO be used to read the swap pages but it is an unnecessary complication. Similarly, it is possible that ->writepage be used instead of direct_io to write the pages but filesystem developers have stated that calling writepage from the VM is undesirable for a variety of reasons and using direct_IO opens up the possibility of writing back batches of swap pages in the future. [a.p.zijlstra@chello.nl: Original patch] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index e0cce2a..2db1900 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -206,6 +206,8 @@ prototypes: int (*launder_page)(struct page *); int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); + int (*swap_activate)(struct file *); + int (*swap_deactivate)(struct file *); locking rules: All except set_page_dirty and freepage may block @@ -229,6 +231,8 @@ migratepage: yes (both) launder_page: yes is_partially_uptodate: yes error_remove_page: yes +swap_activate: no +swap_deactivate: no ->write_begin(), ->write_end(), ->sync_page() and ->readpage() may be called from the request handler (/dev/loop). @@ -330,6 +334,15 @@ cleaned, or an error value if not. Note that in order to prevent the page getting mapped back in and redirtied, it needs to be kept locked across the entire operation. + ->swap_activate will be called with a non-zero argument on +files backing (non block device backed) swapfiles. A return value +of zero indicates success, in which case this file can be used for +backing swapspace. The swapspace operations will be proxied to the +address space operations. + + ->swap_deactivate() will be called in the sys_swapoff() +path after ->swap_activate() returned success. + ----------------------- file_lock_operations ------------------------------ prototypes: void (*fl_copy_lock)(struct file_lock *, struct file_lock *); diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index aa754e0..065aa2d 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -592,6 +592,8 @@ struct address_space_operations { int (*migratepage) (struct page *, struct page *); int (*launder_page) (struct page *); int (*error_remove_page) (struct mapping *mapping, struct page *page); + int (*swap_activate)(struct file *); + int (*swap_deactivate)(struct file *); }; writepage: called by the VM to write a dirty page to backing store. @@ -760,6 +762,16 @@ struct address_space_operations { Setting this implies you deal with pages going away under you, unless you have them locked or reference counts increased. + swap_activate: Called when swapon is used on a file to allocate + space if necessary and pin the block lookup information in + memory. A return value of zero indicates success, + in which case this file can be used to back swapspace. The + swapspace operations will be proxied to this address space's + ->swap_{out,in} methods. + + swap_deactivate: Called during swapoff on files where swap_activate + was successful. 
+ The File Object =============== diff --git a/include/linux/fs.h b/include/linux/fs.h index 9d77309d..38356ab 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -638,6 +638,10 @@ struct address_space_operations { int (*is_partially_uptodate) (struct page *, read_descriptor_t *, unsigned long); int (*error_remove_page)(struct address_space *, struct page *); + + /* swapfile support */ + int (*swap_activate)(struct file *file); + int (*swap_deactivate)(struct file *file); }; extern const struct address_space_operations empty_aops; diff --git a/include/linux/swap.h b/include/linux/swap.h index e62425d..ab230b1 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -151,6 +151,7 @@ enum { SWP_SOLIDSTATE = (1 << 4), /* blkdev seeks are cheap */ SWP_CONTINUED = (1 << 5), /* swap_map has count continuation */ SWP_BLKDEV = (1 << 6), /* its a block device */ + SWP_FILE = (1 << 7), /* set after swap_activate success */ /* add others here before... */ SWP_SCANNING = (1 << 8), /* refcount in scan_swap_map */ }; @@ -320,6 +321,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t ent) /* linux/mm/page_io.c */ extern int swap_readpage(struct page *); extern int swap_writepage(struct page *page, struct writeback_control *wbc); +extern int swap_set_page_dirty(struct page *page); extern void end_swap_bio_read(struct bio *bio, int err); /* linux/mm/swap_state.c */ diff --git a/mm/page_io.c b/mm/page_io.c index 34f0292..307a3e7 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -94,6 +95,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret = 0, rw = WRITE; + struct swap_info_struct *sis = page_swap_info(page); if (try_to_free_swap(page)) { unlock_page(page); @@ -105,6 +107,32 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) end_page_writeback(page); goto out; } + + if (sis->flags & SWP_FILE) { + struct kiocb kiocb; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct iovec iov = { + .iov_base = page_address(page), + .iov_len = PAGE_SIZE, + }; + + init_sync_kiocb(&kiocb, swap_file); + kiocb.ki_pos = page_file_offset(page); + kiocb.ki_left = PAGE_SIZE; + kiocb.ki_nbytes = PAGE_SIZE; + + unlock_page(page); + ret = mapping->a_ops->direct_IO(KERNEL_WRITE, + &kiocb, &iov, + kiocb.ki_pos, 1); + if (ret == PAGE_SIZE) { + count_vm_event(PSWPOUT); + ret = 0; + } + return ret; + } + bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); @@ -126,6 +154,7 @@ int swap_readpage(struct page *page) { struct bio *bio; int ret = 0; + struct swap_info_struct *sis = page_swap_info(page); VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); @@ -134,6 +163,17 @@ int swap_readpage(struct page *page) unlock_page(page); goto out; } + + if (sis->flags & SWP_FILE) { + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + + ret = mapping->a_ops->readpage(swap_file, page); + if (!ret) + count_vm_event(PSWPIN); + return ret; + } + bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); @@ -145,3 +185,15 @@ int swap_readpage(struct page *page) out: return ret; } + +int swap_set_page_dirty(struct page *page) +{ + struct swap_info_struct *sis = page_swap_info(page); + + if (sis->flags & SWP_FILE) { + struct address_space *mapping = sis->swap_file->f_mapping; + return 
mapping->a_ops->set_page_dirty(page); + } else { + return __set_page_dirty_no_writeback(page); + } +} diff --git a/mm/swap_state.c b/mm/swap_state.c index c85b559..0cb36fb 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -27,7 +27,7 @@ */ static const struct address_space_operations swap_aops = { .writepage = swap_writepage, - .set_page_dirty = __set_page_dirty_no_writeback, + .set_page_dirty = swap_set_page_dirty, .migratepage = migrate_page, }; diff --git a/mm/swapfile.c b/mm/swapfile.c index f89af5b..6ffc876 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1329,6 +1329,14 @@ static void destroy_swap_extents(struct swap_info_struct *sis) list_del(&se->list); kfree(se); } + + if (sis->flags & SWP_FILE) { + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + + sis->flags &= ~SWP_FILE; + mapping->a_ops->swap_deactivate(swap_file); + } } /* @@ -1410,7 +1418,9 @@ add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, */ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) { - struct inode *inode; + struct file *swap_file = sis->swap_file; + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; unsigned blocks_per_page; unsigned long page_no; unsigned blkbits; @@ -1421,13 +1431,22 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) int nr_extents = 0; int ret; - inode = sis->swap_file->f_mapping->host; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; goto out; } + if (mapping->a_ops->swap_activate) { + ret = mapping->a_ops->swap_activate(swap_file); + if (!ret) { + sis->flags |= SWP_FILE; + ret = add_swap_extent(sis, 0, sis->max, 0); + *span = sis->pages; + } + goto out; + } + blkbits = inode->i_blkbits; blocks_per_page = PAGE_SIZE >> blkbits; -- cgit v0.10.2 From a509bc1a9e487d952d9404318f7f990166ab57a7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:44:57 -0700 Subject: mm: swap: implement generic handler for swap_activate The version of swap_activate introduced is sufficient for swap-over-NFS but would not provide enough information to implement a generic handler. This patch shuffles things slightly to ensure the same information is available for aops->swap_activate() as is available to the core. No functionality change. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/fs.h b/include/linux/fs.h index 38356ab..c8667f8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -429,6 +429,7 @@ struct kstatfs; struct vm_area_struct; struct vfsmount; struct cred; +struct swap_info_struct; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -640,8 +641,9 @@ struct address_space_operations { int (*error_remove_page)(struct address_space *, struct page *); /* swapfile support */ - int (*swap_activate)(struct file *file); - int (*swap_deactivate)(struct file *file); + int (*swap_activate)(struct swap_info_struct *sis, struct file *file, + sector_t *span); + void (*swap_deactivate)(struct file *file); }; extern const struct address_space_operations empty_aops; diff --git a/include/linux/swap.h b/include/linux/swap.h index ab230b1..388e706 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -324,6 +324,11 @@ extern int swap_writepage(struct page *page, struct writeback_control *wbc); extern int swap_set_page_dirty(struct page *page); extern void end_swap_bio_read(struct bio *bio, int err); +int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, + unsigned long nr_pages, sector_t start_block); +int generic_swapfile_activate(struct swap_info_struct *, struct file *, + sector_t *); + /* linux/mm/swap_state.c */ extern struct address_space swapper_space; #define total_swapcache_pages swapper_space.nrpages diff --git a/mm/page_io.c b/mm/page_io.c index 307a3e7..4a37962 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -87,6 +87,98 @@ void end_swap_bio_read(struct bio *bio, int err) bio_put(bio); } +int generic_swapfile_activate(struct swap_info_struct *sis, + struct file *swap_file, + sector_t *span) +{ + struct address_space *mapping = swap_file->f_mapping; + struct inode *inode = mapping->host; + unsigned blocks_per_page; + unsigned long page_no; + unsigned blkbits; + sector_t probe_block; + sector_t last_block; + sector_t lowest_block = -1; + sector_t highest_block = 0; + int nr_extents = 0; + int ret; + + blkbits = inode->i_blkbits; + blocks_per_page = PAGE_SIZE >> blkbits; + + /* + * Map all the blocks into the extent list. This code doesn't try + * to be very smart. 
+ */ + probe_block = 0; + page_no = 0; + last_block = i_size_read(inode) >> blkbits; + while ((probe_block + blocks_per_page) <= last_block && + page_no < sis->max) { + unsigned block_in_page; + sector_t first_block; + + first_block = bmap(inode, probe_block); + if (first_block == 0) + goto bad_bmap; + + /* + * It must be PAGE_SIZE aligned on-disk + */ + if (first_block & (blocks_per_page - 1)) { + probe_block++; + goto reprobe; + } + + for (block_in_page = 1; block_in_page < blocks_per_page; + block_in_page++) { + sector_t block; + + block = bmap(inode, probe_block + block_in_page); + if (block == 0) + goto bad_bmap; + if (block != first_block + block_in_page) { + /* Discontiguity */ + probe_block++; + goto reprobe; + } + } + + first_block >>= (PAGE_SHIFT - blkbits); + if (page_no) { /* exclude the header page */ + if (first_block < lowest_block) + lowest_block = first_block; + if (first_block > highest_block) + highest_block = first_block; + } + + /* + * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks + */ + ret = add_swap_extent(sis, page_no, 1, first_block); + if (ret < 0) + goto out; + nr_extents += ret; + page_no++; + probe_block += blocks_per_page; +reprobe: + continue; + } + ret = nr_extents; + *span = 1 + highest_block - lowest_block; + if (page_no == 0) + page_no = 1; /* force Empty message */ + sis->max = page_no; + sis->pages = page_no - 1; + sis->highest_bit = page_no - 1; +out: + return ret; +bad_bmap: + printk(KERN_ERR "swapon: swapfile has holes\n"); + ret = -EINVAL; + goto out; +} + /* * We may have stale swap cache pages in memory: notice * them here and get rid of the unnecessary final write. diff --git a/mm/swapfile.c b/mm/swapfile.c index 6ffc876..7307fc9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1345,7 +1345,7 @@ static void destroy_swap_extents(struct swap_info_struct *sis) * * This function rather assumes that it is called in ascending page order. */ -static int +int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) { @@ -1421,106 +1421,25 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; - unsigned blocks_per_page; - unsigned long page_no; - unsigned blkbits; - sector_t probe_block; - sector_t last_block; - sector_t lowest_block = -1; - sector_t highest_block = 0; - int nr_extents = 0; int ret; if (S_ISBLK(inode->i_mode)) { ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; - goto out; + return ret; } if (mapping->a_ops->swap_activate) { - ret = mapping->a_ops->swap_activate(swap_file); + ret = mapping->a_ops->swap_activate(sis, swap_file, span); if (!ret) { sis->flags |= SWP_FILE; ret = add_swap_extent(sis, 0, sis->max, 0); *span = sis->pages; } - goto out; + return ret; } - blkbits = inode->i_blkbits; - blocks_per_page = PAGE_SIZE >> blkbits; - - /* - * Map all the blocks into the extent list. This code doesn't try - * to be very smart. 
- */ - probe_block = 0; - page_no = 0; - last_block = i_size_read(inode) >> blkbits; - while ((probe_block + blocks_per_page) <= last_block && - page_no < sis->max) { - unsigned block_in_page; - sector_t first_block; - - first_block = bmap(inode, probe_block); - if (first_block == 0) - goto bad_bmap; - - /* - * It must be PAGE_SIZE aligned on-disk - */ - if (first_block & (blocks_per_page - 1)) { - probe_block++; - goto reprobe; - } - - for (block_in_page = 1; block_in_page < blocks_per_page; - block_in_page++) { - sector_t block; - - block = bmap(inode, probe_block + block_in_page); - if (block == 0) - goto bad_bmap; - if (block != first_block + block_in_page) { - /* Discontiguity */ - probe_block++; - goto reprobe; - } - } - - first_block >>= (PAGE_SHIFT - blkbits); - if (page_no) { /* exclude the header page */ - if (first_block < lowest_block) - lowest_block = first_block; - if (first_block > highest_block) - highest_block = first_block; - } - - /* - * We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks - */ - ret = add_swap_extent(sis, page_no, 1, first_block); - if (ret < 0) - goto out; - nr_extents += ret; - page_no++; - probe_block += blocks_per_page; -reprobe: - continue; - } - ret = nr_extents; - *span = 1 + highest_block - lowest_block; - if (page_no == 0) - page_no = 1; /* force Empty message */ - sis->max = page_no; - sis->pages = page_no - 1; - sis->highest_bit = page_no - 1; -out: - return ret; -bad_bmap: - printk(KERN_ERR "swapon: swapfile has holes\n"); - ret = -EINVAL; - goto out; + return generic_swapfile_activate(sis, swap_file, span); } static void enable_swap_info(struct swap_info_struct *p, int prio, -- cgit v0.10.2 From 5a178119b0fbe37f7dfb602b37df9cc4b1dc9d71 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:45:02 -0700 Subject: mm: add support for direct_IO to highmem pages The patch "mm: add support for a filesystem to activate swap files and use direct_IO for writing swap pages" added support for using direct_IO to write swap pages but it is insufficient for highmem pages. To support highmem pages, this patch kmaps() the page before calling the direct_IO() handler. As direct_IO deals with virtual addresses an additional helper is necessary for get_kernel_pages() to lookup the struct page for a kmap virtual address. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 774fa47..ef788b5 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -39,10 +39,17 @@ extern unsigned long totalhigh_pages; void kmap_flush_unused(void); +struct page *kmap_to_page(void *addr); + #else /* CONFIG_HIGHMEM */ static inline unsigned int nr_free_highpages(void) { return 0; } +static inline struct page *kmap_to_page(void *addr) +{ + return virt_to_page(addr); +} + #define totalhigh_pages 0UL #ifndef ARCH_HAS_KMAP diff --git a/mm/highmem.c b/mm/highmem.c index 57d82c6..d517cd1 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -94,6 +94,18 @@ static DECLARE_WAIT_QUEUE_HEAD(pkmap_map_wait); do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif +struct page *kmap_to_page(void *vaddr) +{ + unsigned long addr = (unsigned long)vaddr; + + if (addr >= PKMAP_ADDR(0) && addr <= PKMAP_ADDR(LAST_PKMAP)) { + int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; + return pte_page(pkmap_page_table[i]); + } + + return virt_to_page(addr); +} + static void flush_all_zero_pkmaps(void) { int i; diff --git a/mm/page_io.c b/mm/page_io.c index 4a37962..78eee32 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -205,7 +205,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) struct file *swap_file = sis->swap_file; struct address_space *mapping = swap_file->f_mapping; struct iovec iov = { - .iov_base = page_address(page), + .iov_base = kmap(page), .iov_len = PAGE_SIZE, }; @@ -218,6 +218,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) ret = mapping->a_ops->direct_IO(KERNEL_WRITE, &kiocb, &iov, kiocb.ki_pos, 1); + kunmap(page); if (ret == PAGE_SIZE) { count_vm_event(PSWPOUT); ret = 0; diff --git a/mm/swap.c b/mm/swap.c index 7d7f80c..7782588 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -258,8 +258,7 @@ int get_kernel_pages(const struct kvec *kiov, int nr_segs, int write, if (WARN_ON(kiov[seg].iov_len != PAGE_SIZE)) return seg; - /* virt_to_page sanity checks the PFN */ - pages[seg] = virt_to_page(kiov[seg].iov_base); + pages[seg] = kmap_to_page(kiov[seg].iov_base); page_cache_get(pages[seg]); } -- cgit v0.10.2 From d56b4ddf7781ef8dd050542781cc7f55673af002 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:45:06 -0700 Subject: nfs: teach the NFS client how to treat PG_swapcache pages Replace all relevant occurences of page->index and page->mapping in the NFS client with the new page_file_index() and page_file_mapping() functions. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 70d124a..acd4e4c 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -430,7 +430,7 @@ static void nfs_invalidate_page(struct page *page, unsigned long offset) if (offset != 0) return; /* Cancel any unstarted writes on this page */ - nfs_wb_page_cancel(page->mapping->host, page); + nfs_wb_page_cancel(page_file_mapping(page)->host, page); nfs_fscache_invalidate_page(page, page->mapping->host); } @@ -472,7 +472,7 @@ static int nfs_release_page(struct page *page, gfp_t gfp) */ static int nfs_launder_page(struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_inode *nfsi = NFS_I(inode); dfprintk(PAGECACHE, "NFS: launder_page(%ld, %llu)\n", @@ -521,7 +521,7 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) nfs_fscache_wait_on_page_write(NFS_I(dentry->d_inode), page); lock_page(page); - mapping = page->mapping; + mapping = page_file_mapping(page); if (mapping != dentry->d_inode->i_mapping) goto out_unlock; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index cfafd13..4be14b3 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -546,13 +546,14 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) static inline unsigned int nfs_page_length(struct page *page) { - loff_t i_size = i_size_read(page->mapping->host); + loff_t i_size = i_size_read(page_file_mapping(page)->host); if (i_size > 0) { + pgoff_t page_index = page_file_index(page); pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; - if (page->index < end_index) + if (page_index < end_index) return PAGE_CACHE_SIZE; - if (page->index == end_index) + if (page_index == end_index) return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1; } return 0; diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index aed913c..9ef8b3c 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -117,7 +117,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, * long write-back delay. This will be adjusted in * update_nfs_request below if the region is not locked. 
*/ req->wb_page = page; - req->wb_index = page->index; + req->wb_index = page_file_index(page); page_cache_get(page); req->wb_offset = offset; req->wb_pgbase = offset; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6267b87..7cb0207 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -522,11 +522,11 @@ static const struct rpc_call_ops nfs_read_common_ops = { int nfs_readpage(struct file *file, struct page *page) { struct nfs_open_context *ctx; - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; int error; dprintk("NFS: nfs_readpage (%p %ld@%lu)\n", - page, PAGE_CACHE_SIZE, page->index); + page, PAGE_CACHE_SIZE, page_file_index(page)); nfs_inc_stats(inode, NFSIOS_VFSREADPAGE); nfs_add_stats(inode, NFSIOS_READPAGES, 1); @@ -580,7 +580,7 @@ static int readpage_async_filler(void *data, struct page *page) { struct nfs_readdesc *desc = (struct nfs_readdesc *)data; - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *new; unsigned int len; int error; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index f312860..d0feca3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -153,7 +153,7 @@ static struct nfs_page *nfs_page_find_request_locked(struct page *page) static struct nfs_page *nfs_page_find_request(struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req = NULL; spin_lock(&inode->i_lock); @@ -165,16 +165,16 @@ static struct nfs_page *nfs_page_find_request(struct page *page) /* Adjust the file length if we're writing beyond the end */ static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; loff_t end, i_size; pgoff_t end_index; spin_lock(&inode->i_lock); i_size = i_size_read(inode); end_index = (i_size - 1) >> PAGE_CACHE_SHIFT; - if (i_size > 0 && page->index < end_index) + if (i_size > 0 && page_file_index(page) < end_index) goto out; - end = ((loff_t)page->index << PAGE_CACHE_SHIFT) + ((loff_t)offset+count); + end = page_file_offset(page) + ((loff_t)offset+count); if (i_size >= end) goto out; i_size_write(inode, end); @@ -187,7 +187,7 @@ out: static void nfs_set_pageerror(struct page *page) { SetPageError(page); - nfs_zap_mapping(page->mapping->host, page->mapping); + nfs_zap_mapping(page_file_mapping(page)->host, page_file_mapping(page)); } /* We can set the PG_uptodate flag if we see that a write request @@ -228,7 +228,7 @@ static int nfs_set_page_writeback(struct page *page) int ret = test_set_page_writeback(page); if (!ret) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_server *nfss = NFS_SERVER(inode); if (atomic_long_inc_return(&nfss->writeback) > @@ -242,7 +242,7 @@ static int nfs_set_page_writeback(struct page *page) static void nfs_end_page_writeback(struct page *page) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_server *nfss = NFS_SERVER(inode); end_page_writeback(page); @@ -252,7 +252,7 @@ static void nfs_end_page_writeback(struct page *page) static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblock) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req; int ret; @@ -313,13 +313,13 @@ out: static int nfs_do_writepage(struct page *page, 
struct writeback_control *wbc, struct nfs_pageio_descriptor *pgio) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; int ret; nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGE); nfs_add_stats(inode, NFSIOS_WRITEPAGES, 1); - nfs_pageio_cond_complete(pgio, page->index); + nfs_pageio_cond_complete(pgio, page_file_index(page)); ret = nfs_page_async_flush(pgio, page, wbc->sync_mode == WB_SYNC_NONE); if (ret == -EAGAIN) { redirty_page_for_writepage(wbc, page); @@ -336,7 +336,7 @@ static int nfs_writepage_locked(struct page *page, struct writeback_control *wbc struct nfs_pageio_descriptor pgio; int err; - NFS_PROTO(page->mapping->host)->write_pageio_init(&pgio, + NFS_PROTO(page_file_mapping(page)->host)->write_pageio_init(&pgio, page->mapping->host, wb_priority(wbc), &nfs_async_write_completion_ops); @@ -471,7 +471,7 @@ nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, spin_unlock(cinfo->lock); if (!cinfo->dreq) { inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - inc_bdi_stat(req->wb_page->mapping->backing_dev_info, + inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, BDI_RECLAIMABLE); __mark_inode_dirty(req->wb_context->dentry->d_inode, I_DIRTY_DATASYNC); @@ -538,7 +538,7 @@ static void nfs_clear_page_commit(struct page *page) { dec_zone_page_state(page, NR_UNSTABLE_NFS); - dec_bdi_stat(page->mapping->backing_dev_info, BDI_RECLAIMABLE); + dec_bdi_stat(page_file_mapping(page)->backing_dev_info, BDI_RECLAIMABLE); } static void @@ -789,7 +789,7 @@ out_err: static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, struct page *page, unsigned int offset, unsigned int bytes) { - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; struct nfs_page *req; req = nfs_try_to_update_request(inode, page, offset, bytes); @@ -842,7 +842,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) nfs_release_request(req); if (!do_flush) return 0; - status = nfs_wb_page(page->mapping->host, page); + status = nfs_wb_page(page_file_mapping(page)->host, page); } while (status == 0); return status; } @@ -872,7 +872,7 @@ int nfs_updatepage(struct file *file, struct page *page, unsigned int offset, unsigned int count) { struct nfs_open_context *ctx = nfs_file_open_context(file); - struct inode *inode = page->mapping->host; + struct inode *inode = page_file_mapping(page)->host; int status = 0; nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); @@ -880,7 +880,7 @@ int nfs_updatepage(struct file *file, struct page *page, dprintk("NFS: nfs_updatepage(%s/%s %d@%lld)\n", file->f_path.dentry->d_parent->d_name.name, file->f_path.dentry->d_name.name, count, - (long long)(page_offset(page) + offset)); + (long long)(page_file_offset(page) + offset)); /* If we're not using byte range locks, and we know the page * is up to date, it may be more efficient to extend the write @@ -1469,7 +1469,7 @@ void nfs_retry_commit(struct list_head *page_list, nfs_mark_request_commit(req, lseg, cinfo); if (!cinfo->dreq) { dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); - dec_bdi_stat(req->wb_page->mapping->backing_dev_info, + dec_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, BDI_RECLAIMABLE); } nfs_unlock_and_release_request(req); -- cgit v0.10.2 From 29418aa4bd487c82016733ef5c6a06d656ed3c7d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:45:10 -0700 Subject: nfs: disable data cache revalidation for swapfiles The VM does not like PG_private set on PG_swapcache 
pages. As suggested by Trond in http://lkml.org/lkml/2006/8/25/348, this patch disables NFS data cache revalidation on swap files. as it does not make sense to have other clients change the file while it is being used as swap. This avoids setting PG_private on swap pages, since there ought to be no further races with invalidate_inode_pages2() to deal with. Since we cannot set PG_private we cannot use page->private which is already used by PG_swapcache pages to store the nfs_page. Thus augment the new nfs_page_find_request logic. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 35f7e4b..1d57fe9 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -882,6 +882,10 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping) struct nfs_inode *nfsi = NFS_I(inode); int ret = 0; + /* swapfiles are not supposed to be shared. */ + if (IS_SWAPFILE(inode)) + goto out; + if (nfs_mapping_need_revalidate_inode(inode)) { ret = __nfs_revalidate_inode(NFS_SERVER(inode), inode); if (ret < 0) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index d0feca3..974e9c2 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -139,15 +139,28 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) set_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); } -static struct nfs_page *nfs_page_find_request_locked(struct page *page) +static struct nfs_page * +nfs_page_find_request_locked(struct nfs_inode *nfsi, struct page *page) { struct nfs_page *req = NULL; - if (PagePrivate(page)) { + if (PagePrivate(page)) req = (struct nfs_page *)page_private(page); - if (req != NULL) - kref_get(&req->wb_kref); + else if (unlikely(PageSwapCache(page))) { + struct nfs_page *freq, *t; + + /* Linearly search the commit list for the correct req */ + list_for_each_entry_safe(freq, t, &nfsi->commit_info.list, wb_list) { + if (freq->wb_page == page) { + req = freq; + break; + } + } } + + if (req) + kref_get(&req->wb_kref); + return req; } @@ -157,7 +170,7 @@ static struct nfs_page *nfs_page_find_request(struct page *page) struct nfs_page *req = NULL; spin_lock(&inode->i_lock); - req = nfs_page_find_request_locked(page); + req = nfs_page_find_request_locked(NFS_I(inode), page); spin_unlock(&inode->i_lock); return req; } @@ -258,7 +271,7 @@ static struct nfs_page *nfs_find_and_lock_request(struct page *page, bool nonblo spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(page); + req = nfs_page_find_request_locked(NFS_I(inode), page); if (req == NULL) break; if (nfs_lock_request(req)) @@ -413,9 +426,15 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) spin_lock(&inode->i_lock); if (!nfsi->npages && NFS_PROTO(inode)->have_delegation(inode, FMODE_WRITE)) inode->i_version++; - set_bit(PG_MAPPED, &req->wb_flags); - SetPagePrivate(req->wb_page); - set_page_private(req->wb_page, (unsigned long)req); + /* + * Swap-space should not get truncated. Hence no need to plug the race + * with invalidate/truncate. 
+ */ + if (likely(!PageSwapCache(req->wb_page))) { + set_bit(PG_MAPPED, &req->wb_flags); + SetPagePrivate(req->wb_page); + set_page_private(req->wb_page, (unsigned long)req); + } nfsi->npages++; kref_get(&req->wb_kref); spin_unlock(&inode->i_lock); @@ -432,9 +451,11 @@ static void nfs_inode_remove_request(struct nfs_page *req) BUG_ON (!NFS_WBACK_BUSY(req)); spin_lock(&inode->i_lock); - set_page_private(req->wb_page, 0); - ClearPagePrivate(req->wb_page); - clear_bit(PG_MAPPED, &req->wb_flags); + if (likely(!PageSwapCache(req->wb_page))) { + set_page_private(req->wb_page, 0); + ClearPagePrivate(req->wb_page); + clear_bit(PG_MAPPED, &req->wb_flags); + } nfsi->npages--; spin_unlock(&inode->i_lock); nfs_release_request(req); @@ -730,7 +751,7 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, spin_lock(&inode->i_lock); for (;;) { - req = nfs_page_find_request_locked(page); + req = nfs_page_find_request_locked(NFS_I(inode), page); if (req == NULL) goto out_unlock; @@ -1744,7 +1765,7 @@ int nfs_wb_page_cancel(struct inode *inode, struct page *page) */ int nfs_wb_page(struct inode *inode, struct page *page) { - loff_t range_start = page_offset(page); + loff_t range_start = page_file_offset(page); loff_t range_end = range_start + (loff_t)(PAGE_CACHE_SIZE - 1); struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, -- cgit v0.10.2 From a564b8f0398636ba30b07c0eaebdef7ff7837249 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:45:12 -0700 Subject: nfs: enable swap on NFS Implement the new swapfile a_ops for NFS and hook up ->direct_IO. This will set the NFS socket to SOCK_MEMALLOC and run socket reconnect under PF_MEMALLOC as well as reset SOCK_MEMALLOC before engaging the protocol ->connect() method. PF_MEMALLOC should allow the allocation of struct socket and related objects and the early (re)setting of SOCK_MEMALLOC should allow us to receive the packets required for the TCP connection buildup. [jlayton@redhat.com: Restore PF_MEMALLOC task flags in all cases] [dfeng@redhat.com: Fix handling of multiple swap files] [a.p.zijlstra@chello.nl: Original patch] Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index 404c6a8..6fd5f2c 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -86,6 +86,14 @@ config NFS_V4 If unsure, say Y. +config NFS_SWAP + bool "Provide swap over NFS support" + default n + depends on NFS_FS + select SUNRPC_SWAP + help + This option enables swapon to work on files located on NFS mounts. + config NFS_V4_1 bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" depends on NFS_V4 && EXPERIMENTAL diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 42dce909..bf9c8d0 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -115,17 +115,28 @@ static inline int put_dreq(struct nfs_direct_req *dreq) * @nr_segs: size of iovec array * * The presence of this routine in the address space ops vector means - * the NFS client supports direct I/O. However, we shunt off direct - * read and write requests before the VFS gets them, so this method - * should never be called. + * the NFS client supports direct I/O. 
However, for most direct IO, we + * shunt off direct read and write requests before the VFS gets them, + * so this method is only ever called for swap. */ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { +#ifndef CONFIG_NFS_SWAP dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n", iocb->ki_filp->f_path.dentry->d_name.name, (long long) pos, nr_segs); return -EINVAL; +#else + VM_BUG_ON(iocb->ki_left != PAGE_SIZE); + VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); + + if (rw == READ || rw == KERNEL_READ) + return nfs_file_direct_read(iocb, iov, nr_segs, pos, + rw == READ ? true : false); + return nfs_file_direct_write(iocb, iov, nr_segs, pos, + rw == WRITE ? true : false); +#endif /* CONFIG_NFS_SWAP */ } static void nfs_direct_release_pages(struct page **pages, unsigned int npages) @@ -303,7 +314,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_read_completion_ops = { */ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *desc, const struct iovec *iov, - loff_t pos) + loff_t pos, bool uio) { struct nfs_direct_req *dreq = desc->pg_dreq; struct nfs_open_context *ctx = dreq->ctx; @@ -331,12 +342,20 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de GFP_KERNEL); if (!pagevec) break; - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, + if (uio) { + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, npages, 1, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; + up_read(¤t->mm->mmap_sem); + if (result < 0) + break; + } else { + WARN_ON(npages != 1); + result = get_kernel_page(user_addr, 1, pagevec); + if (WARN_ON(result != 1)) + break; + } + if ((unsigned)result < npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { @@ -386,7 +405,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, const struct iovec *iov, unsigned long nr_segs, - loff_t pos) + loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; ssize_t result = -EINVAL; @@ -400,7 +419,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; - result = nfs_direct_read_schedule_segment(&desc, vec, pos); + result = nfs_direct_read_schedule_segment(&desc, vec, pos, uio); if (result < 0) break; requested_bytes += result; @@ -426,7 +445,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, } static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + unsigned long nr_segs, loff_t pos, bool uio) { ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; @@ -444,7 +463,7 @@ static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); + result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos, uio); if (!result) result = nfs_direct_wait(dreq); NFS_I(inode)->read_io += result; @@ -610,7 +629,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode */ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *desc, const struct iovec *iov, - loff_t pos) + loff_t pos, bool uio) { struct nfs_direct_req *dreq = desc->pg_dreq; struct 
nfs_open_context *ctx = dreq->ctx; @@ -638,12 +657,19 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d if (!pagevec) break; - down_read(¤t->mm->mmap_sem); - result = get_user_pages(current, current->mm, user_addr, - npages, 0, 0, pagevec, NULL); - up_read(¤t->mm->mmap_sem); - if (result < 0) - break; + if (uio) { + down_read(¤t->mm->mmap_sem); + result = get_user_pages(current, current->mm, user_addr, + npages, 0, 0, pagevec, NULL); + up_read(¤t->mm->mmap_sem); + if (result < 0) + break; + } else { + WARN_ON(npages != 1); + result = get_kernel_page(user_addr, 0, pagevec); + if (WARN_ON(result != 1)) + break; + } if ((unsigned)result < npages) { bytes = result * PAGE_SIZE; @@ -774,7 +800,7 @@ static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops = { static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, const struct iovec *iov, unsigned long nr_segs, - loff_t pos) + loff_t pos, bool uio) { struct nfs_pageio_descriptor desc; struct inode *inode = dreq->inode; @@ -790,7 +816,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, for (seg = 0; seg < nr_segs; seg++) { const struct iovec *vec = &iov[seg]; - result = nfs_direct_write_schedule_segment(&desc, vec, pos); + result = nfs_direct_write_schedule_segment(&desc, vec, pos, uio); if (result < 0) break; requested_bytes += result; @@ -818,7 +844,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, - size_t count) + size_t count, bool uio) { ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; @@ -836,7 +862,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; - result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos); + result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio); if (!result) result = nfs_direct_wait(dreq); out_release: @@ -867,7 +893,7 @@ out: * cache. */ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + unsigned long nr_segs, loff_t pos, bool uio) { ssize_t retval = -EINVAL; struct file *file = iocb->ki_filp; @@ -892,7 +918,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, task_io_account_read(count); - retval = nfs_direct_read(iocb, iov, nr_segs, pos); + retval = nfs_direct_read(iocb, iov, nr_segs, pos, uio); if (retval > 0) iocb->ki_pos = pos + retval; @@ -923,7 +949,7 @@ out: * is no atomic O_APPEND write facility in the NFS protocol. 
*/ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, - unsigned long nr_segs, loff_t pos) + unsigned long nr_segs, loff_t pos, bool uio) { ssize_t retval = -EINVAL; struct file *file = iocb->ki_filp; @@ -955,7 +981,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, task_io_account_write(count); - retval = nfs_direct_write(iocb, iov, nr_segs, pos, count); + retval = nfs_direct_write(iocb, iov, nr_segs, pos, count, uio); if (retval > 0) { struct inode *inode = mapping->host; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index acd4e4c..50fb83a 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -175,7 +175,7 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result; if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_read(iocb, iov, nr_segs, pos); + return nfs_file_direct_read(iocb, iov, nr_segs, pos, true); dprintk("NFS: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -482,6 +482,20 @@ static int nfs_launder_page(struct page *page) return nfs_wb_page(inode, page); } +#ifdef CONFIG_NFS_SWAP +static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, + sector_t *span) +{ + *span = sis->pages; + return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); +} + +static void nfs_swap_deactivate(struct file *file) +{ + xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); +} +#endif + const struct address_space_operations nfs_file_aops = { .readpage = nfs_readpage, .readpages = nfs_readpages, @@ -496,6 +510,10 @@ const struct address_space_operations nfs_file_aops = { .migratepage = nfs_migrate_page, .launder_page = nfs_launder_page, .error_remove_page = generic_error_remove_page, +#ifdef CONFIG_NFS_SWAP + .swap_activate = nfs_swap_activate, + .swap_deactivate = nfs_swap_deactivate, +#endif }; /* @@ -570,7 +588,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, size_t count = iov_length(iov, nr_segs); if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, iov, nr_segs, pos); + return nfs_file_direct_write(iocb, iov, nr_segs, pos, true); dprintk("NFS: write(%s/%s, %lu@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4b6043c..35994f9 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -473,10 +473,10 @@ extern ssize_t nfs_direct_IO(int, struct kiocb *, const struct iovec *, loff_t, unsigned long); extern ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, - loff_t pos); + loff_t pos, bool uio); extern ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, - loff_t pos); + loff_t pos, bool uio); /* * linux/fs/nfs/dir.c diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 77d278d..cff40aa 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -174,6 +174,8 @@ struct rpc_xprt { unsigned long state; /* transport state */ unsigned char shutdown : 1, /* being shut down */ resvport : 1; /* use a reserved port */ + unsigned int swapper; /* we're swapping over this + transport */ unsigned int bind_index; /* bind function index */ /* @@ -316,6 +318,7 @@ void xprt_release_rqst_cong(struct rpc_task *task); void xprt_disconnect_done(struct rpc_xprt *xprt); void xprt_force_disconnect(struct rpc_xprt *xprt); void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie); +int xs_swapper(struct 
rpc_xprt *xprt, int enable); /* * Reserved bit positions in xprt->state diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig index 9fe8857..03d03e3 100644 --- a/net/sunrpc/Kconfig +++ b/net/sunrpc/Kconfig @@ -21,6 +21,11 @@ config SUNRPC_XPRT_RDMA If unsure, say N. +config SUNRPC_SWAP + bool + depends on SUNRPC + select NETVM + config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism" depends on SUNRPC && CRYPTO diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b05df36..fa48c60 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -717,6 +717,15 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt) atomic_inc(&clnt->cl_count); if (clnt->cl_softrtry) task->tk_flags |= RPC_TASK_SOFT; + if (sk_memalloc_socks()) { + struct rpc_xprt *xprt; + + rcu_read_lock(); + xprt = rcu_dereference(clnt->cl_xprt); + if (xprt->swapper) + task->tk_flags |= RPC_TASK_SWAPPER; + rcu_read_unlock(); + } /* Add to the client's list of all tasks */ spin_lock(&clnt->cl_lock); list_add_tail(&task->tk_task, &clnt->cl_tasks); diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 994cfea..83a4c43 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -812,7 +812,10 @@ static void rpc_async_schedule(struct work_struct *work) void *rpc_malloc(struct rpc_task *task, size_t size) { struct rpc_buffer *buf; - gfp_t gfp = RPC_IS_SWAPPER(task) ? GFP_ATOMIC : GFP_NOWAIT; + gfp_t gfp = GFP_NOWAIT; + + if (RPC_IS_SWAPPER(task)) + gfp |= __GFP_MEMALLOC; size += sizeof(struct rpc_buffer); if (size <= RPC_BUFFER_MAXSIZE) @@ -886,7 +889,7 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta static struct rpc_task * rpc_alloc_task(void) { - return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOFS); + return (struct rpc_task *)mempool_alloc(rpc_task_mempool, GFP_NOIO); } /* diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 62d0dac..bd59d01 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1927,6 +1927,45 @@ out: xprt_wake_pending_tasks(xprt, status); } +#ifdef CONFIG_SUNRPC_SWAP +static void xs_set_memalloc(struct rpc_xprt *xprt) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, + xprt); + + if (xprt->swapper) + sk_set_memalloc(transport->inet); +} + +/** + * xs_swapper - Tag this transport as being used for swap. + * @xprt: transport to tag + * @enable: enable/disable + * + */ +int xs_swapper(struct rpc_xprt *xprt, int enable) +{ + struct sock_xprt *transport = container_of(xprt, struct sock_xprt, + xprt); + int err = 0; + + if (enable) { + xprt->swapper++; + xs_set_memalloc(xprt); + } else if (xprt->swapper) { + xprt->swapper--; + sk_clear_memalloc(transport->inet); + } + + return err; +} +EXPORT_SYMBOL_GPL(xs_swapper); +#else +static void xs_set_memalloc(struct rpc_xprt *xprt) +{ +} +#endif + static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) { struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -1951,6 +1990,8 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) transport->sock = sock; transport->inet = sk; + xs_set_memalloc(xprt); + write_unlock_bh(&sk->sk_callback_lock); } xs_udp_do_set_buffer_size(xprt); @@ -2075,6 +2116,8 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) if (!xprt_bound(xprt)) goto out; + xs_set_memalloc(xprt); + /* Tell the socket layer to start connecting... 
*/ xprt->stat.connect_count++; xprt->stat.connect_start = jiffies; -- cgit v0.10.2 From 192e501b0438bb0e1574179773537f84c4752e25 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:45:16 -0700 Subject: nfs: prevent page allocator recursions with swap over NFS. GFP_NOFS is _more_ permissive than GFP_NOIO in that it will initiate IO, just not of any filesystem data. The problem is that previously NOFS was correct because that avoids recursion into the NFS code. With swap-over-NFS, it is no longer correct as swap IO can lead to this recursion. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Cc: Xiaotian Feng Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 9ef8b3c..7de1646 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -70,7 +70,7 @@ void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos) static inline struct nfs_page * nfs_page_alloc(void) { - struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL); + struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_NOIO); if (p) INIT_LIST_HEAD(&p->wb_list); return p; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 974e9c2..211ba65 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -52,7 +52,7 @@ static mempool_t *nfs_commit_mempool; struct nfs_commit_data *nfs_commitdata_alloc(void) { - struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); + struct nfs_commit_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOIO); if (p) { memset(p, 0, sizeof(*p)); @@ -70,7 +70,7 @@ EXPORT_SYMBOL_GPL(nfs_commit_free); struct nfs_write_header *nfs_writehdr_alloc(void) { - struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); + struct nfs_write_header *p = mempool_alloc(nfs_wdata_mempool, GFP_NOIO); if (p) { struct nfs_pgio_header *hdr = &p->header; -- cgit v0.10.2 From 737449236240e30a7bbe99f4d5586b8ed1416763 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:45:20 -0700 Subject: swapfile: avoid dereferencing bd_disk during swap_entry_free for network storage Commit b3a27d ("swap: Add swap slot free callback to block_device_operations") dereferences p->bdev->bd_disk but this is a NULL dereference if using swap-over-NFS. This patch checks SWP_BLKDEV on the swap_info_struct before dereferencing. With reference to this callback, Christoph Hellwig stated "Please just remove the callback entirely. It has no user outside the staging tree and was added clearly against the rules for that staging tree". This would also be my preference but there was not an obvious way of keeping zram in staging/ happy. Signed-off-by: Xiaotian Feng Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Christoph Hellwig Cc: David S. 
Miller Cc: Eric B Munson Cc: Eric Paris Cc: James Morris Cc: Mel Gorman Cc: Mike Christie Cc: Neil Brown Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Trond Myklebust Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swapfile.c b/mm/swapfile.c index 7307fc9..63958d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -549,7 +549,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, /* free if no reference */ if (!usage) { - struct gendisk *disk = p->bdev->bd_disk; if (offset < p->lowest_bit) p->lowest_bit = offset; if (offset > p->highest_bit) @@ -560,9 +559,12 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, nr_swap_pages++; p->inuse_pages--; frontswap_invalidate_page(p->type, offset); - if ((p->flags & SWP_BLKDEV) && - disk->fops->swap_slot_free_notify) - disk->fops->swap_slot_free_notify(p->bdev, offset); + if (p->flags & SWP_BLKDEV) { + struct gendisk *disk = p->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) + disk->fops->swap_slot_free_notify(p->bdev, + offset); + } } return usage; -- cgit v0.10.2 From 0030f535a5cf9b1841d2088c10a0b2f8f2987460 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:25 -0700 Subject: mm: memcg: fix compaction/migration failing due to memcg limits Compaction (and page migration in general) can currently be hindered through pages being owned by memory cgroups that are at their limits and unreclaimable. The reason is that the replacement page is being charged against the limit while the page being replaced is also still charged. But this seems unnecessary, given that only one of the two pages will still be in use after migration finishes. This patch changes the memcg migration sequence so that the replacement page is not charged. Whatever page is still in use after successful or failed migration gets to keep the charge of the page that was going to be replaced. The replacement page will still show up temporarily in the rss/cache statistics, this can be fixed in a later patch as it's less urgent. 
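For readers skimming the diff, here is a minimal, hedged sketch of the caller sequence this change establishes; it condenses the mm/migrate.c hunk further below, and do_unmap_and_copy() is a hypothetical placeholder for the unmap/copy work done inside __unmap_and_move(), not a real kernel function:

/*
 * Sketch only: mem_cgroup_prepare_migration() no longer charges the
 * replacement page and can no longer fail, so the old -ENOMEM bail-out
 * disappears; mem_cgroup_end_migration() hands the existing charge to
 * whichever page is still in use afterwards.
 */
static int migrate_one_page_sketch(struct page *page, struct page *newpage)
{
        struct mem_cgroup *memcg;
        int rc;

        mem_cgroup_prepare_migration(page, newpage, &memcg);

        rc = do_unmap_and_copy(page, newpage);  /* hypothetical placeholder */

        /* rc == 0: newpage keeps the charge; otherwise the old page does. */
        mem_cgroup_end_migration(memcg, page, newpage, rc == 0);
        return rc;
}
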
Reported-by: David Rientjes Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Hugh Dickins Cc: David Rientjes Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5a3ee64..8d9489f 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,9 +98,9 @@ int mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *cgroup) extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg); -extern int -mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask); +extern void +mem_cgroup_prepare_migration(struct page *page, struct page *newpage, + struct mem_cgroup **memcgp); extern void mem_cgroup_end_migration(struct mem_cgroup *memcg, struct page *oldpage, struct page *newpage, bool migration_ok); @@ -276,11 +276,10 @@ static inline struct cgroup_subsys_state return NULL; } -static inline int +static inline void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, - struct mem_cgroup **memcgp, gfp_t gfp_mask) + struct mem_cgroup **memcgp) { - return 0; } static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f692a2d..7eadcda 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2976,7 +2976,8 @@ direct_uncharge: * uncharge if !page_mapped(page) */ static struct mem_cgroup * -__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) +__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, + bool end_migration) { struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; @@ -3020,7 +3021,16 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) /* fallthrough */ case MEM_CGROUP_CHARGE_TYPE_DROP: /* See mem_cgroup_prepare_migration() */ - if (page_mapped(page) || PageCgroupMigration(pc)) + if (page_mapped(page)) + goto unlock_out; + /* + * Pages under migration may not be uncharged. But + * end_migration() /must/ be the one uncharging the + * unused post-migration page and so it has to call + * here with the migration bit still set. See the + * res_counter handling below. + */ + if (!end_migration && PageCgroupMigration(pc)) goto unlock_out; break; case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: @@ -3054,7 +3064,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_swap_statistics(memcg, true); mem_cgroup_get(memcg); } - if (!mem_cgroup_is_root(memcg)) + /* + * Migration does not charge the res_counter for the + * replacement page, so leave it alone when phasing out the + * page that is unused after the migration. 
+ */ + if (!end_migration && !mem_cgroup_is_root(memcg)) mem_cgroup_do_uncharge(memcg, nr_pages, ctype); return memcg; @@ -3070,14 +3085,14 @@ void mem_cgroup_uncharge_page(struct page *page) if (page_mapped(page)) return; VM_BUG_ON(page->mapping && !PageAnon(page)); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); } void mem_cgroup_uncharge_cache_page(struct page *page) { VM_BUG_ON(page_mapped(page)); VM_BUG_ON(page->mapping); - __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); + __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); } /* @@ -3141,7 +3156,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) if (!swapout) /* this was a swap cache but the swap is unused ! */ ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - memcg = __mem_cgroup_uncharge_common(page, ctype); + memcg = __mem_cgroup_uncharge_common(page, ctype, false); /* * record memcg information, if swapout && memcg != NULL, @@ -3231,19 +3246,18 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, * Before starting migration, account PAGE_SIZE to mem_cgroup that the old * page belongs to. */ -int mem_cgroup_prepare_migration(struct page *page, - struct page *newpage, struct mem_cgroup **memcgp, gfp_t gfp_mask) +void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, + struct mem_cgroup **memcgp) { struct mem_cgroup *memcg = NULL; struct page_cgroup *pc; enum charge_type ctype; - int ret = 0; *memcgp = NULL; VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) - return 0; + return; pc = lookup_page_cgroup(page); lock_page_cgroup(pc); @@ -3288,24 +3302,9 @@ int mem_cgroup_prepare_migration(struct page *page, * we return here. */ if (!memcg) - return 0; + return; *memcgp = memcg; - ret = __mem_cgroup_try_charge(NULL, gfp_mask, 1, memcgp, false); - css_put(&memcg->css);/* drop extra refcnt */ - if (ret) { - if (PageAnon(page)) { - lock_page_cgroup(pc); - ClearPageCgroupMigration(pc); - unlock_page_cgroup(pc); - /* - * The old page may be fully unmapped while we kept it. - */ - mem_cgroup_uncharge_page(page); - } - /* we'll need to revisit this error code (we have -EINTR) */ - return -ENOMEM; - } /* * We charge new page before it's used/mapped. So, even if unlock_page() * is called before end_migration, we can catch all events on this new @@ -3318,8 +3317,12 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + /* + * The page is committed to the memcg, but it's not actually + * charged to the res_counter since we plan on replacing the + * old one and only one page is going to be left afterwards. + */ __mem_cgroup_commit_charge(memcg, newpage, 1, ctype, false); - return ret; } /* remove redundant charge if migration failed*/ @@ -3341,6 +3344,12 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, used = newpage; unused = oldpage; } + anon = PageAnon(used); + __mem_cgroup_uncharge_common(unused, + anon ? MEM_CGROUP_CHARGE_TYPE_ANON + : MEM_CGROUP_CHARGE_TYPE_CACHE, + true); + css_put(&memcg->css); /* * We disallowed uncharge of pages under migration because mapcount * of the page goes down to zero, temporarly. @@ -3350,10 +3359,6 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, lock_page_cgroup(pc); ClearPageCgroupMigration(pc); unlock_page_cgroup(pc); - anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? 
MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE); /* * If a page is a file cache, radix-tree replacement is very atomic diff --git a/mm/migrate.c b/mm/migrate.c index 6c37c51..77ed2d7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -683,7 +683,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage, { int rc = -EAGAIN; int remap_swapcache = 1; - int charge = 0; struct mem_cgroup *mem; struct anon_vma *anon_vma = NULL; @@ -725,12 +724,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } /* charge against new page */ - charge = mem_cgroup_prepare_migration(page, newpage, &mem, GFP_KERNEL); - if (charge == -ENOMEM) { - rc = -ENOMEM; - goto unlock; - } - BUG_ON(charge); + mem_cgroup_prepare_migration(page, newpage, &mem); if (PageWriteback(page)) { /* @@ -820,8 +814,7 @@ skip_unmap: put_anon_vma(anon_vma); uncharge: - if (!charge) - mem_cgroup_end_migration(mem, page, newpage, rc == 0); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); out: -- cgit v0.10.2 From 5d84c7766e8aacc6e3477bdf02fdb417163cf89b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:28 -0700 Subject: mm: swapfile: clean up unuse_pte race handling The conditional mem_cgroup_cancel_charge_swapin() is a leftover from when the function would continue to reestablish the page even after mem_cgroup_try_charge_swapin() failed. After 85d9fc8 "memcg: fix refcnt handling at swapoff", the condition is always true when this code is reached. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Johannes Weiner Cc: Michal Hocko Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swapfile.c b/mm/swapfile.c index 63958d6..14e254c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -835,8 +835,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { - if (ret > 0) - mem_cgroup_cancel_charge_swapin(memcg); + mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; } -- cgit v0.10.2 From 0c59b89c81eab7fe7dcf08f2252ae22a6c081ce8 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:31 -0700 Subject: mm: memcg: push down PageSwapCache check into uncharge entry functions Not all uncharge paths need to check if the page is swapcache, some of them can know for sure. Push down the check into all callsites of uncharge_common() so that the patch that removes some of them is more obvious. 
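In shape, the result looks roughly like the following hedged sketch (not the literal hunks, which appear below): the common helper asserts the invariant, while each entry point that can legitimately see swapcache pages filters them out first.

/* Core helper: callers now guarantee the page is not in swap cache. */
static struct mem_cgroup *
__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
                             bool end_migration)
{
        VM_BUG_ON(PageSwapCache(page));
        /* ... actual uncharge work elided in this sketch ... */
        return NULL;
}

/* Entry point: the swapcache check moves out to the callsite. */
void mem_cgroup_uncharge_page(struct page *page)
{
        if (PageSwapCache(page))
                return;
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
}
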
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7eadcda..87c2ec4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2987,8 +2987,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, if (mem_cgroup_disabled()) return NULL; - if (PageSwapCache(page)) - return NULL; + VM_BUG_ON(PageSwapCache(page)); if (PageTransHuge(page)) { nr_pages <<= compound_order(page); @@ -3085,6 +3084,8 @@ void mem_cgroup_uncharge_page(struct page *page) if (page_mapped(page)) return; VM_BUG_ON(page->mapping && !PageAnon(page)); + if (PageSwapCache(page)) + return; __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); } @@ -3092,6 +3093,8 @@ void mem_cgroup_uncharge_cache_page(struct page *page) { VM_BUG_ON(page_mapped(page)); VM_BUG_ON(page->mapping); + if (PageSwapCache(page)) + return; __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); } @@ -3156,6 +3159,8 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) if (!swapout) /* this was a swap cache but the swap is unused ! */ ctype = MEM_CGROUP_CHARGE_TYPE_DROP; + if (PageSwapCache(page)) + return; memcg = __mem_cgroup_uncharge_common(page, ctype, false); /* @@ -3345,10 +3350,11 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, unused = oldpage; } anon = PageAnon(used); - __mem_cgroup_uncharge_common(unused, - anon ? MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); + if (!PageSwapCache(unused)) + __mem_cgroup_uncharge_common(unused, + anon ? MEM_CGROUP_CHARGE_TYPE_ANON + : MEM_CGROUP_CHARGE_TYPE_CACHE, + true); css_put(&memcg->css); /* * We disallowed uncharge of pages under migration because mapcount -- cgit v0.10.2 From 7d188958bb64708577aa77e6b1ad68abbf0480f5 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:34 -0700 Subject: mm: memcg: only check for PageSwapCache when uncharging anon Only anon pages that are uncharged at the time of the last page table mapping vanishing may be in swapcache. When shmem pages, file pages, swap-freed anon pages, or just migrated pages are uncharged, they are known for sure to be not in swapcache. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 87c2ec4..29bc440 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3093,8 +3093,6 @@ void mem_cgroup_uncharge_cache_page(struct page *page) { VM_BUG_ON(page_mapped(page)); VM_BUG_ON(page->mapping); - if (PageSwapCache(page)) - return; __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); } @@ -3159,8 +3157,6 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) if (!swapout) /* this was a swap cache but the swap is unused ! */ ctype = MEM_CGROUP_CHARGE_TYPE_DROP; - if (PageSwapCache(page)) - return; memcg = __mem_cgroup_uncharge_common(page, ctype, false); /* @@ -3350,11 +3346,10 @@ void mem_cgroup_end_migration(struct mem_cgroup *memcg, unused = oldpage; } anon = PageAnon(used); - if (!PageSwapCache(unused)) - __mem_cgroup_uncharge_common(unused, - anon ? 
MEM_CGROUP_CHARGE_TYPE_ANON - : MEM_CGROUP_CHARGE_TYPE_CACHE, - true); + __mem_cgroup_uncharge_common(unused, + anon ? MEM_CGROUP_CHARGE_TYPE_ANON + : MEM_CGROUP_CHARGE_TYPE_CACHE, + true); css_put(&memcg->css); /* * We disallowed uncharge of pages under migration because mapcount -- cgit v0.10.2 From 827a03d22e776ff7838b5b2780b71092e0b9de1e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:36 -0700 Subject: mm: memcg: move swapin charge functions above callsites Charging cache pages may require swapin in the shmem case. Save the forward declaration and just move the swapin functions above the cache charging functions. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 29bc440..3be1afa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2796,37 +2796,6 @@ int mem_cgroup_newpage_charge(struct page *page, MEM_CGROUP_CHARGE_TYPE_ANON); } -static void -__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, - enum charge_type ctype); - -int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask) -{ - struct mem_cgroup *memcg = NULL; - enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; - int ret; - - if (mem_cgroup_disabled()) - return 0; - if (PageCompound(page)) - return 0; - - if (unlikely(!mm)) - mm = &init_mm; - if (!page_is_file_cache(page)) - type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - - if (!PageSwapCache(page)) - ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); - else { /* page is swapcache/shmem */ - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); - if (!ret) - __mem_cgroup_commit_charge_swapin(page, memcg, type); - } - return ret; -} - /* * While swap-in, try_charge -> commit or cancel, the page is locked. 
* And when try_charge() successfully returns, one refcnt to memcg without @@ -2873,6 +2842,15 @@ charge_cur_mm: return ret; } +void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) +{ + if (mem_cgroup_disabled()) + return; + if (!memcg) + return; + __mem_cgroup_cancel_charge(memcg, 1); +} + static void __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, enum charge_type ctype) @@ -2910,13 +2888,31 @@ void mem_cgroup_commit_charge_swapin(struct page *page, MEM_CGROUP_CHARGE_TYPE_ANON); } -void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) +int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, + gfp_t gfp_mask) { + struct mem_cgroup *memcg = NULL; + enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; + int ret; + if (mem_cgroup_disabled()) - return; - if (!memcg) - return; - __mem_cgroup_cancel_charge(memcg, 1); + return 0; + if (PageCompound(page)) + return 0; + + if (unlikely(!mm)) + mm = &init_mm; + if (!page_is_file_cache(page)) + type = MEM_CGROUP_CHARGE_TYPE_SHMEM; + + if (!PageSwapCache(page)) + ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); + else { /* page is swapcache/shmem */ + ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); + if (!ret) + __mem_cgroup_commit_charge_swapin(page, memcg, type); + } + return ret; } static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, -- cgit v0.10.2 From 62ba7442c8e286b6dd68fe82a3cc85c9414d909b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:39 -0700 Subject: mm: memcg: remove unneeded shmem charge type shmem page charges have not needed a separate charge type to tell them from regular file pages since 08e552c ("memcg: synchronized LRU"). Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3be1afa..17a79e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -379,7 +379,6 @@ static bool move_file(void) enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_ANON, - MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ NR_CHARGE_TYPE, @@ -2902,8 +2901,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (unlikely(!mm)) mm = &init_mm; - if (!page_is_file_cache(page)) - type = MEM_CGROUP_CHARGE_TYPE_SHMEM; if (!PageSwapCache(page)) ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); @@ -3310,10 +3307,8 @@ void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, */ if (PageAnon(page)) ctype = MEM_CGROUP_CHARGE_TYPE_ANON; - else if (page_is_file_cache(page)) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else - ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; /* * The page is committed to the memcg, but it's not actually * charged to the res_counter since we plan on replacing the @@ -3407,10 +3402,6 @@ void mem_cgroup_replace_page_cache(struct page *oldpage, */ if (!memcg) return; - - if (PageSwapBacked(oldpage)) - type = MEM_CGROUP_CHARGE_TYPE_SHMEM; - /* * Even if newpage->mapping was NULL before starting replacement, * the newpage may be on LRU(or pagevec for LRU) already. 
We lock -- cgit v0.10.2 From 24467cacc03cf8b156d4c6786697e5e5d13c9321 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:40 -0700 Subject: mm: memcg: remove needless !mm fixup to init_mm when charging It does not matter to __mem_cgroup_try_charge() if the passed mm is NULL or init_mm, it will charge the root memcg in either case. Also fix up the comment in __mem_cgroup_try_charge() that claimed the init_mm would be charged when no mm was passed. It's not really incorrect, but confusing. Clarify that the root memcg is charged in this case. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 17a79e5..10b3703 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2333,7 +2333,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the * thread group leader migrates. It's possible that mm is not - * set, if so charge the init_mm (happens for pagecache usage). + * set, if so charge the root memcg (happens for pagecache usage). */ if (!*ptr && !mm) *ptr = root_mem_cgroup; @@ -2833,8 +2833,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, ret = 0; return ret; charge_cur_mm: - if (unlikely(!mm)) - mm = &init_mm; ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); if (ret == -EINTR) ret = 0; @@ -2899,9 +2897,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (PageCompound(page)) return 0; - if (unlikely(!mm)) - mm = &init_mm; - if (!PageSwapCache(page)) ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); else { /* page is swapcache/shmem */ -- cgit v0.10.2 From 0435a2fdcb50f8e53a67e98c27708e5d9396b71a Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:43 -0700 Subject: mm: memcg: split swapin charge function into private and public part When shmem is charged upon swapin, it does not need to check twice whether the memory controller is enabled. Also, shmem pages do not have to be checked for everything that regular anon pages have to be checked for, so let shmem use the internal version directly and allow future patches to move around checks that are only required when swapping in anon pages. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 10b3703..4cc962d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2801,18 +2801,14 @@ int mem_cgroup_newpage_charge(struct page *page, * struct page_cgroup is acquired. 
This refcnt will be consumed by * "commit()" or removed by "cancel()" */ -int mem_cgroup_try_charge_swapin(struct mm_struct *mm, - struct page *page, - gfp_t mask, struct mem_cgroup **memcgp) +static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, + struct page *page, + gfp_t mask, + struct mem_cgroup **memcgp) { struct mem_cgroup *memcg; int ret; - *memcgp = NULL; - - if (mem_cgroup_disabled()) - return 0; - if (!do_swap_account) goto charge_cur_mm; /* @@ -2839,6 +2835,15 @@ charge_cur_mm: return ret; } +int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, + gfp_t gfp_mask, struct mem_cgroup **memcgp) +{ + *memcgp = NULL; + if (mem_cgroup_disabled()) + return 0; + return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); +} + void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) { if (mem_cgroup_disabled()) @@ -2900,7 +2905,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!PageSwapCache(page)) ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); else { /* page is swapcache/shmem */ - ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); + ret = __mem_cgroup_try_charge_swapin(mm, page, + gfp_mask, &memcg); if (!ret) __mem_cgroup_commit_charge_swapin(page, memcg, type); } -- cgit v0.10.2 From 90deb78839faedd194b65d419dbd9cba981e1922 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:47 -0700 Subject: mm: memcg: only check swap cache pages for repeated charging Only anon and shmem pages in the swap cache are attempted to be charged multiple times, from every swap pte fault or from shmem_unuse(). No other pages require checking PageCgroupUsed(). Charging pages in the swap cache is also serialized by the page lock, and since both the try_charge and commit_charge are called under the same page lock section, the PageCgroupUsed() check might as well happen before the counter charging, let alone reclaim. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4cc962d..1a2020d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2538,11 +2538,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, bool anon; lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - __mem_cgroup_cancel_charge(memcg, nr_pages); - return; - } + VM_BUG_ON(PageCgroupUsed(pc)); /* * we don't need page_cgroup_lock about tail pages, becase they are not * accessed by any other context at this point. @@ -2807,8 +2803,19 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct mem_cgroup **memcgp) { struct mem_cgroup *memcg; + struct page_cgroup *pc; int ret; + pc = lookup_page_cgroup(page); + /* + * Every swap fault against a single page tries to charge the + * page, bail as early as possible. shmem_unuse() encounters + * already charged pages, too. The USED bit is protected by + * the page lock, which serializes swap cache removal, which + * in turn serializes uncharging. 
+ */ + if (PageCgroupUsed(pc)) + return 0; if (!do_swap_account) goto charge_cur_mm; /* -- cgit v0.10.2 From bdf4f4d2161a795b9323855a81a047bd68f16202 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 31 Jul 2012 16:45:50 -0700 Subject: mm: memcg: only check anon swapin page charges for swap cache shmem knows for sure that the page is in swap cache when attempting to charge a page, because the cache charge entry function has a check for it. Only anon pages may be removed from swap cache already when trying to charge their swapin. Adjust the comment, though: '4969c11 mm: fix swapin race condition' added a stable PageSwapCache check under the page lock in do_swap_page() before calling the memory controller, so it's unuse_pte()'s pte_same() that may fail. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Cc: David Rientjes Cc: Hugh Dickins Cc: Johannes Weiner Cc: Wanpeng Li Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1a2020d..b06833f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2818,14 +2818,6 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, return 0; if (!do_swap_account) goto charge_cur_mm; - /* - * A racing thread's fault, or swapoff, may have already updated - * the pte, and even removed page from swap cache: in those cases - * do_swap_page()'s pte_same() test will fail; but there's also a - * KSM case which does need to charge the page. - */ - if (!PageSwapCache(page)) - goto charge_cur_mm; memcg = try_get_mem_cgroup_from_page(page); if (!memcg) goto charge_cur_mm; @@ -2848,6 +2840,20 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, *memcgp = NULL; if (mem_cgroup_disabled()) return 0; + /* + * A racing thread's fault, or swapoff, may have already + * updated the pte, and even removed page from swap cache: in + * those cases unuse_pte()'s pte_same() test will fail; but + * there's also a KSM case which does need to charge the page. + */ + if (!PageSwapCache(page)) { + int ret; + + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); + if (ret == -EINTR) + ret = 0; + return ret; + } return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); } -- cgit v0.10.2 From 3ad3d901bbcfb15a5e4690e55350db0899095a68 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Tue, 31 Jul 2012 16:45:52 -0700 Subject: mm: mmu_notifier: fix freed page still mapped in secondary MMU mmu_notifier_release() is called when the process is exiting. It will delete all the mmu notifiers. But at this time the page belonging to the process is still present in page tables and is present on the LRU list, so this race will happen: CPU 0 CPU 1 mmu_notifier_release: try_to_unmap: hlist_del_init_rcu(&mn->hlist); ptep_clear_flush_notify: mmu notifier not found free page !!!!!! /* * At this point, the page has been * freed, but it is still mapped in * the secondary MMU. */ mn->ops->release(mn, mm); Then the box is not stable and sometimes we can get this bug: [ 738.075923] BUG: Bad page state in process migrate-perf pfn:03bec [ 738.075931] page:ffffea00000efb00 count:0 mapcount:0 mapping: (null) index:0x8076 [ 738.075936] page flags: 0x20000000000014(referenced|dirty) The same issue is present in mmu_notifier_unregister(). We can call ->release before deleting the notifier to ensure the page has been unmapped from the secondary MMU before it is freed. 
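A condensed, hedged sketch of the release-path ordering the fix establishes (the full mm/mmu_notifier.c hunk follows; the unregister path and the remaining teardown work are elided):

void __mmu_notifier_release(struct mm_struct *mm)
{
        struct mmu_notifier *mn;
        struct hlist_node *n;

        /*
         * Call every ->release() while the notifiers are still on the
         * list, so the secondary MMU drops its mappings before any of
         * the process's pages can be freed.  RCU makes
         * mmu_notifier_unregister() wait until ->release() returns.
         */
        rcu_read_lock();
        hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist)
                if (mn->ops->release)
                        mn->ops->release(mn, mm);
        rcu_read_unlock();

        /* Only then unhash the notifiers, under the list lock. */
        spin_lock(&mm->mmu_notifier_mm->lock);
        while (!hlist_empty(&mm->mmu_notifier_mm->list)) {
                mn = hlist_entry(mm->mmu_notifier_mm->list.first,
                                 struct mmu_notifier, hlist);
                hlist_del_init_rcu(&mn->hlist);
        }
        spin_unlock(&mm->mmu_notifier_mm->lock);
}
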
Signed-off-by: Xiao Guangrong Cc: Avi Kivity Cc: Marcelo Tosatti Cc: Paul Gortmaker Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 9a611d3..862b608 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -33,6 +33,24 @@ void __mmu_notifier_release(struct mm_struct *mm) { struct mmu_notifier *mn; + struct hlist_node *n; + + /* + * RCU here will block mmu_notifier_unregister until + * ->release returns. + */ + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) + /* + * if ->release runs before mmu_notifier_unregister it + * must be handled as it's the only way for the driver + * to flush all existing sptes and stop the driver + * from establishing any more sptes before all the + * pages in the mm are freed. + */ + if (mn->ops->release) + mn->ops->release(mn, mm); + rcu_read_unlock(); spin_lock(&mm->mmu_notifier_mm->lock); while (unlikely(!hlist_empty(&mm->mmu_notifier_mm->list))) { @@ -46,23 +64,6 @@ void __mmu_notifier_release(struct mm_struct *mm) * mmu_notifier_unregister to return. */ hlist_del_init_rcu(&mn->hlist); - /* - * RCU here will block mmu_notifier_unregister until - * ->release returns. - */ - rcu_read_lock(); - spin_unlock(&mm->mmu_notifier_mm->lock); - /* - * if ->release runs before mmu_notifier_unregister it - * must be handled as it's the only way for the driver - * to flush all existing sptes and stop the driver - * from establishing any more sptes before all the - * pages in the mm are freed. - */ - if (mn->ops->release) - mn->ops->release(mn, mm); - rcu_read_unlock(); - spin_lock(&mm->mmu_notifier_mm->lock); } spin_unlock(&mm->mmu_notifier_mm->lock); @@ -284,16 +285,13 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) { BUG_ON(atomic_read(&mm->mm_count) <= 0); - spin_lock(&mm->mmu_notifier_mm->lock); if (!hlist_unhashed(&mn->hlist)) { - hlist_del_rcu(&mn->hlist); - /* * RCU here will force exit_mmap to wait ->release to finish * before freeing the pages. */ rcu_read_lock(); - spin_unlock(&mm->mmu_notifier_mm->lock); + /* * exit_mmap will block in mmu_notifier_release to * guarantee ->release is called before freeing the @@ -302,8 +300,11 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) if (mn->ops->release) mn->ops->release(mn, mm); rcu_read_unlock(); - } else + + spin_lock(&mm->mmu_notifier_mm->lock); + hlist_del_rcu(&mn->hlist); spin_unlock(&mm->mmu_notifier_mm->lock); + } /* * Wait any running method to finish, of course including -- cgit v0.10.2 From e62e384e9da8d9a0c599795464a7e76fd490931c Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 31 Jul 2012 16:45:55 -0700 Subject: memcg: prevent OOM with too many dirty pages The current implementation of dirty pages throttling is not memcg aware which makes it easy to have memcg LRUs full of dirty pages. Without throttling, these LRUs can be scanned faster than the rate of writeback, leading to memcg OOM conditions when the hard limit is small. This patch fixes the problem by throttling the allocating process (possibly a writer) during the hard limit reclaim by waiting on PageReclaim pages. We are waiting only for PageReclaim pages because those are the pages that made one full round over LRU and that means that the writeback is much slower than scanning. The solution is far from being ideal - long term solution is memcg aware dirty throttling - but it is meant to be a band aid until we have a real fix. 
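Condensed from the hunk below, the band aid is a single check in shrink_page_list() (simplified here):

	if (PageWriteback(page)) {
		/*
		 * A PageReclaim page under writeback already made a full
		 * LRU round: in memcg (non-global) reclaim wait for its
		 * writeback instead of skipping it, so a small hard limit
		 * full of dirty pages cannot push us into OOM.
		 */
		if (!global_reclaim(sc) && PageReclaim(page) && may_enter_fs)
			wait_on_page_writeback(page);
		else {
			nr_writeback++;
			unlock_page(page);
			goto keep;
		}
	}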
We are seeing this happening during nightly backups which are placed into containers to prevent from eviction of the real working set. The change affects only memcg reclaim and only when we encounter PageReclaim pages which is a signal that the reclaim doesn't catch up on with the writers so somebody should be throttled. This could be potentially unfair because it could be somebody else from the group who gets throttled on behalf of the writer but as writers need to allocate as well and they allocate in higher rate the probability that only innocent processes would be penalized is not that high. I have tested this change by a simple dd copying /dev/zero to tmpfs or ext3 running under small memcg (1G copy under 5M, 60M, 300M and 2G containers) and dd got killed by OOM killer every time. With the patch I could run the dd with the same size under 5M controller without any OOM. The issue is more visible with slower devices for output. * With the patch ================ * tmpfs size=2G --------------- $ vim cgroup_cache_oom_test.sh $ ./cgroup_cache_oom_test.sh 5M using Limit 5M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 30.4049 s, 34.5 MB/s $ ./cgroup_cache_oom_test.sh 60M using Limit 60M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 31.4561 s, 33.3 MB/s $ ./cgroup_cache_oom_test.sh 300M using Limit 300M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 20.4618 s, 51.2 MB/s $ ./cgroup_cache_oom_test.sh 2G using Limit 2G for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 1.42172 s, 738 MB/s * ext3 ------ $ ./cgroup_cache_oom_test.sh 5M using Limit 5M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 27.9547 s, 37.5 MB/s $ ./cgroup_cache_oom_test.sh 60M using Limit 60M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 30.3221 s, 34.6 MB/s $ ./cgroup_cache_oom_test.sh 300M using Limit 300M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 24.5764 s, 42.7 MB/s $ ./cgroup_cache_oom_test.sh 2G using Limit 2G for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 3.35828 s, 312 MB/s * Without the patch =================== * tmpfs size=2G --------------- $ ./cgroup_cache_oom_test.sh 5M using Limit 5M for group ./cgroup_cache_oom_test.sh: line 46: 4668 Killed dd if=/dev/zero of=$OUT/zero bs=1M count=$count $ ./cgroup_cache_oom_test.sh 60M using Limit 60M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 25.4989 s, 41.1 MB/s $ ./cgroup_cache_oom_test.sh 300M using Limit 300M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 24.3928 s, 43.0 MB/s $ ./cgroup_cache_oom_test.sh 2G using Limit 2G for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 1.49797 s, 700 MB/s * ext3 ------ $ ./cgroup_cache_oom_test.sh 5M using Limit 5M for group ./cgroup_cache_oom_test.sh: line 46: 4689 Killed dd if=/dev/zero of=$OUT/zero bs=1M count=$count $ ./cgroup_cache_oom_test.sh 60M using Limit 60M for group ./cgroup_cache_oom_test.sh: line 46: 4692 Killed dd if=/dev/zero of=$OUT/zero bs=1M count=$count $ ./cgroup_cache_oom_test.sh 300M using Limit 300M for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 20.248 s, 51.8 MB/s $ ./cgroup_cache_oom_test.sh 2G using Limit 2G for group 1000+0 records in 1000+0 records out 1048576000 bytes (1.0 GB) copied, 2.85201 s, 368 
MB/s [akpm@linux-foundation.org: tweak changelog, reordered the test to optimize for CONFIG_CGROUP_MEM_RES_CTLR=n] [hughd@google.com: fix deadlock with loop driver] Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Rik van Riel Cc: Ying Han Cc: Greg Thelen Cc: Hugh Dickins Reviewed-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Fengguang Wu Signed-off-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 8880401..ca43aa0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -720,9 +720,26 @@ static unsigned long shrink_page_list(struct list_head *page_list, (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); if (PageWriteback(page)) { - nr_writeback++; - unlock_page(page); - goto keep; + /* + * memcg doesn't have any dirty pages throttling so we + * could easily OOM just because too many pages are in + * writeback from reclaim and there is nothing else to + * reclaim. + * + * Check may_enter_fs, certainly because a loop driver + * thread might enter reclaim, and deadlock if it waits + * on a page for which it is needed to do the write + * (loop masks off __GFP_IO|__GFP_FS for this reason); + * but more thought would probably show more reasons. + */ + if (!global_reclaim(sc) && PageReclaim(page) && + may_enter_fs) + wait_on_page_writeback(page); + else { + nr_writeback++; + unlock_page(page); + goto keep; + } } references = page_check_references(page, sc); -- cgit v0.10.2 From c3b94f44fcb0725471ecebb701c077a0ed67bd07 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Tue, 31 Jul 2012 16:45:59 -0700 Subject: memcg: further prevent OOM with too many dirty pages The may_enter_fs test turns out to be too restrictive: though I saw no problem with it when testing on 3.5-rc6, it very soon OOMed when I tested on 3.5-rc6-mm1. I don't know what the difference there is, perhaps I just slightly changed the way I started off the testing: dd if=/dev/zero of=/mnt/temp bs=1M count=1024; rm -f /mnt/temp; sync repeatedly, in 20M memory.limit_in_bytes cgroup to ext4 on USB stick. ext4 (and gfs2 and xfs) turn out to allocate new pages for writing with AOP_FLAG_NOFS: that seems a little worrying, and it's unclear to me why the transaction needs to be started even before allocating pagecache memory. But it may not be worth worrying about these days: if direct reclaim avoids FS writeback, does __GFP_FS now mean anything? Anyway, we insisted on the may_enter_fs test to avoid hangs with the loop device; but since that also masks off __GFP_IO, we can test for __GFP_IO directly, ignoring may_enter_fs and __GFP_FS. But even so, the test still OOMs sometimes: when originally testing on 3.5-rc6, it OOMed about one time in five or ten; when testing just now on 3.5-rc6-mm1, it OOMed on the first iteration. This residual problem comes from an accumulation of pages under ordinary writeback, not marked PageReclaim, so rightly not causing the memcg check to wait on their writeback: these too can prevent shrink_page_list() from freeing any pages, so many times that memcg reclaim fails and OOMs. Deal with these in the same way as direct reclaim now deals with dirty FS pages: mark them PageReclaim. It is appropriate to rotate these to tail of list when writepage completes, but more importantly, the PageReclaim flag makes memcg reclaim wait on them if encountered again. Increment NR_VMSCAN_IMMEDIATE? That's arguable: I chose not. 
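Paraphrasing the hunk below, the writeback block in shrink_page_list() ends up as: test __GFP_IO instead of may_enter_fs, and tag not-yet-PageReclaim writeback pages so the next pass can wait on them:

	if (PageWriteback(page)) {
		if (global_reclaim(sc) ||
		    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
			/* make memcg reclaim wait when it meets the page again */
			SetPageReclaim(page);
			nr_writeback++;
			goto keep_locked;
		}
		wait_on_page_writeback(page);
	}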
Setting PageReclaim here may occasionally race with end_page_writeback() clearing it: lru_deactivate_fn() already faced the same race, and correctly concluded that the window is small and the issue non-critical. With these changes, the test runs indefinitely without OOMing on ext4, ext3 and ext2: I'll move on to test with other filesystems later. Trivia: invert conditions for a clearer block without an else, and goto keep_locked to do the unlock_page. Signed-off-by: Hugh Dickins Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Rik van Riel Cc: Ying Han Cc: Greg Thelen Cc: Hugh Dickins Cc: Mel Gorman Cc: Johannes Weiner Cc: Fengguang Wu Acked-by: Michal Hocko Cc: Dave Chinner Cc: Theodore Ts'o Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index ca43aa0..e37e687 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -723,23 +723,38 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* * memcg doesn't have any dirty pages throttling so we * could easily OOM just because too many pages are in - * writeback from reclaim and there is nothing else to - * reclaim. + * writeback and there is nothing else to reclaim. * - * Check may_enter_fs, certainly because a loop driver + * Check __GFP_IO, certainly because a loop driver * thread might enter reclaim, and deadlock if it waits * on a page for which it is needed to do the write * (loop masks off __GFP_IO|__GFP_FS for this reason); * but more thought would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into + * the FS, just waiting on its writeback completion. + * Worryingly, ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so + * testing may_enter_fs here is liable to OOM on them. */ - if (!global_reclaim(sc) && PageReclaim(page) && - may_enter_fs) - wait_on_page_writeback(page); - else { + if (global_reclaim(sc) || + !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { + /* + * This is slightly racy - end_page_writeback() + * might have just cleared PageReclaim, then + * setting PageReclaim here end up interpreted + * as PageReadahead - but that does not matter + * enough to care. What we do want is for this + * page to have PageReclaim set next time memcg + * reclaim reaches the tests above, so it will + * then wait_on_page_writeback() to avoid OOM; + * and it's also appropriate in global reclaim. + */ + SetPageReclaim(page); nr_writeback++; - unlock_page(page); - goto keep; + goto keep_locked; } + wait_on_page_writeback(page); } references = page_check_references(page, sc); -- cgit v0.10.2 From b214514592d2dcb0b9d14ee8dd14f3699e3b0a84 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 31 Jul 2012 16:46:01 -0700 Subject: memcg: add mem_cgroup_from_css() helper Add a mem_cgroup_from_css() helper to replace open-coded invokations of container_of(). To clarify the code and to add a little more type safety. 
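As a reminder of what the helper wraps: container_of() maps a pointer to an embedded member back to its containing structure, roughly (illustrative only - the real macro in <linux/kernel.h> also type-checks ptr via typeof()):

#define container_of_simplified(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

Giving it a typed wrapper such as mem_cgroup_from_css(struct cgroup_subsys_state *s) means the compiler now rejects callers that pass anything other than a css pointer, instead of silently doing bogus pointer arithmetic.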
[akpm@linux-foundation.org: fix extensive breakage] Signed-off-by: Wanpeng Li Acked-by: Michal Hocko Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Gavin Shan Cc: Wanpeng Li Cc: Gavin Shan Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b06833f..795e525 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -405,6 +405,12 @@ enum charge_type { static void mem_cgroup_get(struct mem_cgroup *memcg); static void mem_cgroup_put(struct mem_cgroup *memcg); +static inline +struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) +{ + return container_of(s, struct mem_cgroup, css); +} + /* Writing them here to avoid exposing memcg's inner layout */ #ifdef CONFIG_MEMCG_KMEM #include @@ -862,9 +868,8 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page) struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont) { - return container_of(cgroup_subsys_state(cont, - mem_cgroup_subsys_id), struct mem_cgroup, - css); + return mem_cgroup_from_css( + cgroup_subsys_state(cont, mem_cgroup_subsys_id)); } struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) @@ -877,8 +882,7 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) if (unlikely(!p)) return NULL; - return container_of(task_subsys_state(p, mem_cgroup_subsys_id), - struct mem_cgroup, css); + return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); } struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) @@ -964,8 +968,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, css = css_get_next(&mem_cgroup_subsys, id + 1, &root->css, &id); if (css) { if (css == &root->css || css_tryget(css)) - memcg = container_of(css, - struct mem_cgroup, css); + memcg = mem_cgroup_from_css(css); } else id = 0; rcu_read_unlock(); @@ -2494,7 +2497,7 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) css = css_lookup(&mem_cgroup_subsys, id); if (!css) return NULL; - return container_of(css, struct mem_cgroup, css); + return mem_cgroup_from_css(css); } struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) -- cgit v0.10.2 From 5b760e64a64c8940cdccd0ba6fce19a9bd010d20 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:46:02 -0700 Subject: mm/sparse: optimize sparse_index_alloc With CONFIG_SPARSEMEM_EXTREME, the two levels of memory section descriptors are allocated from slab or bootmem. When allocating from slab, let slab/bootmem allocator clear the memory chunk. We needn't clear it explicitly. 
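The zeroing the memset() used to do is already provided by the allocators: kzalloc()/kzalloc_node() are the __GFP_ZERO variants of kmalloc()/kmalloc_node() (paraphrased from <linux/slab.h>), and bootmem allocations are handed out zeroed as well.

static inline void *kzalloc(size_t size, gfp_t flags)
{
	return kmalloc(size, flags | __GFP_ZERO);
}

static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
{
	return kmalloc_node(size, flags | __GFP_ZERO, node);
}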
Signed-off-by: Gavin Shan Reviewed-by: Michal Hocko Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/sparse.c b/mm/sparse.c index 950981f..fa933f4 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -65,14 +65,12 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) if (slab_is_available()) { if (node_state(nid, N_HIGH_MEMORY)) - section = kmalloc_node(array_size, GFP_KERNEL, nid); + section = kzalloc_node(array_size, GFP_KERNEL, nid); else - section = kmalloc(array_size, GFP_KERNEL); - } else + section = kzalloc(array_size, GFP_KERNEL); + } else { section = alloc_bootmem_node(NODE_DATA(nid), array_size); - - if (section) - memset(section, 0, array_size); + } return section; } -- cgit v0.10.2 From db36a46113e101a8aa2d6ede41e78f2eaabed3f1 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:46:04 -0700 Subject: mm/sparse: more checks on mem_section number __section_nr() was implemented to retrieve the corresponding memory section number according to its descriptor. It's possible that the specified memory section descriptor doesn't exist in the global array. So add more checking on that and report an error for a wrong case. Signed-off-by: Gavin Shan Acked-by: David Rientjes Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/sparse.c b/mm/sparse.c index fa933f4..42ca0ea 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -130,6 +130,8 @@ int __section_nr(struct mem_section* ms) break; } + VM_BUG_ON(root_nr == NR_SECTION_ROOTS); + return (root_nr * SECTIONS_PER_ROOT) + (ms - root); } -- cgit v0.10.2 From c1c9518331969f97ea403bac66f0fd4a85d204d5 Mon Sep 17 00:00:00 2001 From: Gavin Shan Date: Tue, 31 Jul 2012 16:46:06 -0700 Subject: mm/sparse: remove index_init_lock sparse_index_init() uses the index_init_lock spinlock to protect root mem_section assignment. The lock is not necessary anymore because the function is called only during boot (during paging init which is executed only from a single CPU) and from the hotplug code (by add_memory() via arch_add_memory()) which uses mem_hotplug_mutex. The lock was introduced by 28ae55c9 ("sparsemem extreme: hotplug preparation") and sparse_index_init() was used only during boot at that time. Later when the hotplug code (and add_memory()) was introduced there was no synchronization so it was possible to online more sections from the same root probably (though I am not 100% sure about that). The first synchronization has been added by 6ad696d2 ("mm: allow memory hotplug and hibernation in the same kernel") which was later replaced by the mem_hotplug_mutex - 20d6c96b ("mem-hotplug: introduce {un}lock_memory_hotplug()"). Let's remove the lock as it is not needed and it makes the code more confusing. 
[mhocko@suse.cz: changelog] Signed-off-by: Gavin Shan Reviewed-by: Michal Hocko Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/sparse.c b/mm/sparse.c index 42ca0ea..fac95f2 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -77,7 +77,6 @@ static struct mem_section noinline __init_refok *sparse_index_alloc(int nid) static int __meminit sparse_index_init(unsigned long section_nr, int nid) { - static DEFINE_SPINLOCK(index_init_lock); unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; int ret = 0; @@ -88,20 +87,9 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) section = sparse_index_alloc(nid); if (!section) return -ENOMEM; - /* - * This lock keeps two different sections from - * reallocating for the same index - */ - spin_lock(&index_init_lock); - - if (mem_section[root]) { - ret = -EEXIST; - goto out; - } mem_section[root] = section; -out: - spin_unlock(&index_init_lock); + return ret; } #else /* !SPARSEMEM_EXTREME */ -- cgit v0.10.2 From 69980e31754ef23307d51372e61bf7c2584f8a4b Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Tue, 31 Jul 2012 16:46:08 -0700 Subject: memcg: gix memory accounting scalability in shrink_page_list I noticed in a multi-process parallel files reading benchmark I ran on a 8 socket machine, throughput slowed down by a factor of 8 when I ran the benchmark within a cgroup container. I traced the problem to the following code path (see below) when we are trying to reclaim memory from file cache. The res_counter_uncharge function is called on every page that's reclaimed and created heavy lock contention. The patch below allows the reclaimed pages to be uncharged from the resource counter in batch and recovered the regression. Tim 40.67% usemem [kernel.kallsyms] [k] _raw_spin_lock | --- _raw_spin_lock | |--92.61%-- res_counter_uncharge | | | |--100.00%-- __mem_cgroup_uncharge_common | | | | | |--100.00%-- mem_cgroup_uncharge_cache_page | | | __remove_mapping | | | shrink_page_list | | | shrink_inactive_list | | | shrink_mem_cgroup_zone | | | shrink_zone | | | do_try_to_free_pages | | | try_to_free_pages | | | __alloc_pages_nodemask | | | alloc_pages_current Signed-off-by: Tim Chen Acked-by: KAMEZAWA Hiroyuki Acked-by: Johannes Weiner Acked-by: Kirill A. Shutemov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index e37e687..8d01243 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -687,6 +687,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, cond_resched(); + mem_cgroup_uncharge_start(); while (!list_empty(page_list)) { enum page_references references; struct address_space *mapping; @@ -953,6 +954,7 @@ keep: list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); + mem_cgroup_uncharge_end(); *ret_nr_dirty += nr_dirty; *ret_nr_writeback += nr_writeback; return nr_reclaimed; -- cgit v0.10.2 From 93180cec0064d072a984f91c429c5e6bcc816976 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:46:11 -0700 Subject: mips: zero out pg_data_t when it's allocated This patch is preparation for the next patch which removes the zeroing of the pg_data_t in core MM. All archs except MIPS already do this. 
Signed-off-by: Minchan Kim Cc: Ralf Baechle Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index b105eca..cd8fcab 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -401,6 +401,7 @@ static void __init node_mem_init(cnodeid_t node) * Allocate the node data structures on the node first. */ __node_data[node] = __va(slot_freepfn << PAGE_SHIFT); + memset(__node_data[node], 0, PAGE_SIZE); NODE_DATA(node)->bdata = &bootmem_node_data[node]; NODE_DATA(node)->node_start_pfn = start_pfn; -- cgit v0.10.2 From 88fdf75d1bb51d85ba00c466391770056d44bc03 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:46:14 -0700 Subject: mm: warn if pg_data_t isn't initialized with zero Warn if memory-hotplug/boot code doesn't initialize pg_data_t with zero when it is allocated. Arch code and memory hotplug already initiailize pg_data_t. So this warning should never happen. I select fields randomly near the beginning, middle and end of pg_data_t for checking. This patch isn't for performance but for removing initialization code which is necessary to add whenever we adds new field to pg_data_t or zone. Firstly, Andrew suggested clearing out of pg_data_t in MM core part but Tejun doesn't like it because in the future, some archs can initialize some fields in arch code and pass them into general MM part so blindly clearing it out in mm core part would be very annoying. Signed-off-by: Minchan Kim Cc: Tejun Heo Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 48aee0f..a6f7576 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4515,6 +4515,9 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, { pg_data_t *pgdat = NODE_DATA(nid); + /* pg_data_t should be reset to zero when it's allocated */ + WARN_ON(pgdat->nr_zones || pgdat->node_start_pfn || pgdat->classzone_idx); + pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; calculate_node_totalpages(pgdat, zones_size, zholes_size); -- cgit v0.10.2 From 6527af5d1bea219d64095a5e30c1b1e0868aae16 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 31 Jul 2012 16:46:16 -0700 Subject: mm: remove redundant initialization pg_data_t is zeroed before reaching free_area_init_core(), so remove the now unnecessary initializations. Signed-off-by: Minchan Kim Cc: Tejun Heo Cc: Ralf Baechle Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 65efb92..ad2cfd5 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -179,11 +179,6 @@ extern void zone_statistics(struct zone *, struct zone *, gfp_t gfp); #define add_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, __d) #define sub_zone_page_state(__z, __i, __d) mod_zone_page_state(__z, __i, -(__d)) -static inline void zap_zone_vm_stats(struct zone *zone) -{ - memset(zone->vm_stat, 0, sizeof(zone->vm_stat)); -} - extern void inc_zone_state(struct zone *, enum zone_stat_item); #ifdef CONFIG_SMP diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6f7576..889532b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4377,6 +4377,8 @@ void __init set_pageblock_order(void) * - mark all pages reserved * - mark all memory queues empty * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. 
*/ static void __paginginit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) @@ -4387,10 +4389,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, int ret; pgdat_resize_init(pgdat); - pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); - pgdat->kswapd_max_order = 0; pgdat_page_cgroup_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { @@ -4451,11 +4451,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); lruvec_init(&zone->lruvec, zone); - zap_zone_vm_stats(zone); - zone->flags = 0; -#ifdef CONFIG_MEMORY_ISOLATION - zone->nr_pageblock_isolate = 0; -#endif if (!size) continue; -- cgit v0.10.2 From 09c231cb8bfdc35e7d896850d34440b8553b084f Mon Sep 17 00:00:00 2001 From: Nathan Zimmer Date: Tue, 31 Jul 2012 16:46:17 -0700 Subject: tmpfs: distribute interleave better across nodes When tmpfs has the interleave memory policy, it always starts allocating for each file from node 0 at offset 0. When there are many small files, the lower nodes fill up disproportionately. This patch spreads out node usage by starting files at nodes other than 0, by using the inode number to bias the starting node for interleave. Signed-off-by: Nathan Zimmer Signed-off-by: Hugh Dickins Cc: Christoph Lameter Cc: Nick Piggin Cc: Lee Schermerhorn Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index c15b998..d4e184e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -929,7 +929,8 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = index; + /* Bias interleave by inode number to distribute better across nodes */ + pvma.vm_pgoff = index + info->vfs_inode.i_ino; pvma.vm_ops = NULL; pvma.vm_policy = spol; return swapin_readahead(swap, gfp, &pvma, 0); @@ -942,7 +943,8 @@ static struct page *shmem_alloc_page(gfp_t gfp, /* Create a pseudo vma that just contains the policy */ pvma.vm_start = 0; - pvma.vm_pgoff = index; + /* Bias interleave by inode number to distribute better across nodes */ + pvma.vm_pgoff = index + info->vfs_inode.i_ino; pvma.vm_ops = NULL; pvma.vm_policy = mpol_shared_policy_lookup(&info->policy, index); -- cgit v0.10.2 From d833352a4338dc31295ed832a30c9ccff5c7a183 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 31 Jul 2012 16:46:20 -0700 Subject: mm: hugetlbfs: close race during teardown of hugetlbfs shared page tables If a process creates a large hugetlbfs mapping that is eligible for page table sharing and forks heavily with children some of whom fault and others which destroy the mapping then it is possible for page tables to get corrupted. Some teardowns of the mapping encounter a "bad pmd" and output a message to the kernel log. The final teardown will trigger a BUG_ON in mm/filemap.c. This was reproduced in 3.4 but is known to have existed for a long time and goes back at least as far as 2.6.37. It was probably was introduced in 2.6.20 by [39dde65c: shared page table for hugetlb page]. The messages look like this; [ ..........] Lots of bad pmd messages followed by this [ 127.164256] mm/memory.c:391: bad pmd ffff880412e04fe8(80000003de4000e7). [ 127.164257] mm/memory.c:391: bad pmd ffff880412e04ff0(80000003de6000e7). [ 127.164258] mm/memory.c:391: bad pmd ffff880412e04ff8(80000003de0000e7). 
[ 127.186778] ------------[ cut here ]------------ [ 127.186781] kernel BUG at mm/filemap.c:134! [ 127.186782] invalid opcode: 0000 [#1] SMP [ 127.186783] CPU 7 [ 127.186784] Modules linked in: af_packet cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq mperf ext3 jbd dm_mod coretemp crc32c_intel usb_storage ghash_clmulni_intel aesni_intel i2c_i801 r8169 mii uas sr_mod cdrom sg iTCO_wdt iTCO_vendor_support shpchp serio_raw cryptd aes_x86_64 e1000e pci_hotplug dcdbas aes_generic container microcode ext4 mbcache jbd2 crc16 sd_mod crc_t10dif i915 drm_kms_helper drm i2c_algo_bit ehci_hcd ahci libahci usbcore rtc_cmos usb_common button i2c_core intel_agp video intel_gtt fan processor thermal thermal_sys hwmon ata_generic pata_atiixp libata scsi_mod [ 127.186801] [ 127.186802] Pid: 9017, comm: hugetlbfs-test Not tainted 3.4.0-autobuild #53 Dell Inc. OptiPlex 990/06D7TR [ 127.186804] RIP: 0010:[] [] __delete_from_page_cache+0x15e/0x160 [ 127.186809] RSP: 0000:ffff8804144b5c08 EFLAGS: 00010002 [ 127.186810] RAX: 0000000000000001 RBX: ffffea000a5c9000 RCX: 00000000ffffffc0 [ 127.186811] RDX: 0000000000000000 RSI: 0000000000000009 RDI: ffff88042dfdad00 [ 127.186812] RBP: ffff8804144b5c18 R08: 0000000000000009 R09: 0000000000000003 [ 127.186813] R10: 0000000000000000 R11: 000000000000002d R12: ffff880412ff83d8 [ 127.186814] R13: ffff880412ff83d8 R14: 0000000000000000 R15: ffff880412ff83d8 [ 127.186815] FS: 00007fe18ed2c700(0000) GS:ffff88042dce0000(0000) knlGS:0000000000000000 [ 127.186816] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 127.186817] CR2: 00007fe340000503 CR3: 0000000417a14000 CR4: 00000000000407e0 [ 127.186818] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 127.186819] DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 [ 127.186820] Process hugetlbfs-test (pid: 9017, threadinfo ffff8804144b4000, task ffff880417f803c0) [ 127.186821] Stack: [ 127.186822] ffffea000a5c9000 0000000000000000 ffff8804144b5c48 ffffffff810ed83b [ 127.186824] ffff8804144b5c48 000000000000138a 0000000000001387 ffff8804144b5c98 [ 127.186825] ffff8804144b5d48 ffffffff811bc925 ffff8804144b5cb8 0000000000000000 [ 127.186827] Call Trace: [ 127.186829] [] delete_from_page_cache+0x3b/0x80 [ 127.186832] [] truncate_hugepages+0x115/0x220 [ 127.186834] [] hugetlbfs_evict_inode+0x13/0x30 [ 127.186837] [] evict+0xa7/0x1b0 [ 127.186839] [] iput_final+0xd3/0x1f0 [ 127.186840] [] iput+0x39/0x50 [ 127.186842] [] d_kill+0xf8/0x130 [ 127.186843] [] dput+0xd2/0x1a0 [ 127.186845] [] __fput+0x170/0x230 [ 127.186848] [] ? rb_erase+0xce/0x150 [ 127.186849] [] fput+0x1d/0x30 [ 127.186851] [] remove_vma+0x37/0x80 [ 127.186853] [] do_munmap+0x2d2/0x360 [ 127.186855] [] sys_shmdt+0xc9/0x170 [ 127.186857] [] system_call_fastpath+0x16/0x1b [ 127.186858] Code: 0f 1f 44 00 00 48 8b 43 08 48 8b 00 48 8b 40 28 8b b0 40 03 00 00 85 f6 0f 88 df fe ff ff 48 89 df e8 e7 cb 05 00 e9 d2 fe ff ff <0f> 0b 55 83 e2 fd 48 89 e5 48 83 ec 30 48 89 5d d8 4c 89 65 e0 [ 127.186868] RIP [] __delete_from_page_cache+0x15e/0x160 [ 127.186870] RSP [ 127.186871] ---[ end trace 7cbac5d1db69f426 ]--- The bug is a race and not always easy to reproduce. To reproduce it I was doing the following on a single socket I7-based machine with 16G of RAM. 
$ hugeadm --pool-pages-max DEFAULT:13G $ echo $((18*1048576*1024)) > /proc/sys/kernel/shmmax $ echo $((18*1048576*1024)) > /proc/sys/kernel/shmall $ for i in `seq 1 9000`; do ./hugetlbfs-test; done On my particular machine, it usually triggers within 10 minutes but enabling debug options can change the timing such that it never hits. Once the bug is triggered, the machine is in trouble and needs to be rebooted. The machine will respond but processes accessing proc like "ps aux" will hang due to the BUG_ON. shutdown will also hang and needs a hard reset or a sysrq-b. The basic problem is a race between page table sharing and teardown. For the most part page table sharing depends on i_mmap_mutex. In some cases, it is also taking the mm->page_table_lock for the PTE updates but with shared page tables, it is the i_mmap_mutex that is more important. Unfortunately it appears to be also insufficient. Consider the following situation Process A Process B --------- --------- hugetlb_fault shmdt LockWrite(mmap_sem) do_munmap unmap_region unmap_vmas unmap_single_vma unmap_hugepage_range Lock(i_mmap_mutex) Lock(mm->page_table_lock) huge_pmd_unshare/unmap tables <--- (1) Unlock(mm->page_table_lock) Unlock(i_mmap_mutex) huge_pte_alloc ... Lock(i_mmap_mutex) ... vma_prio_walk, find svma, spte ... Lock(mm->page_table_lock) ... share spte ... Unlock(mm->page_table_lock) ... Unlock(i_mmap_mutex) ... hugetlb_no_page <--- (2) free_pgtables unlink_file_vma hugetlb_free_pgd_range remove_vma_list In this scenario, it is possible for Process A to share page tables with Process B that is trying to tear them down. The i_mmap_mutex on its own does not prevent Process A walking Process B's page tables. At (1) above, the page tables are not shared yet so it unmaps the PMDs. Process A sets up page table sharing and at (2) faults a new entry. Process B then trips up on it in free_pgtables. This patch fixes the problem by adding a new function __unmap_hugepage_range_final that is only called when the VMA is about to be destroyed. This function clears VM_MAYSHARE during unmap_hugepage_range() under the i_mmap_mutex. This makes the VMA ineligible for sharing and avoids the race. Superficially this looks like it would then be vunerable to truncate and madvise issues but hugetlbfs has its own truncate handlers so does not use unmap_mapping_range() and does not support madvise(DONTNEED). This should be treated as a -stable candidate if it is merged. Test program is as follows. The test case was mostly written by Michal Hocko with a few minor changes to reproduce this bug. ==== CUT HERE ==== static size_t huge_page_size = (2UL << 20); static size_t nr_huge_page_A = 512; static size_t nr_huge_page_B = 5632; unsigned int get_random(unsigned int max) { struct timeval tv; gettimeofday(&tv, NULL); srandom(tv.tv_usec); return random() % max; } static void play(void *addr, size_t size) { unsigned char *start = addr, *end = start + size, *a; start += get_random(size/2); /* we could itterate on huge pages but let's give it more time. 
*/ for (a = start; a < end; a += 4096) *a = 0; } int main(int argc, char **argv) { key_t key = IPC_PRIVATE; size_t sizeA = nr_huge_page_A * huge_page_size; size_t sizeB = nr_huge_page_B * huge_page_size; int shmidA, shmidB; void *addrA = NULL, *addrB = NULL; int nr_children = 300, n = 0; if ((shmidA = shmget(key, sizeA, IPC_CREAT|SHM_HUGETLB|0660)) == -1) { perror("shmget:"); return 1; } if ((addrA = shmat(shmidA, addrA, SHM_R|SHM_W)) == (void *)-1UL) { perror("shmat"); return 1; } if ((shmidB = shmget(key, sizeB, IPC_CREAT|SHM_HUGETLB|0660)) == -1) { perror("shmget:"); return 1; } if ((addrB = shmat(shmidB, addrB, SHM_R|SHM_W)) == (void *)-1UL) { perror("shmat"); return 1; } fork_child: switch(fork()) { case 0: switch (n%3) { case 0: play(addrA, sizeA); break; case 1: play(addrB, sizeB); break; case 2: break; } break; case -1: perror("fork:"); break; default: if (++n < nr_children) goto fork_child; play(addrA, sizeA); break; } shmdt(addrA); shmdt(addrB); do { wait(NULL); } while (--n > 0); shmctl(shmidA, IPC_RMID, NULL); shmctl(shmidB, IPC_RMID, NULL); return 0; } [akpm@linux-foundation.org: name the declaration's args, fix CONFIG_HUGETLBFS=n build] Signed-off-by: Hugh Dickins Reviewed-by: Michal Hocko Signed-off-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index f9db20b..2251648 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -48,6 +48,10 @@ int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, unsigned long *, int *, int, unsigned int flags); void unmap_hugepage_range(struct vm_area_struct *, unsigned long, unsigned long, struct page *); +void __unmap_hugepage_range_final(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct page *ref_page); void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page); @@ -130,6 +134,13 @@ static inline void copy_huge_page(struct page *dst, struct page *src) #define hugetlb_change_protection(vma, address, end, newprot) +static inline void __unmap_hugepage_range_final(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + BUG(); +} + static inline void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c39e4be..bc72712 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2429,6 +2429,25 @@ again: tlb_end_vma(tlb, vma); } +void __unmap_hugepage_range_final(struct mmu_gather *tlb, + struct vm_area_struct *vma, unsigned long start, + unsigned long end, struct page *ref_page) +{ + __unmap_hugepage_range(tlb, vma, start, end, ref_page); + + /* + * Clear this flag so that x86's huge_pmd_share page_table_shareable + * test will fail on a vma being torn down, and not grab a page table + * on its way out. We're lucky that the flag has such an appropriate + * name, and can in fact be safely cleared here. We could clear it + * before the __unmap_hugepage_range above, but all that's necessary + * is to clear it before releasing the i_mmap_mutex. This works + * because in the context this is called, the VMA is about to be + * destroyed and the i_mmap_mutex is held. 
+ */ + vma->vm_flags &= ~VM_MAYSHARE; +} + void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { @@ -3012,9 +3031,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma, } } spin_unlock(&mm->page_table_lock); - mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); - + /* + * Must flush TLB before releasing i_mmap_mutex: x86's huge_pmd_unshare + * may have cleared our pud entry and done put_page on the page table: + * once we release i_mmap_mutex, another task can do the final put_page + * and that page table be reused and filled with junk. + */ flush_tlb_range(vma, start, end); + mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } int hugetlb_reserve_pages(struct inode *inode, diff --git a/mm/memory.c b/mm/memory.c index ec72a61..482f089 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1345,7 +1345,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, */ if (vma->vm_file) { mutex_lock(&vma->vm_file->f_mapping->i_mmap_mutex); - __unmap_hugepage_range(tlb, vma, start, end, NULL); + __unmap_hugepage_range_final(tlb, vma, start, end, NULL); mutex_unlock(&vma->vm_file->f_mapping->i_mmap_mutex); } } else -- cgit v0.10.2 From 7ead55119ba58523c31f52365b5eb33f3bb34b3e Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Tue, 31 Jul 2012 16:46:22 -0700 Subject: rtc/rtc-88pm80x: assign ret only when rtc_register_driver fails At the probe we are assigning ret to return value of PTR_ERR right after the rtc_register_drive()r, as we would have done it in the if (IS_ERR(ptr)) check, since the function fails and goes inside that case Signed-off-by: Devendra Naga Cc: Alessandro Zummo Cc: Ashish Jangam Cc: David Dajun Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c index a2f956d..7e050b4 100644 --- a/drivers/rtc/rtc-88pm80x.c +++ b/drivers/rtc/rtc-88pm80x.c @@ -314,8 +314,8 @@ static int __devinit pm80x_rtc_probe(struct platform_device *pdev) info->rtc_dev = rtc_device_register("88pm80x-rtc", &pdev->dev, &pm80x_rtc_ops, THIS_MODULE); - ret = PTR_ERR(info->rtc_dev); if (IS_ERR(info->rtc_dev)) { + ret = PTR_ERR(info->rtc_dev); dev_err(&pdev->dev, "Failed to register RTC device: %d\n", ret); goto out_rtc; } -- cgit v0.10.2 From 437ea90cc3afdca5229b41c6b1d38c4842756cb9 Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Tue, 31 Jul 2012 16:46:24 -0700 Subject: rtc/rtc-88pm80x: remove unneed devm_kfree devm_kzalloc() doesn't need a matching devm_kfree(), the freeing mechanism will trigger when driver unloads. Signed-off-by: Devendra Naga Cc: Alessandro Zummo Cc: Ashish Jangam Cc: David Dajun Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-88pm80x.c b/drivers/rtc/rtc-88pm80x.c index 7e050b4..6367984 100644 --- a/drivers/rtc/rtc-88pm80x.c +++ b/drivers/rtc/rtc-88pm80x.c @@ -339,7 +339,6 @@ static int __devinit pm80x_rtc_probe(struct platform_device *pdev) out_rtc: pm80x_free_irq(chip, info->irq, info); out: - devm_kfree(&pdev->dev, info); return ret; } @@ -349,7 +348,6 @@ static int __devexit pm80x_rtc_remove(struct platform_device *pdev) platform_set_drvdata(pdev, NULL); rtc_device_unregister(info->rtc_dev); pm80x_free_irq(info->chip, info->irq, info); - devm_kfree(&pdev->dev, info); return 0; } -- cgit v0.10.2
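More generally, the devm pattern the last patch relies on looks like this (hypothetical driver, all names made up): memory obtained with devm_kzalloc() is attached to the device and released by devres when the driver is unbound, so neither the probe error path nor remove() frees it by hand.

static int __devinit example_probe(struct platform_device *pdev)
{
	struct example_info *info;	/* hypothetical per-device state */

	info = devm_kzalloc(&pdev->dev, sizeof(*info), GFP_KERNEL);
	if (!info)
		return -ENOMEM;

	platform_set_drvdata(pdev, info);
	return 0;
}

static int __devexit example_remove(struct platform_device *pdev)
{
	platform_set_drvdata(pdev, NULL);
	/* no devm_kfree()/kfree() here: devres releases 'info' automatically */
	return 0;
}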