From 1d46598b7903cd5ec83c49adbd741f43bb0ffcdc Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 4 Jun 2014 16:05:30 -0700 Subject: tools/vm/page-types.c: catch sigbus if raced with truncate Recently added page-cache dumping is known to be a little bit racy. But after race with truncate it just dies due to unhandled SIGBUS when it tries to poke pages beyond the new end of file. This patch adds handler for SIGBUS which skips the rest of the file. Signed-off-by: Konstantin Khlebnikov Cc: Naoya Horiguchi Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index 05654f5..c4d6d2e 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -32,6 +32,8 @@ #include #include #include +#include +#include #include #include #include @@ -824,21 +826,38 @@ static void show_file(const char *name, const struct stat *st) atime, now - st->st_atime); } +static sigjmp_buf sigbus_jmp; + +static void * volatile sigbus_addr; + +static void sigbus_handler(int sig, siginfo_t *info, void *ucontex) +{ + (void)sig; + (void)ucontex; + sigbus_addr = info ? 
info->si_addr : NULL; + siglongjmp(sigbus_jmp, 1); +} + +static struct sigaction sigbus_action = { + .sa_sigaction = sigbus_handler, + .sa_flags = SA_SIGINFO, +}; + static void walk_file(const char *name, const struct stat *st) { uint8_t vec[PAGEMAP_BATCH]; uint64_t buf[PAGEMAP_BATCH], flags; unsigned long nr_pages, pfn, i; + off_t off, end = st->st_size; int fd; - off_t off; ssize_t len; void *ptr; int first = 1; fd = checked_open(name, O_RDONLY|O_NOATIME|O_NOFOLLOW); - for (off = 0; off < st->st_size; off += len) { - nr_pages = (st->st_size - off + page_size - 1) / page_size; + for (off = 0; off < end; off += len) { + nr_pages = (end - off + page_size - 1) / page_size; if (nr_pages > PAGEMAP_BATCH) nr_pages = PAGEMAP_BATCH; len = nr_pages * page_size; @@ -855,11 +874,19 @@ static void walk_file(const char *name, const struct stat *st) if (madvise(ptr, len, MADV_RANDOM)) fatal("madvice failed: %s", name); + if (sigsetjmp(sigbus_jmp, 1)) { + end = off + sigbus_addr ? sigbus_addr - ptr : 0; + fprintf(stderr, "got sigbus at offset %lld: %s\n", + (long long)end, name); + goto got_sigbus; + } + /* populate ptes */ for (i = 0; i < nr_pages ; i++) { if (vec[i] & 1) (void)*(volatile int *)(ptr + i * page_size); } +got_sigbus: /* turn off harvesting reference bits */ if (madvise(ptr, len, MADV_SEQUENTIAL)) @@ -910,6 +937,7 @@ static void walk_page_cache(void) kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); pagemap_fd = checked_open("/proc/self/pagemap", O_RDONLY); + sigaction(SIGBUS, &sigbus_action, NULL); if (stat(opt_file, &st)) fatal("stat failed: %s\n", opt_file); @@ -925,6 +953,7 @@ static void walk_page_cache(void) close(kpageflags_fd); close(pagemap_fd); + signal(SIGBUS, SIG_DFL); } static void parse_file(const char *name) -- cgit v0.10.2 From f8f1ec73b5b1ca05d2b8eef8283bc953a16901b5 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Jun 2014 16:05:31 -0700 Subject: MAINTAINERS: add Joe as the get_maintainer.pl maintainer Might as well be the 
get_maintainer maintainer... Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index abf98f0..586cd06 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3882,6 +3882,11 @@ L: kvm@vger.kernel.org S: Supported F: drivers/uio/uio_pci_generic.c +GET_MAINTAINER SCRIPT +M: Joe Perches +S: Maintained +F: scripts/get_maintainer.pl + GFS2 FILE SYSTEM M: Steven Whitehouse L: cluster-devel@redhat.com -- cgit v0.10.2 From f9625c48ecca4aa850b50ecbbb540228ad59e92e Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Wed, 4 Jun 2014 16:05:32 -0700 Subject: MAINTAINERS: pass on hwpoison maintainership to Naoya Horiguchi Horiguchi-san has done most of the work on hwpoison in the last years and he also does most of the reviewing. So I'm passing on the hwpoison maintainership to him. Signed-off-by: Andi Kleen Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 586cd06..105ec9a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4011,9 +4011,8 @@ S: Odd Fixes F: drivers/media/usb/hdpvr/ HWPOISON MEMORY FAILURE HANDLING -M: Andi Kleen +M: Naoya Horiguchi L: linux-mm@kvack.org -T: git git://git.kernel.org/pub/scm/linux/kernel/git/ak/linux-mce-2.6.git hwpoison S: Maintained F: mm/memory-failure.c F: mm/hwpoison-inject.c -- cgit v0.10.2 From 7f39dda9d86fb4f4f17af0de170decf125726f8c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 4 Jun 2014 16:05:33 -0700 Subject: mm: fix sleeping function warning from __put_anon_vma Trinity reports BUG: sleeping function called from invalid context at kernel/locking/rwsem.c:47 in_atomic(): 0, irqs_disabled(): 0, pid: 5787, name: trinity-c27 __might_sleep < down_write < __put_anon_vma < page_get_anon_vma < migrate_pages < compact_zone < compact_zone_order < try_to_compact_pages .. 
Right, since conversion to mutex then rwsem, we should not put_anon_vma() from inside an rcu_read_lock()ed section: fix the two places that did so. And add might_sleep() to anon_vma_free(), as suggested by Peter Zijlstra. Fixes: 88c22088bf23 ("mm: optimize page_lock_anon_vma() fast-path") Reported-by: Dave Jones Signed-off-by: Hugh Dickins Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/rmap.c b/mm/rmap.c index 9c3e773..10aef96 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -103,6 +103,7 @@ static inline void anon_vma_free(struct anon_vma *anon_vma) * LOCK should suffice since the actual taking of the lock must * happen _before_ what follows. */ + might_sleep(); if (rwsem_is_locked(&anon_vma->root->rwsem)) { anon_vma_lock_write(anon_vma); anon_vma_unlock_write(anon_vma); @@ -426,8 +427,9 @@ struct anon_vma *page_get_anon_vma(struct page *page) * above cannot corrupt). */ if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; + return NULL; } out: rcu_read_unlock(); @@ -477,9 +479,9 @@ struct anon_vma *page_lock_anon_vma_read(struct page *page) } if (!page_mapped(page)) { + rcu_read_unlock(); put_anon_vma(anon_vma); - anon_vma = NULL; - goto out; + return NULL; } /* we pinned the anon_vma, its safe to sleep */ -- cgit v0.10.2 From c177c81e09e517bbf75b67762cdab1b83aba6976 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:05:35 -0700 Subject: hugetlb: restrict hugepage_migration_support() to x86_64 Currently hugepage migration is available for all archs which support pmd-level hugepage, but testing is done only for x86_64 and there're bugs for other archs. So to avoid breaking such archs, this patch limits the availability strictly to x86_64 until developers of other archs get interested in enabling this feature. 
Simply disabling hugepage migration on non-x86_64 archs is not enough to fix the reported problem where sys_move_pages() hits the BUG_ON() in follow_page(FOLL_GET), so let's fix this by checking if hugepage migration is supported in vma_migratable(). Signed-off-by: Naoya Horiguchi Reported-by: Michael Ellerman Tested-by: Michael Ellerman Acked-by: Hugh Dickins Cc: Benjamin Herrenschmidt Cc: Tony Luck Cc: Russell King Cc: Martin Schwidefsky Cc: James Hogan Cc: Ralf Baechle Cc: David Miller Cc: [3.12+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arm/mm/hugetlbpage.c b/arch/arm/mm/hugetlbpage.c index 54ee616..66781bf3 100644 --- a/arch/arm/mm/hugetlbpage.c +++ b/arch/arm/mm/hugetlbpage.c @@ -56,8 +56,3 @@ int pmd_huge(pmd_t pmd) { return pmd_val(pmd) && !(pmd_val(pmd) & PMD_TABLE_BIT); } - -int pmd_huge_support(void) -{ - return 1; -} diff --git a/arch/arm64/mm/hugetlbpage.c b/arch/arm64/mm/hugetlbpage.c index 31eb959..023747b 100644 --- a/arch/arm64/mm/hugetlbpage.c +++ b/arch/arm64/mm/hugetlbpage.c @@ -58,11 +58,6 @@ int pud_huge(pud_t pud) #endif } -int pmd_huge_support(void) -{ - return 1; -} - static __init int setup_hugepagesz(char *opt) { unsigned long ps = memparse(opt, &opt); diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 68232db..76069c1 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -114,11 +114,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 0; -} - struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/metag/mm/hugetlbpage.c b/arch/metag/mm/hugetlbpage.c index 0424315..3c52fa6 100644 --- a/arch/metag/mm/hugetlbpage.c +++ b/arch/metag/mm/hugetlbpage.c @@ -110,11 +110,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 1; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git 
a/arch/mips/mm/hugetlbpage.c b/arch/mips/mm/hugetlbpage.c index 77e0ae0..4ec8ee1 100644 --- a/arch/mips/mm/hugetlbpage.c +++ b/arch/mips/mm/hugetlbpage.c @@ -84,11 +84,6 @@ int pud_huge(pud_t pud) return (pud_val(pud) & _PAGE_HUGE) != 0; } -int pmd_huge_support(void) -{ - return 1; -} - struct page * follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index eb92365..7e70ae9 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -86,11 +86,6 @@ int pgd_huge(pgd_t pgd) */ return ((pgd_val(pgd) & 0x3) != 0x0); } - -int pmd_huge_support(void) -{ - return 1; -} #else int pmd_huge(pmd_t pmd) { @@ -106,11 +101,6 @@ int pgd_huge(pgd_t pgd) { return 0; } - -int pmd_huge_support(void) -{ - return 0; -} #endif pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 0727a55d..0ff66a7 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -220,11 +220,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 1; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmdp, int write) { diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 0d676a4..d776234 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -83,11 +83,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 0; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 9bd9ce8..d329537 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -231,11 +231,6 @@ int pud_huge(pud_t pud) return 0; } -int pmd_huge_support(void) -{ - return 0; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, 
int write) { diff --git a/arch/tile/mm/hugetlbpage.c b/arch/tile/mm/hugetlbpage.c index 0cb3bba..e514899 100644 --- a/arch/tile/mm/hugetlbpage.c +++ b/arch/tile/mm/hugetlbpage.c @@ -166,11 +166,6 @@ int pud_huge(pud_t pud) return !!(pud_val(pud) & _PAGE_HUGE_PAGE); } -int pmd_huge_support(void) -{ - return 1; -} - struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int write) { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7d5feb5..e41b258 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1873,6 +1873,10 @@ config ARCH_ENABLE_SPLIT_PMD_PTLOCK def_bool y depends on X86_64 || X86_PAE +config ARCH_ENABLE_HUGEPAGE_MIGRATION + def_bool y + depends on X86_64 && HUGETLB_PAGE && MIGRATION + menu "Power management and ACPI options" config ARCH_HIBERNATION_HEADER diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8c9f647..8b977eb 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -58,11 +58,6 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address, { return NULL; } - -int pmd_huge_support(void) -{ - return 0; -} #else struct page * @@ -80,11 +75,6 @@ int pud_huge(pud_t pud) { return !!(pud_val(pud) & _PAGE_PSE); } - -int pmd_huge_support(void) -{ - return 1; -} #endif #ifdef CONFIG_HUGETLB_PAGE diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index b65166d..d0bad1a 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -392,15 +392,13 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -int pmd_huge_support(void); -/* - * Currently hugepage migration is enabled only for pmd-based hugepage. - * This function will be updated when hugepage migration is more widely - * supported. 
- */ static inline int hugepage_migration_support(struct hstate *h) { - return pmd_huge_support() && (huge_page_shift(h) == PMD_SHIFT); +#ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION + return huge_page_shift(h) == PMD_SHIFT; +#else + return 0; +#endif } static inline spinlock_t *huge_pte_lockptr(struct hstate *h, @@ -450,7 +448,6 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) -#define pmd_huge_support() 0 #define hugepage_migration_support(h) 0 static inline spinlock_t *huge_pte_lockptr(struct hstate *h, diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3c1b968..f230a97 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -175,6 +175,12 @@ static inline int vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & (VM_IO | VM_PFNMAP)) return 0; + +#ifndef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION + if (vma->vm_flags & VM_HUGETLB) + return 0; +#endif + /* * Migration allocates pages in the highest zone. If we cannot * do so then migration (at least from node to node) is not diff --git a/mm/Kconfig b/mm/Kconfig index 28cec51..75ac479 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -267,6 +267,9 @@ config MIGRATION pages as migration can relocate pages to satisfy a huge page allocation instead of reclaiming. +config ARCH_ENABLE_HUGEPAGE_MIGRATION + boolean + config PHYS_ADDR_T_64BIT def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT -- cgit v0.10.2 From 8fe6929cfd43c44834858a53e129ffdc7c166298 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Wed, 4 Jun 2014 16:05:36 -0700 Subject: kthread: fix return value of kthread_create() upon SIGKILL. Commit 786235eeba0e ("kthread: make kthread_create() killable") meant for allowing kthread_create() to abort as soon as killed by the OOM-killer. But returning -ENOMEM is wrong if killed by SIGKILL from userspace. Change kthread_create() to return -EINTR upon SIGKILL. 
Signed-off-by: Tetsuo Handa Cc: Oleg Nesterov Acked-by: David Rientjes Cc: [3.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/kthread.c b/kernel/kthread.c index 9a130ec..c2390f4 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -262,7 +262,7 @@ static void create_kthread(struct kthread_create_info *create) * kthread_stop() has been called). The return value should be zero * or a negative error number; it will be passed to kthread_stop(). * - * Returns a task_struct or ERR_PTR(-ENOMEM). + * Returns a task_struct or ERR_PTR(-ENOMEM) or ERR_PTR(-EINTR). */ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), void *data, int node, @@ -298,7 +298,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), * that thread. */ if (xchg(&create->done, NULL)) - return ERR_PTR(-ENOMEM); + return ERR_PTR(-EINTR); /* * kthreadd (or new kernel thread) will call complete() * shortly. -- cgit v0.10.2 From 36dfd116edd48fa6174d5694c143f1d4bd81aba8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:05:38 -0700 Subject: fs/fscache: convert printk to pr_foo() All printk converted to pr_foo() except internal.h: printk(KERN_DEBUG Coalesce formats. 
Add pr_fmt Signed-off-by: Fabian Frederick Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c index f7cff36..56cce7f 100644 --- a/fs/fscache/cache.c +++ b/fs/fscache/cache.c @@ -280,15 +280,15 @@ int fscache_add_cache(struct fscache_cache *cache, spin_unlock(&fscache_fsdef_index.lock); up_write(&fscache_addremove_sem); - printk(KERN_NOTICE "FS-Cache: Cache \"%s\" added (type %s)\n", - cache->tag->name, cache->ops->name); + pr_notice("Cache \"%s\" added (type %s)\n", + cache->tag->name, cache->ops->name); kobject_uevent(cache->kobj, KOBJ_ADD); _leave(" = 0 [%s]", cache->identifier); return 0; tag_in_use: - printk(KERN_ERR "FS-Cache: Cache tag '%s' already in use\n", tagname); + pr_err("Cache tag '%s' already in use\n", tagname); __fscache_release_cache_tag(tag); _leave(" = -EXIST"); return -EEXIST; @@ -317,8 +317,7 @@ EXPORT_SYMBOL(fscache_add_cache); void fscache_io_error(struct fscache_cache *cache) { if (!test_and_set_bit(FSCACHE_IOERROR, &cache->flags)) - printk(KERN_ERR "FS-Cache:" - " Cache '%s' stopped due to I/O error\n", + pr_err("Cache '%s' stopped due to I/O error\n", cache->ops->name); } EXPORT_SYMBOL(fscache_io_error); @@ -369,8 +368,8 @@ void fscache_withdraw_cache(struct fscache_cache *cache) _enter(""); - printk(KERN_NOTICE "FS-Cache: Withdrawing cache \"%s\"\n", - cache->tag->name); + pr_notice("Withdrawing cache \"%s\"\n", + cache->tag->name); /* make the cache unavailable for cookie acquisition */ if (test_and_set_bit(FSCACHE_CACHE_WITHDRAWN, &cache->flags)) diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c index 29d7feb..aec01be 100644 --- a/fs/fscache/cookie.c +++ b/fs/fscache/cookie.c @@ -519,7 +519,7 @@ void __fscache_disable_cookie(struct fscache_cookie *cookie, bool invalidate) ASSERTCMP(atomic_read(&cookie->n_active), >, 0); if (atomic_read(&cookie->n_children) != 0) { - printk(KERN_ERR "FS-Cache: Cookie '%s' still has children\n", + pr_err("Cookie '%s' 
still has children\n", cookie->def->name); BUG(); } diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 4226f66..bc6c08f 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -22,6 +22,12 @@ * */ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) "FS-Cache: " fmt + #include #include @@ -413,8 +419,8 @@ do { \ #define ASSERT(X) \ do { \ if (unlikely(!(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -422,9 +428,9 @@ do { \ #define ASSERTCMP(X, OP, Y) \ do { \ if (unlikely(!((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ @@ -433,8 +439,8 @@ do { \ #define ASSERTIF(C, X) \ do { \ if (unlikely((C) && !(X))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ BUG(); \ } \ } while (0) @@ -442,9 +448,9 @@ do { \ #define ASSERTIFCMP(C, X, OP, Y) \ do { \ if (unlikely((C) && !((X) OP (Y)))) { \ - printk(KERN_ERR "\n"); \ - printk(KERN_ERR "FS-Cache: Assertion failed\n"); \ - printk(KERN_ERR "%lx " #OP " %lx is false\n", \ + pr_err("\n"); \ + pr_err("Assertion failed\n"); \ + pr_err("%lx " #OP " %lx is false\n", \ (unsigned long)(X), (unsigned long)(Y)); \ BUG(); \ } \ diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 7c27907..acd4bf1 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -146,8 +146,7 @@ static int __init fscache_init(void) 0, fscache_cookie_init_once); if (!fscache_cookie_jar) { - printk(KERN_NOTICE - "FS-Cache: Failed to allocate a cookie jar\n"); + pr_notice("Failed to allocate a cookie jar\n"); ret = -ENOMEM; goto error_cookie_jar; } @@ -156,7 +155,7 @@ static int 
__init fscache_init(void) if (!fscache_root) goto error_kobj; - printk(KERN_NOTICE "FS-Cache: Loaded\n"); + pr_notice("Loaded\n"); return 0; error_kobj: @@ -192,7 +191,7 @@ static void __exit fscache_exit(void) fscache_proc_cleanup(); destroy_workqueue(fscache_op_wq); destroy_workqueue(fscache_object_wq); - printk(KERN_NOTICE "FS-Cache: Unloaded\n"); + pr_notice("Unloaded\n"); } module_exit(fscache_exit); diff --git a/fs/fscache/netfs.c b/fs/fscache/netfs.c index 989f394..6d941f5 100644 --- a/fs/fscache/netfs.c +++ b/fs/fscache/netfs.c @@ -65,8 +65,7 @@ int __fscache_register_netfs(struct fscache_netfs *netfs) list_add(&netfs->link, &fscache_netfs_list); ret = 0; - printk(KERN_NOTICE "FS-Cache: Netfs '%s' registered for caching\n", - netfs->name); + pr_notice("Netfs '%s' registered for caching\n", netfs->name); already_registered: up_write(&fscache_addremove_sem); @@ -97,8 +96,8 @@ void __fscache_unregister_netfs(struct fscache_netfs *netfs) up_write(&fscache_addremove_sem); - printk(KERN_NOTICE "FS-Cache: Netfs '%s' unregistered from caching\n", - netfs->name); + pr_notice("Netfs '%s' unregistered from caching\n", + netfs->name); _leave(""); } diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index 318071a..e7b87a0 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -51,8 +51,7 @@ void fscache_enqueue_operation(struct fscache_operation *op) _debug("queue for caller's attention"); break; default: - printk(KERN_ERR "FS-Cache: Unexpected op type %lx", - op->flags); + pr_err("Unexpected op type %lx", op->flags); BUG(); break; } diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 7f5c658..ed70714 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -1108,10 +1108,8 @@ void fscache_mark_page_cached(struct fscache_retrieval *op, struct page *page) static bool once_only; if (!once_only) { once_only = true; - printk(KERN_WARNING "FS-Cache:" - " Cookie type %s marked page %lx" - " multiple times\n", - cookie->def->name, page->index); + 
pr_warn("Cookie type %s marked page %lx multiple times\n", + cookie->def->name, page->index); } } -- cgit v0.10.2 From 3185a88ce37490938d56119c474aa48616d386e8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:05:39 -0700 Subject: fs/fscache: replace seq_printf by seq_puts Replace seq_printf where possible + coalesce formats from 2 existing seq_puts Signed-off-by: Fabian Frederick Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fscache/histogram.c b/fs/fscache/histogram.c index bad4967..7d637e2 100644 --- a/fs/fscache/histogram.c +++ b/fs/fscache/histogram.c @@ -31,12 +31,10 @@ static int fscache_histogram_show(struct seq_file *m, void *v) switch ((unsigned long) v) { case 1: - seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS " - " RETRV DLY RETRIEVLS\n"); + seq_puts(m, "JIFS SECS OBJ INST OP RUNS OBJ RUNS RETRV DLY RETRIEVLS\n"); return 0; case 2: - seq_puts(m, "===== ===== ========= ========= =========" - " ========= =========\n"); + seq_puts(m, "===== ===== ========= ========= ========= ========= =========\n"); return 0; default: index = (unsigned long) v - 3; diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index b5ebc2d..b8179ca 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -285,20 +285,20 @@ static int fscache_objlist_show(struct seq_file *m, void *v) fscache_unuse_cookie(obj); if (keylen > 0 || auxlen > 0) { - seq_printf(m, " "); + seq_puts(m, " "); for (p = buf; keylen > 0; keylen--) seq_printf(m, "%02x", *p++); if (auxlen > 0) { if (config & FSCACHE_OBJLIST_CONFIG_KEY) - seq_printf(m, ", "); + seq_puts(m, ", "); for (; auxlen > 0; auxlen--) seq_printf(m, "%02x", *p++); } } - seq_printf(m, "\n"); + seq_puts(m, "\n"); } else { - seq_printf(m, "\n"); + seq_puts(m, "\n"); } return 0; } -- cgit v0.10.2 From 0a8dd2db579f7a0ac7033d6b857c3d5dbaa77563 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Wed, 4 Jun 2014 16:05:40 -0700 Subject: fanotify: 
FAN_MARK_FLUSH: avoid having to provide a fake/invalid fd and path Originally from Tvrtko Ursulin (https://lkml.org/lkml/2011/1/12/112) Avoid having to provide a fake/invalid fd and path when flushing marks Currently for a group to flush marks it has set it needs to provide a fake or invalid (but resolvable) file descriptor and path when calling fanotify_mark. This patch pulls the flush handling a bit up so file descriptor and path are completely ignored when flushing. I reworked the patch to be applicable again (the signature of fanotify_mark has changed since Tvrtko's work). Signed-off-by: Heinrich Schuchardt Cc: Tvrtko Ursulin Reviewed-by: Jan Kara Acked-by: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 732648b..f6ac6a2 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -813,6 +813,15 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, group->priority == FS_PRIO_0) goto fput_and_out; + if (flags & FAN_MARK_FLUSH) { + ret = 0; + if (flags & FAN_MARK_MOUNT) + fsnotify_clear_vfsmount_marks_by_group(group); + else + fsnotify_clear_inode_marks_by_group(group); + goto fput_and_out; + } + ret = fanotify_find_path(dfd, pathname, &path, flags); if (ret) goto fput_and_out; @@ -824,7 +833,7 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, mnt = path.mnt; /* create/update an inode mark */ - switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE | FAN_MARK_FLUSH)) { + switch (flags & (FAN_MARK_ADD | FAN_MARK_REMOVE)) { case FAN_MARK_ADD: if (flags & FAN_MARK_MOUNT) ret = fanotify_add_vfsmount_mark(group, mnt, mask, flags); @@ -837,12 +846,6 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, else ret = fanotify_remove_inode_mark(group, inode, mask, flags); break; - case FAN_MARK_FLUSH: - if (flags & FAN_MARK_MOUNT) - fsnotify_clear_vfsmount_marks_by_group(group); 
- else - fsnotify_clear_inode_marks_by_group(group); - break; default: ret = -EINVAL; } -- cgit v0.10.2 From d4c7cf6cffb1bc711a833b5e304ba5bcfe76398b Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Wed, 4 Jun 2014 16:05:41 -0700 Subject: fanotify: create FAN_ACCESS event for readdir Before the patch, read creates FAN_ACCESS_PERM and FAN_ACCESS events, readdir creates only FAN_ACCESS_PERM events. This is inconsistent. After the patch, readdir creates FAN_ACCESS_PERM and FAN_ACCESS events. Signed-off-by: Heinrich Schuchardt Reviewed-by: Jan Kara Cc: Eric Paris Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/readdir.c b/fs/readdir.c index 5b53d99..33fd922 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -40,6 +41,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) ctx->pos = file->f_pos; res = file->f_op->iterate(file, ctx); file->f_pos = ctx->pos; + fsnotify_access(file); file_accessed(file); } mutex_unlock(&inode->i_mutex); -- cgit v0.10.2 From efa8f7e5d7bc2a6f1d1f9b43f9514d02f00b9cb1 Mon Sep 17 00:00:00 2001 From: David Cohen Date: Wed, 4 Jun 2014 16:05:42 -0700 Subject: fs/notify/mark.c: trivial cleanup Do not initialize private_destroy_list twice. list_replace_init() already takes care of initializing private_destroy_list. We don't need to initialize it with LIST_HEAD() beforehand. 
Signed-off-by: David Cohen Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 923fe4a..d90deaa 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -340,7 +340,7 @@ void fsnotify_init_mark(struct fsnotify_mark *mark, static int fsnotify_mark_destroy(void *ignored) { struct fsnotify_mark *mark, *next; - LIST_HEAD(private_destroy_list); + struct list_head private_destroy_list; for (;;) { spin_lock(&destroy_lock); -- cgit v0.10.2 From cc299a98eb13a9853675a9cbb90b30b4011e1406 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Wed, 4 Jun 2014 16:05:43 -0700 Subject: fs/notify/fanotify/fanotify_user.c: fix FAN_MARK_FLUSH flag checking If fanotify_mark is called with illegal value of arguments flags and marks it usually returns EINVAL. When fanotify_mark is called with FAN_MARK_FLUSH the argument flags is not checked for irrelevant flags like FAN_MARK_IGNORED_MASK. The patch removes this inconsistency. If an irrelevant flag is set error EINVAL is returned. Signed-off-by: Heinrich Schuchardt Acked-by: Michael Kerrisk Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index f6ac6a2..9163a6e 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -776,7 +776,10 @@ SYSCALL_DEFINE5(fanotify_mark, int, fanotify_fd, unsigned int, flags, case FAN_MARK_REMOVE: if (!mask) return -EINVAL; + break; case FAN_MARK_FLUSH: + if (flags & ~(FAN_MARK_MOUNT | FAN_MARK_FLUSH)) + return -EINVAL; break; default: return -EINVAL; -- cgit v0.10.2 From 48149e9d3a7e924010a0daab30a6197b7d7b6580 Mon Sep 17 00:00:00 2001 From: Heinrich Schuchardt Date: Wed, 4 Jun 2014 16:05:44 -0700 Subject: fanotify: check file flags passed in fanotify_init Without this patch fanotify_init does not validate the value passed in event_f_flags. 
When a fanotify event is read from the fanotify file descriptor a new file descriptor is created where file.f_flags = event_f_flags. Internal and external open flags are stored together in field f_flags of struct file. Hence, an application might create file descriptors with internal flags like FMODE_EXEC, FMODE_NOCMTIME set. Jan Kara and Eric Paris both agreed that this is a bug and the value of event_f_flags should be checked: https://lkml.org/lkml/2014/4/29/522 https://lkml.org/lkml/2014/4/29/539 This updated patch version considers the comments by Michael Kerrisk in https://lkml.org/lkml/2014/5/4/10 With the patch the value of event_f_flags is checked. When specifying an invalid value error EINVAL is returned. Internal flags are disallowed. File creation flags are disallowed: O_CREAT, O_DIRECTORY, O_EXCL, O_NOCTTY, O_NOFOLLOW, O_TRUNC, and O_TTY_INIT. Flags which do not make sense with fanotify are disallowed: __O_TMPFILE, O_PATH, FASYNC, and O_DIRECT. This leaves us with the following allowed values: O_RDONLY, O_WRONLY, O_RDWR are basic functionality. They are stored in the bits given by O_ACCMODE. O_APPEND is working as expected. The value might be useful in a logging application which appends the current status each time the log is opened. O_LARGEFILE is needed for files exceeding 4GB on 32bit systems. O_NONBLOCK may be useful when monitoring slow devices like tapes. O_NDELAY is equal to O_NONBLOCK except for platform parisc. To avoid code breaking on parisc either both flags should be allowed or none. The patch allows both. __O_SYNC and O_DSYNC may be used to avoid data loss on power disruption. O_NOATIME may be useful to reduce disk activity. O_CLOEXEC may be useful, if separate processes shall be used to scan files. Once this patch is accepted, the fanotify_init.2 manpage has to be updated. 
Signed-off-by: Heinrich Schuchardt Reviewed-by: Jan Kara Cc: Michael Kerrisk Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 9163a6e..3fdc8a3 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -25,6 +25,19 @@ #define FANOTIFY_DEFAULT_MAX_MARKS 8192 #define FANOTIFY_DEFAULT_MAX_LISTENERS 128 +/* + * All flags that may be specified in parameter event_f_flags of fanotify_init. + * + * Internal and external open flags are stored together in field f_flags of + * struct file. Only external open flags shall be allowed in event_f_flags. + * Internal flags like FMODE_NONOTIFY, FMODE_EXEC, FMODE_NOCMTIME shall be + * excluded. + */ +#define FANOTIFY_INIT_ALL_EVENT_F_BITS ( \ + O_ACCMODE | O_APPEND | O_NONBLOCK | \ + __O_SYNC | O_DSYNC | O_CLOEXEC | \ + O_LARGEFILE | O_NOATIME ) + extern const struct fsnotify_ops fanotify_fsnotify_ops; static struct kmem_cache *fanotify_mark_cache __read_mostly; @@ -669,6 +682,18 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if (flags & ~FAN_ALL_INIT_FLAGS) return -EINVAL; + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) + return -EINVAL; + + switch (event_f_flags & O_ACCMODE) { + case O_RDONLY: + case O_RDWR: + case O_WRONLY: + break; + default: + return -EINVAL; + } + user = get_current_user(); if (atomic_read(&user->fanotify_listeners) > FANOTIFY_DEFAULT_MAX_LISTENERS) { free_uid(user); -- cgit v0.10.2 From 504e0e2f3dcac242eb529a01a4b0ea45e4a34eb7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:05:50 -0700 Subject: ntfs: remove NULL value assignments Static values are automatically initialized to NULL. 
Signed-off-by: Fabian Frederick Acked-by: Anton Altaparmakov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ntfs/compress.c b/fs/ntfs/compress.c index ee4144c..f82498c 100644 --- a/fs/ntfs/compress.c +++ b/fs/ntfs/compress.c @@ -58,7 +58,7 @@ typedef enum { /** * ntfs_compression_buffer - one buffer for the decompression engine */ -static u8 *ntfs_compression_buffer = NULL; +static u8 *ntfs_compression_buffer; /** * ntfs_cb_lock - spinlock which protects ntfs_compression_buffer diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 9de2491..6c3296e 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -50,8 +50,8 @@ static unsigned long ntfs_nr_compression_users; /* A global default upcase table and a corresponding reference count. */ -static ntfschar *default_upcase = NULL; -static unsigned long ntfs_nr_upcase_users = 0; +static ntfschar *default_upcase; +static unsigned long ntfs_nr_upcase_users; /* Error constants/strings used in inode.c::ntfs_show_options(). */ typedef enum { diff --git a/fs/ntfs/sysctl.c b/fs/ntfs/sysctl.c index 79a8918..1927170 100644 --- a/fs/ntfs/sysctl.c +++ b/fs/ntfs/sysctl.c @@ -56,7 +56,7 @@ static ctl_table sysctls_root[] = { }; /* Storage for the sysctls header. */ -static struct ctl_table_header *sysctls_root_table = NULL; +static struct ctl_table_header *sysctls_root_table; /** * ntfs_sysctl - add or remove the debug sysctl -- cgit v0.10.2 From c473b2c6f6c6d012da98416b5de28cc48c4306c9 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 4 Jun 2014 16:05:51 -0700 Subject: sh: Replace __get_cpu_var uses __get_cpu_var() is used for multiple purposes in the kernel source. One of them is address calculation via the form &__get_cpu_var(x). This calculates the address for the instance of the percpu variable of the current processor based on an offset. Other use cases are for storing and retrieving data from the current processors percpu area. 
__get_cpu_var() can be used as an lvalue when writing data or on the right side of an assignment. __get_cpu_var() is defined as: #define __get_cpu_var(var) (*this_cpu_ptr(&(var))) __get_cpu_var() always only does an address determination. However, store and retrieve operations could use a segment prefix (or global register on other platforms) to avoid the address calculation. this_cpu_write() and this_cpu_read() can directly take an offset into a percpu area and use optimized assembly code to read and write per cpu variables. This patch converts __get_cpu_var into either an explicit address calculation using this_cpu_ptr() or into a use of this_cpu operations that use the offset. Thereby address calculations are avoided and fewer registers are used when code is generated. At the end of the patch set all uses of __get_cpu_var have been removed so the macro is removed too. The patch set includes passes over all arches as well. Once these operations are used throughout then specialized macros can be defined in non-x86 arches as well in order to optimize per cpu access by e.g. using a global register that may be set to the per cpu base. Transformations done to __get_cpu_var() 1. Determine the address of the percpu instance of the current processor. DEFINE_PER_CPU(int, y); int *x = &__get_cpu_var(y); Converts to int *x = this_cpu_ptr(&y); 2. Same as #1 but this time an array structure is involved. DEFINE_PER_CPU(int, y[20]); int *x = __get_cpu_var(y); Converts to int *x = this_cpu_ptr(y); 3. Retrieve the content of the current processor's instance of a per cpu variable. DEFINE_PER_CPU(int, y); int x = __get_cpu_var(y) Converts to int x = __this_cpu_read(y); 4. Retrieve the content of a percpu struct DEFINE_PER_CPU(struct mystruct, y); struct mystruct x = __get_cpu_var(y); Converts to memcpy(&x, this_cpu_ptr(&y), sizeof(x)); 5. Assignment to a per cpu variable DEFINE_PER_CPU(int, y) __get_cpu_var(y) = x; Converts to __this_cpu_write(y, x); 6. 
Increment/Decrement etc of a per cpu variable DEFINE_PER_CPU(int, y); __get_cpu_var(y)++ Converts to __this_cpu_inc(y) Signed-off-by: Christoph Lameter Tested-by: Geert Uytterhoeven [compilation only] Cc: Paul Mundt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/sh/kernel/hw_breakpoint.c b/arch/sh/kernel/hw_breakpoint.c index f9173766..2197fc5 100644 --- a/arch/sh/kernel/hw_breakpoint.c +++ b/arch/sh/kernel/hw_breakpoint.c @@ -52,7 +52,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp) int i; for (i = 0; i < sh_ubc->num_events; i++) { - struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); if (!*slot) { *slot = bp; @@ -84,7 +84,7 @@ void arch_uninstall_hw_breakpoint(struct perf_event *bp) int i; for (i = 0; i < sh_ubc->num_events; i++) { - struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]); + struct perf_event **slot = this_cpu_ptr(&bp_per_reg[i]); if (*slot == bp) { *slot = NULL; diff --git a/arch/sh/kernel/kprobes.c b/arch/sh/kernel/kprobes.c index 42b46e6..83acbf3 100644 --- a/arch/sh/kernel/kprobes.c +++ b/arch/sh/kernel/kprobes.c @@ -102,7 +102,7 @@ int __kprobes kprobe_handle_illslot(unsigned long pc) void __kprobes arch_remove_kprobe(struct kprobe *p) { - struct kprobe *saved = &__get_cpu_var(saved_next_opcode); + struct kprobe *saved = this_cpu_ptr(&saved_next_opcode); if (saved->addr) { arch_disarm_kprobe(p); @@ -111,7 +111,7 @@ void __kprobes arch_remove_kprobe(struct kprobe *p) saved->addr = NULL; saved->opcode = 0; - saved = &__get_cpu_var(saved_next_opcode2); + saved = this_cpu_ptr(&saved_next_opcode2); if (saved->addr) { arch_disarm_kprobe(saved); @@ -129,14 +129,14 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb) static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) { - __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; + __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp); kcb->kprobe_status = 
kcb->prev_kprobe.status; } static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb) { - __get_cpu_var(current_kprobe) = p; + __this_cpu_write(current_kprobe, p); } /* @@ -146,15 +146,15 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, */ static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs) { - __get_cpu_var(saved_current_opcode).addr = (kprobe_opcode_t *)regs->pc; + __this_cpu_write(saved_current_opcode.addr, (kprobe_opcode_t *)regs->pc); if (p != NULL) { struct kprobe *op1, *op2; arch_disarm_kprobe(p); - op1 = &__get_cpu_var(saved_next_opcode); - op2 = &__get_cpu_var(saved_next_opcode2); + op1 = this_cpu_ptr(&saved_next_opcode); + op2 = this_cpu_ptr(&saved_next_opcode2); if (OPCODE_JSR(p->opcode) || OPCODE_JMP(p->opcode)) { unsigned int reg_nr = ((p->opcode >> 8) & 0x000F); @@ -249,7 +249,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) kcb->kprobe_status = KPROBE_REENTER; return 1; } else { - p = __get_cpu_var(current_kprobe); + p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, regs)) { goto ss_probe; } @@ -336,9 +336,9 @@ int __kprobes trampoline_probe_handler(struct kprobe *p, struct pt_regs *regs) continue; if (ri->rp && ri->rp->handler) { - __get_cpu_var(current_kprobe) = &ri->rp->kp; + __this_cpu_write(current_kprobe, &ri->rp->kp); ri->rp->handler(ri, regs); - __get_cpu_var(current_kprobe) = NULL; + __this_cpu_write(current_kprobe, NULL); } orig_ret_address = (unsigned long)ri->ret_addr; @@ -383,19 +383,19 @@ static int __kprobes post_kprobe_handler(struct pt_regs *regs) cur->post_handler(cur, regs, 0); } - p = &__get_cpu_var(saved_next_opcode); + p = this_cpu_ptr(&saved_next_opcode); if (p->addr) { arch_disarm_kprobe(p); p->addr = NULL; p->opcode = 0; - addr = __get_cpu_var(saved_current_opcode).addr; - __get_cpu_var(saved_current_opcode).addr = NULL; + addr = 
__this_cpu_read(saved_current_opcode.addr); + __this_cpu_write(saved_current_opcode.addr, NULL); p = get_kprobe(addr); arch_arm_kprobe(p); - p = &__get_cpu_var(saved_next_opcode2); + p = this_cpu_ptr(&saved_next_opcode2); if (p->addr) { arch_disarm_kprobe(p); p->addr = NULL; @@ -511,7 +511,7 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self, if (kprobe_handler(args->regs)) { ret = NOTIFY_STOP; } else { - p = __get_cpu_var(current_kprobe); + p = __this_cpu_read(current_kprobe); if (p->break_handler && p->break_handler(p, args->regs)) ret = NOTIFY_STOP; diff --git a/arch/sh/kernel/localtimer.c b/arch/sh/kernel/localtimer.c index 8bfc6df..b880a7e 100644 --- a/arch/sh/kernel/localtimer.c +++ b/arch/sh/kernel/localtimer.c @@ -32,7 +32,7 @@ static DEFINE_PER_CPU(struct clock_event_device, local_clockevent); */ void local_timer_interrupt(void) { - struct clock_event_device *clk = &__get_cpu_var(local_clockevent); + struct clock_event_device *clk = this_cpu_ptr(&local_clockevent); irq_enter(); clk->event_handler(clk); diff --git a/arch/sh/kernel/perf_event.c b/arch/sh/kernel/perf_event.c index b9cefeb..0233167 100644 --- a/arch/sh/kernel/perf_event.c +++ b/arch/sh/kernel/perf_event.c @@ -227,7 +227,7 @@ again: static void sh_pmu_stop(struct perf_event *event, int flags) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; @@ -245,7 +245,7 @@ static void sh_pmu_stop(struct perf_event *event, int flags) static void sh_pmu_start(struct perf_event *event, int flags) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; @@ -262,7 +262,7 @@ static void sh_pmu_start(struct perf_event *event, int flags) static void sh_pmu_del(struct perf_event *event, int flags) { - struct cpu_hw_events *cpuc = 
&__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); sh_pmu_stop(event, PERF_EF_UPDATE); __clear_bit(event->hw.idx, cpuc->used_mask); @@ -272,7 +272,7 @@ static void sh_pmu_del(struct perf_event *event, int flags) static int sh_pmu_add(struct perf_event *event, int flags) { - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; int ret = -EAGAIN; diff --git a/arch/sh/kernel/smp.c b/arch/sh/kernel/smp.c index 86a7936..fc5acfc 100644 --- a/arch/sh/kernel/smp.c +++ b/arch/sh/kernel/smp.c @@ -111,7 +111,7 @@ void play_dead_common(void) irq_ctx_exit(raw_smp_processor_id()); mb(); - __get_cpu_var(cpu_state) = CPU_DEAD; + __this_cpu_write(cpu_state, CPU_DEAD); local_irq_disable(); } -- cgit v0.10.2 From 220108361f7cef9bc3ac0b4c84cb556d36ec2a6f Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:05:52 -0700 Subject: fs/squashfs/squashfs.h: replace pr_warning by pr_warn Update the last pr_warning callsite in fs branch Signed-off-by: Fabian Frederick Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 9e1bb79..887d6d2 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -25,7 +25,7 @@ #define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) -#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) +#define WARNING(s, args...) pr_warn("SQUASHFS: "s, ## args) /* block.c */ extern int squashfs_read_data(struct super_block *, u64, int, u64 *, -- cgit v0.10.2 From acc8a1c00585c5cd62fcafd9309ef40ac35e8bfa Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:05:54 -0700 Subject: arch/unicore32/mm/ioremap.c: convert printk/warn_on to warn() Coalesce formats. 
[akpm@linux-foundation.org: undo crazy long line] Signed-off-by: Fabian Frederick Cc: Guan Xuetao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c index 13068ee..a0840fa 100644 --- a/arch/unicore32/mm/ioremap.c +++ b/arch/unicore32/mm/ioremap.c @@ -143,13 +143,11 @@ void __iomem *__uc32_ioremap_pfn_caller(unsigned long pfn, /* * Don't allow RAM to be mapped */ - if (pfn_valid(pfn)) { - printk(KERN_WARNING "BUG: Your driver calls ioremap() on\n" + if (pfn_valid(pfn)) + WARN(1, "BUG: Your driver calls ioremap() on\n" "system memory. This leads to architecturally\n" "unpredictable behaviour, and ioremap() will fail in\n" "the next kernel release. Please fix your driver.\n"); - WARN_ON(1); - } type = get_mem_type(mtype); if (!type) -- cgit v0.10.2 From 2accff4ef5c5831a9cc6319394a9f61cc9de8534 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 16:05:54 -0700 Subject: arch/unicore32/mm/ioremap.c: return NULL on invalid pfn __uc32_ioremap_pfn_caller() should return NULL when the pfn is found to be invalid. From a recommendation by Guan Xuetao. Cc: Guan Xuetao Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/unicore32/mm/ioremap.c b/arch/unicore32/mm/ioremap.c index a0840fa..bf012b2 100644 --- a/arch/unicore32/mm/ioremap.c +++ b/arch/unicore32/mm/ioremap.c @@ -143,11 +143,13 @@ void __iomem *__uc32_ioremap_pfn_caller(unsigned long pfn, /* * Don't allow RAM to be mapped */ - if (pfn_valid(pfn)) + if (pfn_valid(pfn)) { WARN(1, "BUG: Your driver calls ioremap() on\n" "system memory. This leads to architecturally\n" "unpredictable behaviour, and ioremap() will fail in\n" "the next kernel release. 
Please fix your driver.\n"); + return NULL; + } type = get_mem_type(mtype); if (!type) -- cgit v0.10.2 From f6b1fe7c27800adba0ccf6063ee97478046eeafe Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:05:57 -0700 Subject: fs/configs/item.c: kernel-doc fixes + clean-up Fix function parameter documentation EXPORT_SYMBOLS moved after corresponding functions Small coding style and checkpatch warning fixes Signed-off-by: Fabian Frederick Acked-by: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 50cee7f..e65f9ff 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -19,7 +19,7 @@ * Boston, MA 021110-1307, USA. * * Based on kobject: - * kobject is Copyright (c) 2002-2003 Patrick Mochel + * kobject is Copyright (c) 2002-2003 Patrick Mochel * * configfs Copyright (C) 2005 Oracle. All rights reserved. * @@ -35,9 +35,9 @@ #include -static inline struct config_item * to_item(struct list_head * entry) +static inline struct config_item *to_item(struct list_head *entry) { - return container_of(entry,struct config_item,ci_entry); + return container_of(entry, struct config_item, ci_entry); } /* Evil kernel */ @@ -47,34 +47,35 @@ static void config_item_release(struct kref *kref); * config_item_init - initialize item. * @item: item in question. */ -void config_item_init(struct config_item * item) +void config_item_init(struct config_item *item) { kref_init(&item->ci_kref); INIT_LIST_HEAD(&item->ci_entry); } +EXPORT_SYMBOL(config_item_init); /** * config_item_set_name - Set the name of an item * @item: item. - * @name: name. + * @fmt: The vsnprintf()'s format string. * * If strlen(name) >= CONFIGFS_ITEM_NAME_LEN, then use a * dynamically allocated string that @item->ci_name points to. * Otherwise, use the static @item->ci_namebuf array. */ -int config_item_set_name(struct config_item * item, const char * fmt, ...) 
+int config_item_set_name(struct config_item *item, const char *fmt, ...) { int error = 0; int limit = CONFIGFS_ITEM_NAME_LEN; int need; va_list args; - char * name; + char *name; /* * First, try the static array */ - va_start(args,fmt); - need = vsnprintf(item->ci_namebuf,limit,fmt,args); + va_start(args, fmt); + need = vsnprintf(item->ci_namebuf, limit, fmt, args); va_end(args); if (need < limit) name = item->ci_namebuf; @@ -83,13 +84,13 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...) * Need more space? Allocate it and try again */ limit = need + 1; - name = kmalloc(limit,GFP_KERNEL); + name = kmalloc(limit, GFP_KERNEL); if (!name) { error = -ENOMEM; goto Done; } - va_start(args,fmt); - need = vsnprintf(name,limit,fmt,args); + va_start(args, fmt); + need = vsnprintf(name, limit, fmt, args); va_end(args); /* Still? Give up. */ @@ -109,7 +110,6 @@ int config_item_set_name(struct config_item * item, const char * fmt, ...) Done: return error; } - EXPORT_SYMBOL(config_item_set_name); void config_item_init_type_name(struct config_item *item, @@ -131,20 +131,21 @@ void config_group_init_type_name(struct config_group *group, const char *name, } EXPORT_SYMBOL(config_group_init_type_name); -struct config_item * config_item_get(struct config_item * item) +struct config_item *config_item_get(struct config_item *item) { if (item) kref_get(&item->ci_kref); return item; } +EXPORT_SYMBOL(config_item_get); -static void config_item_cleanup(struct config_item * item) +static void config_item_cleanup(struct config_item *item) { - struct config_item_type * t = item->ci_type; - struct config_group * s = item->ci_group; - struct config_item * parent = item->ci_parent; + struct config_item_type *t = item->ci_type; + struct config_group *s = item->ci_group; + struct config_item *parent = item->ci_parent; - pr_debug("config_item %s: cleaning up\n",config_item_name(item)); + pr_debug("config_item %s: cleaning up\n", config_item_name(item)); if (item->ci_name 
!= item->ci_namebuf) kfree(item->ci_name); item->ci_name = NULL; @@ -167,21 +168,23 @@ static void config_item_release(struct kref *kref) * * Decrement the refcount, and if 0, call config_item_cleanup(). */ -void config_item_put(struct config_item * item) +void config_item_put(struct config_item *item) { if (item) kref_put(&item->ci_kref, config_item_release); } +EXPORT_SYMBOL(config_item_put); /** * config_group_init - initialize a group for use - * @k: group + * @group: config_group */ void config_group_init(struct config_group *group) { config_item_init(&group->cg_item); INIT_LIST_HEAD(&group->cg_children); } +EXPORT_SYMBOL(config_group_init); /** * config_group_find_item - search for item in group. @@ -195,11 +198,11 @@ void config_group_init(struct config_group *group) struct config_item *config_group_find_item(struct config_group *group, const char *name) { - struct list_head * entry; - struct config_item * ret = NULL; + struct list_head *entry; + struct config_item *ret = NULL; - list_for_each(entry,&group->cg_children) { - struct config_item * item = to_item(entry); + list_for_each(entry, &group->cg_children) { + struct config_item *item = to_item(entry); if (config_item_name(item) && !strcmp(config_item_name(item), name)) { ret = config_item_get(item); @@ -208,9 +211,4 @@ struct config_item *config_group_find_item(struct config_group *group, } return ret; } - -EXPORT_SYMBOL(config_item_init); -EXPORT_SYMBOL(config_group_init); -EXPORT_SYMBOL(config_item_get); -EXPORT_SYMBOL(config_item_put); EXPORT_SYMBOL(config_group_find_item); -- cgit v0.10.2 From c6686931335757c2b348fc9a74ff564d8bd10f0a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:05:58 -0700 Subject: fs/configfs: convert printk to pr_foo() Signed-off-by: Fabian Frederick Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index e081acb..727d06e 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ 
-940,9 +940,9 @@ static void client_drop_item(struct config_item *parent_item, #ifdef DEBUG static void configfs_dump_one(struct configfs_dirent *sd, int level) { - printk(KERN_INFO "%*s\"%s\":\n", level, " ", configfs_get_name(sd)); + pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd)); -#define type_print(_type) if (sd->s_type & _type) printk(KERN_INFO "%*s %s\n", level, " ", #_type); +#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type); type_print(CONFIGFS_ROOT); type_print(CONFIGFS_DIR); type_print(CONFIGFS_ITEM_ATTR); @@ -1699,7 +1699,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) struct dentry *root = dentry->d_sb->s_root; if (dentry->d_parent != root) { - printk(KERN_ERR "configfs: Tried to unregister non-subsystem!\n"); + pr_err("configfs: Tried to unregister non-subsystem!\n"); return; } @@ -1709,7 +1709,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { - printk(KERN_ERR "configfs: Tried to unregister non-empty subsystem!\n"); + pr_err("configfs: Tried to unregister non-empty subsystem!\n"); } spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index a9d35b0..fbb30db 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -168,9 +168,8 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, * In practice the maximum level of locking depth is * already reached. Just inform about possible reasons. 
*/ - printk(KERN_INFO "configfs: Too many levels of inodes" - " for the locking correctness validator.\n"); - printk(KERN_INFO "Spurious warnings may appear.\n"); + pr_info("configfs: Too many levels of inodes for the locking correctness validator.\n"); + pr_info("Spurious warnings may appear.\n"); } } } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 7f26c3c..af08de0 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -155,7 +155,7 @@ static int __init configfs_init(void) return 0; out4: - printk(KERN_ERR "configfs: Unable to register filesystem!\n"); + pr_err("configfs: Unable to register filesystem!\n"); configfs_inode_exit(); out3: kobject_put(config_kobj); -- cgit v0.10.2 From 1d88aa441f25811a55c0ed688add108c0a45613e Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:05:59 -0700 Subject: fs/configfs: use pr_fmt Add pr_fmt based on module name. Signed-off-by: Fabian Frederick Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index b5f0a3b..bd4a3c1 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -24,6 +24,12 @@ * configfs Copyright (C) 2005 Oracle. All rights reserved. 
*/ +#ifdef pr_fmt +#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 727d06e..668dcab 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1699,7 +1699,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) struct dentry *root = dentry->d_sb->s_root; if (dentry->d_parent != root) { - pr_err("configfs: Tried to unregister non-subsystem!\n"); + pr_err("Tried to unregister non-subsystem!\n"); return; } @@ -1709,7 +1709,7 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { - pr_err("configfs: Tried to unregister non-empty subsystem!\n"); + pr_err("Tried to unregister non-empty subsystem!\n"); } spin_unlock(&configfs_dirent_lock); mutex_unlock(&configfs_symlink_mutex); diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index fbb30db..5946ad9 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -168,7 +168,7 @@ static void configfs_set_inode_lock_class(struct configfs_dirent *sd, * In practice the maximum level of locking depth is * already reached. Just inform about possible reasons. */ - pr_info("configfs: Too many levels of inodes for the locking correctness validator.\n"); + pr_info("Too many levels of inodes for the locking correctness validator.\n"); pr_info("Spurious warnings may appear.\n"); } } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index af08de0..f6c2858 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -85,7 +85,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); } else { - pr_debug("configfs: could not get root inode\n"); + pr_debug("could not get root inode\n"); return -ENOMEM; } @@ -155,7 +155,7 @@ static int __init configfs_init(void) return 0; out4: - pr_err("configfs: Unable to register filesystem!\n"); + pr_err("Unable to register filesystem!\n"); configfs_inode_exit(); out3: kobject_put(config_kobj); -- cgit v0.10.2 From d7ffef289dd7332a7153e4957db78622b34d2680 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:00 -0700 Subject: lib/debugobjects.c: convert printk to pr_foo() Convert all printk to pr_foo() except KERN_DEBUG (see Documentation/CodingStyle Chapter 13) Signed-off-by: Fabian Frederick Reviewed-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/debugobjects.c b/lib/debugobjects.c index e0731c3..ea4c737 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -218,7 +218,7 @@ static void debug_objects_oom(void) unsigned long flags; int i; - printk(KERN_WARNING "ODEBUG: Out of memory. ODEBUG disabled\n"); + pr_warn("ODEBUG: Out of memory. 
ODEBUG disabled\n"); for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { raw_spin_lock_irqsave(&db->lock, flags); @@ -292,11 +292,9 @@ static void debug_object_is_on_stack(void *addr, int onstack) limit++; if (is_on_stack) - printk(KERN_WARNING - "ODEBUG: object is on stack, but not annotated\n"); + pr_warn("ODEBUG: object is on stack, but not annotated\n"); else - printk(KERN_WARNING - "ODEBUG: object is not on stack, but annotated\n"); + pr_warn("ODEBUG: object is not on stack, but annotated\n"); WARN_ON(1); } @@ -985,7 +983,7 @@ static void __init debug_objects_selftest(void) if (check_results(&obj, ODEBUG_STATE_NONE, ++fixups, ++warnings)) goto out; #endif - printk(KERN_INFO "ODEBUG: selftest passed\n"); + pr_info("ODEBUG: selftest passed\n"); out: debug_objects_fixups = oldfixups; @@ -1090,7 +1088,7 @@ void __init debug_objects_mem_init(void) debug_objects_enabled = 0; if (obj_cache) kmem_cache_destroy(obj_cache); - printk(KERN_WARNING "ODEBUG: out of memory.\n"); + pr_warn("ODEBUG: out of memory.\n"); } else debug_objects_selftest(); } -- cgit v0.10.2 From 719e484396e2793f40829b98a22d55c2fcdbe74b Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:04 -0700 Subject: lib/debugobjects.c: add pr_fmt to logging Add ODEBUG: prefix to pr_fmt Signed-off-by: Fabian Frederick Reviewed-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/debugobjects.c b/lib/debugobjects.c index ea4c737..b628247 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -7,6 +7,9 @@ * * For licencing details see kernel-base/COPYING */ + +#define pr_fmt(fmt) "ODEBUG: " fmt + #include #include #include @@ -218,7 +221,7 @@ static void debug_objects_oom(void) unsigned long flags; int i; - pr_warn("ODEBUG: Out of memory. ODEBUG disabled\n"); + pr_warn("Out of memory. 
ODEBUG disabled\n"); for (i = 0; i < ODEBUG_HASH_SIZE; i++, db++) { raw_spin_lock_irqsave(&db->lock, flags); @@ -292,9 +295,9 @@ static void debug_object_is_on_stack(void *addr, int onstack) limit++; if (is_on_stack) - pr_warn("ODEBUG: object is on stack, but not annotated\n"); + pr_warn("object is on stack, but not annotated\n"); else - pr_warn("ODEBUG: object is not on stack, but annotated\n"); + pr_warn("object is not on stack, but annotated\n"); WARN_ON(1); } @@ -983,7 +986,7 @@ static void __init debug_objects_selftest(void) if (check_results(&obj, ODEBUG_STATE_NONE, ++fixups, ++warnings)) goto out; #endif - pr_info("ODEBUG: selftest passed\n"); + pr_info("selftest passed\n"); out: debug_objects_fixups = oldfixups; @@ -1088,7 +1091,7 @@ void __init debug_objects_mem_init(void) debug_objects_enabled = 0; if (obj_cache) kmem_cache_destroy(obj_cache); - pr_warn("ODEBUG: out of memory.\n"); + pr_warn("out of memory.\n"); } else debug_objects_selftest(); } -- cgit v0.10.2 From c0f35cc0be0e1d06b89d5867a6db09eda5033189 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:05 -0700 Subject: lib/debugobjects.c: convert printk(KERN_DEBUG to pr_debug Direct conversion of one KERN_DEBUG message without DEBUG definition (suggested by Josh Triplett) That message will now be disabled by default. 
(see Documentation/CodingStyle Chapter 13) Signed-off-by: Fabian Frederick Reviewed-by: Josh Triplett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/debugobjects.c b/lib/debugobjects.c index b628247..547f7f9 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -1061,8 +1061,8 @@ static int __init debug_objects_replace_static_objects(void) } local_irq_enable(); - printk(KERN_DEBUG "ODEBUG: %d of %d active objects replaced\n", cnt, - obj_pool_used); + pr_debug("%d of %d active objects replaced\n", + cnt, obj_pool_used); return 0; free: hlist_for_each_entry_safe(obj, tmp, &objects, node) { -- cgit v0.10.2 From 1a5c4e2a0e339a01fdfbd519ba664d8efdc8d702 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:06 -0700 Subject: ocfs2: remove NULL assignments on static Static values are automatically initialized to NULL. Signed-off-by: Fabian Frederick Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index c6b90e6..a68e07a 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -108,7 +108,7 @@ static struct rb_root o2net_handler_tree = RB_ROOT; static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; /* XXX someday we'll need better accounting */ -static struct socket *o2net_listen_sock = NULL; +static struct socket *o2net_listen_sock; /* * listen work is only queued by the listening socket callbacks on the diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index e33cd7a..18f13c2 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -338,7 +338,7 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) #ifdef CONFIG_DEBUG_FS -static struct dentry *dlm_debugfs_root = NULL; +static struct dentry *dlm_debugfs_root; #define DLM_DEBUGFS_DIR "o2dlm" #define DLM_DEBUGFS_DLM_STATE "dlm_state" diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 5d32f75..66c2a49 100644 
--- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -52,7 +52,7 @@ #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" -static struct kmem_cache *dlm_lock_cache = NULL; +static struct kmem_cache *dlm_lock_cache; static DEFINE_SPINLOCK(dlm_cookie_lock); static u64 dlm_next_cookie = 1; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index ee1f884..3087a21 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -82,9 +82,9 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -static struct kmem_cache *dlm_lockres_cache = NULL; -static struct kmem_cache *dlm_lockname_cache = NULL; -static struct kmem_cache *dlm_mle_cache = NULL; +static struct kmem_cache *dlm_lockres_cache; +static struct kmem_cache *dlm_lockname_cache; +static struct kmem_cache *dlm_mle_cache; static void dlm_mle_release(struct kref *kref); static void dlm_init_mle(struct dlm_master_list_entry *mle, diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 83f1a66..5d965e8 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -709,7 +709,7 @@ static struct ctl_table ocfs2_root_table[] = { { } }; -static struct ctl_table_header *ocfs2_table_header = NULL; +static struct ctl_table_header *ocfs2_table_header; /* diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index a7cdd56..9027729 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -75,7 +75,7 @@ #include "buffer_head_io.h" -static struct kmem_cache *ocfs2_inode_cachep = NULL; +static struct kmem_cache *ocfs2_inode_cachep; struct kmem_cache *ocfs2_dquot_cachep; struct kmem_cache *ocfs2_qf_chunk_cachep; @@ -85,7 +85,7 @@ struct kmem_cache *ocfs2_qf_chunk_cachep; * workqueue and schedule on our own. 
*/ struct workqueue_struct *ocfs2_wq = NULL; -static struct dentry *ocfs2_debugfs_root = NULL; +static struct dentry *ocfs2_debugfs_root; MODULE_AUTHOR("Oracle"); MODULE_LICENSE("GPL"); diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 52eaf33..82e17b0 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -67,7 +67,7 @@ struct ocfs2_meta_cache_item { sector_t c_block; }; -static struct kmem_cache *ocfs2_uptodate_cachep = NULL; +static struct kmem_cache *ocfs2_uptodate_cachep; u64 ocfs2_metadata_cache_owner(struct ocfs2_caching_info *ci) { -- cgit v0.10.2 From 69201bb1132718f45078ba6454093f5e220c0350 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:07 -0700 Subject: fs/ocfs2/super.c: use OCFS2_MAX_VOL_LABEL_LEN and strlcpy Replace strncpy(size 63) by defined value. Signed-off-by: Fabian Frederick Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 9027729..c7a89cea 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2292,8 +2292,8 @@ static int ocfs2_initialize_super(struct super_block *sb, goto bail; } - strncpy(osb->vol_label, di->id2.i_super.s_label, 63); - osb->vol_label[63] = '\0'; + strlcpy(osb->vol_label, di->id2.i_super.s_label, + OCFS2_MAX_VOL_LABEL_LEN); osb->root_blkno = le64_to_cpu(di->id2.i_super.s_root_blkno); osb->system_dir_blkno = le64_to_cpu(di->id2.i_super.s_system_dir_blkno); osb->first_cluster_group_blkno = -- cgit v0.10.2 From b3821c3f866a1c02d1537ab67abe9a824d1353ae Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Wed, 4 Jun 2014 16:06:08 -0700 Subject: ocfs2: remove some redundant casting There are two standard techniques for dereferencing structures pointed to by void *: cast to the right type each time they're used, or assign to local variables of the right type. But there's no need to do *both*. 
Signed-off-by: George Spelvin Cc: Mark Fasheh Acked-by: Joel Becker Reviewed-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 6ba4bcb..714e53b 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -1408,10 +1408,9 @@ static void swap_refcount_rec(void *a, void *b, int size) { struct ocfs2_refcount_rec *l = a, *r = b, tmp; - tmp = *(struct ocfs2_refcount_rec *)l; - *(struct ocfs2_refcount_rec *)l = - *(struct ocfs2_refcount_rec *)r; - *(struct ocfs2_refcount_rec *)r = tmp; + tmp = *l; + *l = *r; + *r = tmp; } /* -- cgit v0.10.2 From 55b465b66809368b459674b9d205010730953c2e Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 4 Jun 2014 16:06:09 -0700 Subject: ocfs2: limit printk when journal is aborted Once JBD2_ABORT is set, ocfs2_commit_cache will fail in ocfs2_commit_thread. Then it will get into a loop with mass logs. This will meaninglessly consume a larger number of resource and may lead to the system hanging. So limit printk in this case. [akpm@linux-foundation.org: document the msleep] Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 03ea931..4b0c688 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -2185,8 +2186,20 @@ static int ocfs2_commit_thread(void *arg) || kthread_should_stop()); status = ocfs2_commit_cache(osb); - if (status < 0) - mlog_errno(status); + if (status < 0) { + static unsigned long abort_warn_time; + + /* Warn about this once per minute */ + if (printk_timed_ratelimit(&abort_warn_time, 60*HZ)) + mlog(ML_ERROR, "status = %d, journal is " + "already aborted.\n", status); + /* + * After ocfs2_commit_cache() fails, j_num_trans has a + * non-zero value. Sleep here to avoid a busy-wait + * loop. 
+ */ + msleep_interruptible(1000); + } if (kthread_should_stop() && atomic_read(&journal->j_num_trans)){ mlog(ML_KTHREAD, -- cgit v0.10.2 From 6718cb5e0e75faa2b938dc1ee247fbd70136ccd7 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 4 Jun 2014 16:06:10 -0700 Subject: ocfs2/dlm: fix possible conversion deadlock We found there is a conversion deadlock when the owner of lockres happened to crash before sending DLM_PROXY_AST_MSG for a downconverting lock. The situation is as follows: Node1 Node2 Node3 the owner of lockresA lock_1 granted at EX mode and call ocfs2_cluster_unlock to decrease ex_holders. converting lock_3 from NL to EX send DLM_PROXY_AST_MSG to Node1, asking Node 1 to downconvert. receiving DLM_PROXY_AST_MSG, thread ocfs2dc send DLM_CONVERT_LOCK_MSG to Node2 to downconvert lock_1(EX->NL). lock_1 can be granted and put it into pending_asts list, return DLM_NORMAL. then something happened and Node2 crashed. received DLM_NORMAL, waiting for DLM_PROXY_AST_MSG. selected as the recovery master, receiving migrate lock from Node1, queue lock_1 to the tail of converting list. After dlm recovery, converting list in the master of lockresA(Node3) will be: converting list head <-> lock_3(NL->EX) <-> lock_1(EX->NL). Requested mode of lock_3 is not compatible with the granted mode of lock_1, so it can not be granted, and lock_1 can not downconvert because the converting queue is strictly FIFO. So a deadlock is created. We think function dlm_process_recovery_data() should queue_ast for lock_1 or alter the order of lock_1 and lock_3, so dlm_thread can process lock_1 first. And if there are multiple downconverting locks, they must convert from PR to NL, so no need to sort them.
Signed-off-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index fe29f79..5de0194 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1986,7 +1986,15 @@ skip_lvb: } if (!bad) { dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + if (mres->flags & DLM_MRES_RECOVERY && + ml->list == DLM_CONVERTING_LIST && + newlock->ml.type > + newlock->ml.convert_type) { + /* newlock is doing downconvert, add it to the + * head of converting list */ + list_add(&newlock->list, queue); + } else + list_add_tail(&newlock->list, queue); mlog(0, "%s:%.*s: added lock for node %u, " "setting refmap bit\n", dlm->name, res->lockname.len, res->lockname.name, ml->node); -- cgit v0.10.2 From c253ed1f6fd68df8542efed2d6b7656c3324f3e4 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:12 -0700 Subject: fs/ocfs2/ioctl.c: add static to local functions ocfs_info_foo() and ocfs2_get_request_ptr functions are only used in ioctl.c Signed-off-by: Fabian Frederick Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 490229f..6f66b37 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -143,8 +143,8 @@ bail: return status; } -int ocfs2_info_handle_blocksize(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_blocksize(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_blocksize oib; @@ -167,8 +167,8 @@ bail: return status; } -int ocfs2_info_handle_clustersize(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_clustersize(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_clustersize oic; @@ -192,8 +192,8 @@ bail: return status; } -int 
ocfs2_info_handle_maxslots(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_maxslots(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_maxslots oim; @@ -217,8 +217,8 @@ bail: return status; } -int ocfs2_info_handle_label(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_label(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_label oil; @@ -242,8 +242,8 @@ bail: return status; } -int ocfs2_info_handle_uuid(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_uuid(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_uuid oiu; @@ -267,8 +267,8 @@ bail: return status; } -int ocfs2_info_handle_fs_features(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_fs_features(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_fs_features oif; @@ -294,8 +294,8 @@ bail: return status; } -int ocfs2_info_handle_journal_size(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_journal_size(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_journal_size oij; @@ -319,9 +319,10 @@ bail: return status; } -int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, - struct inode *inode_alloc, u64 blkno, - struct ocfs2_info_freeinode *fi, u32 slot) +static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, + struct inode *inode_alloc, u64 blkno, + struct ocfs2_info_freeinode *fi, + u32 slot) { int status = 0, unlock = 0; @@ -366,8 +367,8 @@ bail: return status; } -int ocfs2_info_handle_freeinode(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freeinode(struct inode *inode, + struct 
ocfs2_info_request __user *req) { u32 i; u64 blkno = -1; @@ -462,19 +463,19 @@ static void o2ffg_update_stats(struct ocfs2_info_freefrag_stats *stats, stats->ffs_free_chunks_real++; } -void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, - unsigned int chunksize) +static void ocfs2_info_update_ffg(struct ocfs2_info_freefrag *ffg, + unsigned int chunksize) { o2ffg_update_histogram(&(ffg->iff_ffs.ffs_fc_hist), chunksize); o2ffg_update_stats(&(ffg->iff_ffs), chunksize); } -int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, - struct inode *gb_inode, - struct ocfs2_dinode *gb_dinode, - struct ocfs2_chain_rec *rec, - struct ocfs2_info_freefrag *ffg, - u32 chunks_in_group) +static int ocfs2_info_freefrag_scan_chain(struct ocfs2_super *osb, + struct inode *gb_inode, + struct ocfs2_dinode *gb_dinode, + struct ocfs2_chain_rec *rec, + struct ocfs2_info_freefrag *ffg, + u32 chunks_in_group) { int status = 0, used; u64 blkno; @@ -572,9 +573,9 @@ bail: return status; } -int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, - struct inode *gb_inode, u64 blkno, - struct ocfs2_info_freefrag *ffg) +static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, + struct inode *gb_inode, u64 blkno, + struct ocfs2_info_freefrag *ffg) { u32 chunks_in_group; int status = 0, unlock = 0, i; @@ -652,8 +653,8 @@ bail: return status; } -int ocfs2_info_handle_freefrag(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_freefrag(struct inode *inode, + struct ocfs2_info_request __user *req) { u64 blkno = -1; char namebuf[40]; @@ -723,8 +724,8 @@ out_err: return status; } -int ocfs2_info_handle_unknown(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_unknown(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; @@ -752,8 +753,8 @@ bail: * - distinguish different requests. * - validate size of different requests. 
*/ -int ocfs2_info_handle_request(struct inode *inode, - struct ocfs2_info_request __user *req) +static int ocfs2_info_handle_request(struct inode *inode, + struct ocfs2_info_request __user *req) { int status = -EFAULT; struct ocfs2_info_request oir; @@ -811,8 +812,8 @@ bail: return status; } -int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, - u64 *req_addr, int compat_flag) +static int ocfs2_get_request_ptr(struct ocfs2_info *info, int idx, + u64 *req_addr, int compat_flag) { int status = -EFAULT; u64 __user *bp = NULL; @@ -849,8 +850,8 @@ bail: * a better backward&forward compatibility, since a small piece of * request will be less likely to be broken if disk layout get changed. */ -int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, - int compat_flag) +static int ocfs2_info_handle(struct inode *inode, struct ocfs2_info *info, + int compat_flag) { int i, status = 0; u64 req_addr; -- cgit v0.10.2 From a9e9acaeb0a981a6dfa54b32dd756103aeefa6a7 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 4 Jun 2014 16:06:13 -0700 Subject: ocfs2: fix umount hang while shutting down truncate log Revert commit 75f82eaa502c ("ocfs2: fix NULL pointer dereference when dismount and ocfs2rec simultaneously") because it may cause a umount hang while shutting down the truncate log. fix NULL pointer dereference when dismount and ocfs2rec simultaneously The situation is as follows: ocfs2_dismount_volume -> ocfs2_recovery_exit -> free osb->recovery_map -> ocfs2_truncate_shutdown -> lock global bitmap inode -> ocfs2_wait_for_recovery -> check whether osb->recovery_map->rm_used is zero Because osb->recovery_map is already freed, rm_used can be any other values, so it may yield umount hang. To prevent NULL pointer dereference while getting sys_root_inode, we use an osb_tl_disable flag to disable scheduling of osb_truncate_log_wq after truncate log shutdown.
Signed-off-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b4deb5f..9d8fcf2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6046,7 +6046,8 @@ static void ocfs2_truncate_log_worker(struct work_struct *work) void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, int cancel) { - if (osb->osb_tl_inode) { + if (osb->osb_tl_inode && + atomic_read(&osb->osb_tl_disable) == 0) { /* We want to push off log flushes while truncates are * still running. */ if (cancel) @@ -6223,6 +6224,8 @@ void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; + atomic_set(&osb->osb_tl_disable, 1); + if (tl_inode) { cancel_delayed_work(&osb->osb_truncate_log_wq); flush_workqueue(ocfs2_wq); @@ -6254,6 +6257,7 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) * until we're sure all is well. */ INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, ocfs2_truncate_log_worker); + atomic_set(&osb->osb_tl_disable, 0); osb->osb_tl_bh = tl_bh; osb->osb_tl_inode = tl_inode; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 8d64a97..bbec539 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -422,6 +422,7 @@ struct ocfs2_super struct inode *osb_tl_inode; struct buffer_head *osb_tl_bh; struct delayed_work osb_truncate_log_wq; + atomic_t osb_tl_disable; /* * How many clusters in our truncate log. * It must be protected by osb_tl_inode->i_mutex. -- cgit v0.10.2 From 01c6222f876062355599e5a63560c514b6de25d2 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 4 Jun 2014 16:06:14 -0700 Subject: ocfs2/dlm: disallow node joining when recovery is on going We found a race situation when dlm recovery and node joining occurs simultaneously if the network state is bad. 
N1 N4 start joining dlm and send query join to all live nodes set joining node to N1, return OK send query join to other live nodes and it may take a while call dlm_send_join_assert() to send assert join message when N2 is down, so keep trying to send message to N2 until find N2 is down send assert join message to N3, but connection is down with N3, so it may take a while become the recovery master for N2 and send begin reco message to other nodes in domain map but no N1 connection with N3 is rebuild, then send assert join to N4 call dlm_assert_joined_handler(), add N1 to domain_map dlm recovery done, send finalize message to nodes in domain map, including N1 receiving finalize message, trigger the BUG() because recovery master mismatch. Signed-off-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index c973690..8d46907 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -959,6 +959,14 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data, * domain. Set him in the map and clean up our * leftover join state. 
*/ BUG_ON(dlm->joining_node != assert->node_idx); + + if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) { + mlog(0, "dlm recovery is ongoing, disallow join\n"); + spin_unlock(&dlm->spinlock); + spin_unlock(&dlm_domain_lock); + return -EAGAIN; + } + set_bit(assert->node_idx, dlm->domain_map); clear_bit(assert->node_idx, dlm->exit_domain_map); __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); @@ -1517,6 +1525,7 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, unsigned int node) { int status; + int ret; struct dlm_assert_joined assert_msg; mlog(0, "Sending join assert to node %u\n", node); @@ -1528,11 +1537,13 @@ static int dlm_send_one_join_assert(struct dlm_ctxt *dlm, status = o2net_send_message(DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, &assert_msg, sizeof(assert_msg), node, - NULL); + &ret); if (status < 0) mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " "node %u\n", status, DLM_ASSERT_JOINED_MSG, DLM_MOD_KEY, node); + else + status = ret; return status; } -- cgit v0.10.2 From b7ac233515c6263d6cb471d9e0f57bdd7ea36894 Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 4 Jun 2014 16:06:15 -0700 Subject: ocfs2: cleanup unused parameters in ocfs2_calc_new_backup_super Parameters new_clusters and first_new_cluster are not used in ocfs2_calc_new_backup_super, so remove them.
Signed-off-by: Joseph Qi Reviewed-by: joyce.xue Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 822ebc1..168739c 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -53,8 +53,6 @@ */ static u16 ocfs2_calc_new_backup_super(struct inode *inode, struct ocfs2_group_desc *gd, - int new_clusters, - u32 first_new_cluster, u16 cl_cpg, int set) { @@ -127,8 +125,6 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, OCFS2_FEATURE_COMPAT_BACKUP_SB)) { backups = ocfs2_calc_new_backup_super(bm_inode, group, - new_clusters, - first_new_cluster, cl_cpg, 1); le16_add_cpu(&group->bg_free_bits_count, -1 * backups); } @@ -167,8 +163,6 @@ out_rollback: if (ret < 0) { ocfs2_calc_new_backup_super(bm_inode, group, - new_clusters, - first_new_cluster, cl_cpg, 0); le16_add_cpu(&group->bg_free_bits_count, backups); le16_add_cpu(&group->bg_bits, -1 * num_bits); -- cgit v0.10.2 From 17bf1418b78d9c51370f29c2b840f7d0a1bf311a Mon Sep 17 00:00:00 2001 From: Joseph Qi Date: Wed, 4 Jun 2014 16:06:17 -0700 Subject: ocfs2: fix incorrect i_size of global bitmap inode after resize Ocfs2 cluster size may be 1MB, which has 20 bits. When resize, the input new clusters is mostly the number of clusters in a group descriptor(32256). Since the input clusters is defined as type int, so it will overflow when shift left 20 bits and then lead to incorrect global bitmap i_size. 
Signed-off-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 168739c..d5da6f6 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -153,7 +153,7 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, spin_lock(&OCFS2_I(bm_inode)->ip_lock); OCFS2_I(bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, new_clusters << osb->s_clustersize_bits); + le64_add_cpu(&fe->i_size, (u64)new_clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(bm_inode)->ip_lock); i_size_write(bm_inode, le64_to_cpu(fe->i_size)); @@ -563,7 +563,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) spin_lock(&OCFS2_I(main_bm_inode)->ip_lock); OCFS2_I(main_bm_inode)->ip_clusters = le32_to_cpu(fe->i_clusters); - le64_add_cpu(&fe->i_size, input->clusters << osb->s_clustersize_bits); + le64_add_cpu(&fe->i_size, (u64)input->clusters << osb->s_clustersize_bits); spin_unlock(&OCFS2_I(main_bm_inode)->ip_lock); i_size_write(main_bm_inode, le64_to_cpu(fe->i_size)); -- cgit v0.10.2 From e72db989e1c01fde28aabf7fd29faeaa08538e24 Mon Sep 17 00:00:00 2001 From: Xue jiufei Date: Wed, 4 Jun 2014 16:06:24 -0700 Subject: ocfs2: remove some unused code dlm_recovery_ctxt.received is unused. ocfs2_should_refresh_lock_res() can only return 0 or 1, so the error handling code in ocfs2_super_lock() is unneeded. 
Signed-off-by: joyce.xue Cc: Joel Becker Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index e051776..a106b3f 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -108,7 +108,6 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) struct dlm_recovery_ctxt { struct list_head resources; - struct list_head received; struct list_head node_data; u8 new_master; u8 dead_node; diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8d46907..39efc505 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -2034,7 +2034,6 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, INIT_LIST_HEAD(&dlm->list); INIT_LIST_HEAD(&dlm->dirty_list); INIT_LIST_HEAD(&dlm->reco.resources); - INIT_LIST_HEAD(&dlm->reco.received); INIT_LIST_HEAD(&dlm->reco.node_data); INIT_LIST_HEAD(&dlm->purge_list); INIT_LIST_HEAD(&dlm->dlm_domain_handlers); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6bd690b..52cfe99 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2544,11 +2544,6 @@ int ocfs2_super_lock(struct ocfs2_super *osb, * refreshed, so we do it here. 
Of course, making sense of * everything is up to the caller :) */ status = ocfs2_should_refresh_lock_res(lockres); - if (status < 0) { - ocfs2_cluster_unlock(osb, lockres, level); - mlog_errno(status); - goto bail; - } if (status) { status = ocfs2_refresh_slot_info(osb); -- cgit v0.10.2 From bdbeacdea24bad95bb72b6712fd9f716206c3142 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:25 -0700 Subject: fs/9p/v9fs.c: add __init to v9fs_sysfs_init v9fs_sysfs_init is only called by __init init_v9fs Signed-off-by: Fabian Frederick Cc: Eric Van Hensbergen Cc: Ron Minnich Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 14da825..6894b08 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -537,7 +537,7 @@ static struct attribute_group v9fs_attr_group = { * */ -static int v9fs_sysfs_init(void) +static int __init v9fs_sysfs_init(void) { v9fs_kobj = kobject_create_and_add("9p", fs_kobj); if (!v9fs_kobj) -- cgit v0.10.2 From fd2916bd77109b69891573fd1e1205ecc619893e Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:26 -0700 Subject: fs/9p: kerneldoc fixes Function parameters comment fixing. 
Signed-off-by: Fabian Frederick Cc: Eric Van Hensbergen Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 4d0c2e0..0b3bfa3 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -42,7 +42,6 @@ /** * struct p9_rdir - readdir accounting - * @mutex: mutex protecting readdir * @head: start offset of current dirread buffer * @tail: end offset of current dirread buffer * @buf: dirread buffer diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 59e3fe3..96e5507 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -681,7 +681,7 @@ v9fs_direct_read(struct file *filp, char __user *udata, size_t count, /** * v9fs_cached_file_read - read from a file * @filp: file pointer to read - * @udata: user data buffer to read data into + * @data: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * @@ -698,7 +698,7 @@ v9fs_cached_file_read(struct file *filp, char __user *data, size_t count, /** * v9fs_mmap_file_read - read from a file * @filp: file pointer to read - * @udata: user data buffer to read data into + * @data: user data buffer to read data into * @count: size of buffer * @offset: offset at which to read data * diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 53161ec..00d140f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -580,7 +580,7 @@ static int v9fs_at_to_dotl_flags(int flags) * v9fs_remove - helper function to remove files and directories * @dir: directory inode that is being deleted * @dentry: dentry that is being deleted - * @rmdir: removing a directory + * @flags: removing a directory * */ @@ -778,7 +778,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode * v9fs_vfs_lookup - VFS lookup hook to "walk" to a new inode * @dir: inode that is being walked from * @dentry: dentry that is being walked to? 
- * @nameidata: path data + * @flags: lookup flags (unused) * */ @@ -1324,7 +1324,7 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) * v9fs_vfs_mkspecial - create a special file * @dir: inode to create special file in * @dentry: dentry to create - * @mode: mode to create special file + * @perm: mode to create special file * @extension: 9p2000.u format extension string representing special file * */ diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 59dc8e8..1fa85aa 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -226,7 +226,7 @@ int v9fs_open_to_dotl_flags(int flags) * v9fs_vfs_create_dotl - VFS hook to create files for 9P2000.L protocol. * @dir: directory inode that is being created * @dentry: dentry that is being deleted - * @mode: create permissions + * @omode: create permissions * */ @@ -375,7 +375,7 @@ err_clunk_old_fid: * v9fs_vfs_mkdir_dotl - VFS mkdir hook to create a directory * @dir: inode that is being unlinked * @dentry: dentry that is being unlinked - * @mode: mode for new directory + * @omode: mode for new directory * */ @@ -607,7 +607,6 @@ int v9fs_vfs_setattr_dotl(struct dentry *dentry, struct iattr *iattr) * v9fs_stat2inode_dotl - populate an inode structure with stat info * @stat: stat structure * @inode: inode to populate - * @sb: superblock of filesystem * */ @@ -808,7 +807,7 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, * v9fs_vfs_mknod_dotl - create a special file * @dir: inode destination for new link * @dentry: dentry for file - * @mode: mode for creation + * @omode: mode for creation * @rdev: device associated with special file * */ -- cgit v0.10.2 From ac13a829f6adb674015ab399594c089990104af7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:27 -0700 Subject: fs/libfs.c: add generic data flush to fsync Description by Jan Kara: "A lot of older filesystems don't properly flush volatile disk caches on fsync(2) which can lead to loss 
of fsynced data after power failure. This patch makes generic_file_fsync() issue proper cache flush to fix the problem. Sysadmin can use /sys/devices/.../cache_type to tell the system it should not send the cache flush." [akpm@linux-foundation.org: nuke ifdef] [akpm@linux-foundation.org: fix warning] Signed-off-by: Fabian Frederick Suggested-by: Jan Kara Suggested-by: Christoph Hellwig Cc: Jan Kara Cc: Christoph Hellwig Cc: Alexander Viro Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/libfs.c b/fs/libfs.c index a184424..88e3e00 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -3,6 +3,7 @@ * Library for filesystems writers. */ +#include #include #include #include @@ -923,16 +924,19 @@ struct dentry *generic_fh_to_parent(struct super_block *sb, struct fid *fid, EXPORT_SYMBOL_GPL(generic_fh_to_parent); /** - * generic_file_fsync - generic fsync implementation for simple filesystems + * __generic_file_fsync - generic fsync implementation for simple filesystems + * * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) * @datasync: only synchronize essential metadata if true * * This is a generic implementation of the fsync method for simple * filesystems which track all non-inode metadata in the buffers list * hanging off the address_space structure. 
*/ -int generic_file_fsync(struct file *file, loff_t start, loff_t end, - int datasync) +int __generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) { struct inode *inode = file->f_mapping->host; int err; @@ -952,10 +956,34 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end, err = sync_inode_metadata(inode, 1); if (ret == 0) ret = err; + out: mutex_unlock(&inode->i_mutex); return ret; } +EXPORT_SYMBOL(__generic_file_fsync); + +/** + * generic_file_fsync - generic fsync implementation for simple filesystems + * with flush + * @file: file to synchronize + * @start: start offset in bytes + * @end: end offset in bytes (inclusive) + * @datasync: only synchronize essential metadata if true + * + */ + +int generic_file_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct inode *inode = file->f_mapping->host; + int err; + + err = __generic_file_fsync(file, start, end, datasync); + if (err) + return err; + return blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); +} EXPORT_SYMBOL(generic_file_fsync); /** diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 8aba35f..45cf6e5 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1607,6 +1607,9 @@ struct block_device_operations { extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long); #else /* CONFIG_BLOCK */ + +struct block_device; + /* * stubs for when the block layer is configured out */ @@ -1642,6 +1645,12 @@ static inline bool blk_needs_flush_plug(struct task_struct *tsk) return false; } +static inline int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, + sector_t *error_sector) +{ + return 0; +} + #endif /* CONFIG_BLOCK */ #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index 8780312..c3f46e4 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2590,6 +2590,7 @@ extern ssize_t simple_read_from_buffer(void __user *to, size_t count, extern 
ssize_t simple_write_to_buffer(void *to, size_t available, loff_t *ppos, const void __user *from, size_t count); +extern int __generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_file_fsync(struct file *, loff_t, loff_t, int); extern int generic_check_addressable(unsigned, u64); -- cgit v0.10.2 From 4468dd76f51f8be75d4f04f1d721e379596e7262 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:06:29 -0700 Subject: x86: require x86-64 for automatic NUMA balancing 32-bit support for NUMA is an oddity on its own but with automatic NUMA balancing on top there is a reasonable risk that the CPUPID information cannot be stored in the page flags. This patch removes support for automatic NUMA support on 32-bit x86. Signed-off-by: Mel Gorman Cc: David Vrabel Cc: Ingo Molnar Cc: Peter Anvin Cc: Fengguang Wu Cc: Linus Torvalds Cc: Steven Noonan Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Srikar Dronamraju Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e41b258..896a411a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,7 +26,7 @@ config X86 select ARCH_MIGHT_HAVE_PC_SERIO select HAVE_AOUT if X86_32 select HAVE_UNSTABLE_SCHED_CLOCK - select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_INT128 if X86_64 select ARCH_WANTS_PROT_NUMA_PROT_NONE select HAVE_IDE -- cgit v0.10.2 From c46a7c817e662a820373bb76b88d0ad67d6abe5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:06:30 -0700 Subject: x86: define _PAGE_NUMA by reusing software bits on the PMD and PTE levels _PAGE_NUMA is currently an alias of _PROT_PROTNONE to trap NUMA hinting faults on x86. Care is taken such that _PAGE_NUMA is used only in situations where the VMA flags distinguish between NUMA hinting faults and prot_none faults. 
This decision was x86-specific and conceptually it is difficult requiring special casing to distinguish between PROTNONE and NUMA ptes based on context. Fundamentally, we only need the _PAGE_NUMA bit to tell the difference between an entry that is really unmapped and a page that is protected for NUMA hinting faults as if the PTE is not present then a fault will be trapped. Swap PTEs on x86-64 use the bits after _PAGE_GLOBAL for the offset. This patch shrinks the maximum possible swap size and uses the bit to uniquely distinguish between NUMA hinting ptes and swap ptes. Signed-off-by: Mel Gorman Cc: David Vrabel Cc: Ingo Molnar Cc: Peter Anvin Cc: Fengguang Wu Cc: Linus Torvalds Cc: Steven Noonan Cc: Rik van Riel Cc: Peter Zijlstra Cc: Andrea Arcangeli Cc: Dave Hansen Cc: Srikar Dronamraju Cc: Cyrill Gorcunov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 3ebb188..d98c1ec 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -44,6 +44,12 @@ static inline int pte_present(pte_t pte) return pte_val(pte) & (_PAGE_PRESENT | _PAGE_NUMA); } +#define pte_present_nonuma pte_present_nonuma +static inline int pte_present_nonuma(pte_t pte) +{ + return pte_val(pte) & (_PAGE_PRESENT); +} + #define pte_numa pte_numa static inline int pte_numa(pte_t pte) { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index b459ddf..66276c1 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -131,7 +131,8 @@ static inline int pte_exec(pte_t pte) static inline int pte_special(pte_t pte) { - return pte_flags(pte) & _PAGE_SPECIAL; + return (pte_flags(pte) & (_PAGE_PRESENT|_PAGE_SPECIAL)) == + (_PAGE_PRESENT|_PAGE_SPECIAL); } static inline unsigned long pte_pfn(pte_t pte) @@ -452,6 +453,12 @@ static inline int pte_present(pte_t a) _PAGE_NUMA); } +#define pte_present_nonuma pte_present_nonuma +static 
inline int pte_present_nonuma(pte_t a) +{ + return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE); +} + #define pte_accessible pte_accessible static inline bool pte_accessible(struct mm_struct *mm, pte_t a) { @@ -860,19 +867,19 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_set_flags(pte, _PAGE_SWP_SOFT_DIRTY); } static inline int pte_swp_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_flags(pte) & _PAGE_SWP_SOFT_DIRTY; } static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) { - VM_BUG_ON(pte_present(pte)); + VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index e22c1db..6d6ecd0 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -145,8 +145,16 @@ static inline int pgd_large(pgd_t pgd) { return 0; } /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) +#ifdef CONFIG_NUMA_BALANCING +/* Automatic NUMA balancing needs to be distinguishable from swap entries */ +#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 2) +#else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +#endif #else +#ifdef CONFIG_NUMA_BALANCING +#error Incompatible format for automatic NUMA balancing +#endif #define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) #endif diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index eb3d449..f216963 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -16,15 +16,26 @@ #define _PAGE_BIT_PSE 7 /* 4 MB (or 2MB) page */ #define _PAGE_BIT_PAT 7 /* on 4KB pages */ #define 
_PAGE_BIT_GLOBAL 8 /* Global TLB entry PPro+ */ -#define _PAGE_BIT_UNUSED1 9 /* available for programmer */ -#define _PAGE_BIT_IOMAP 10 /* flag used to indicate IO mapping */ -#define _PAGE_BIT_HIDDEN 11 /* hidden by kmemcheck */ +#define _PAGE_BIT_SOFTW1 9 /* available for programmer */ +#define _PAGE_BIT_SOFTW2 10 /* " */ +#define _PAGE_BIT_SOFTW3 11 /* " */ #define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ -#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 -#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 -#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */ +#define _PAGE_BIT_SPECIAL _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_CPA_TEST _PAGE_BIT_SOFTW1 +#define _PAGE_BIT_SPLITTING _PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */ +#define _PAGE_BIT_IOMAP _PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */ +#define _PAGE_BIT_HIDDEN _PAGE_BIT_SOFTW3 /* hidden by kmemcheck */ +#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_SOFTW3 /* software dirty tracking */ #define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ +/* + * Swap offsets on configurations that allow automatic NUMA balancing use the + * bits after _PAGE_BIT_GLOBAL. To uniquely distinguish NUMA hinting PTEs from + * swap entries, we use the first bit after _PAGE_BIT_GLOBAL and shrink the + * maximum possible swap space from 16TB to 8TB. 
+ */ +#define _PAGE_BIT_NUMA (_PAGE_BIT_GLOBAL+1) + /* If _PAGE_BIT_PRESENT is clear, we use these: */ /* - if the user mapped it with PROT_NONE; pte_present gives true */ #define _PAGE_BIT_PROTNONE _PAGE_BIT_GLOBAL @@ -40,7 +51,7 @@ #define _PAGE_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_DIRTY) #define _PAGE_PSE (_AT(pteval_t, 1) << _PAGE_BIT_PSE) #define _PAGE_GLOBAL (_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL) -#define _PAGE_UNUSED1 (_AT(pteval_t, 1) << _PAGE_BIT_UNUSED1) +#define _PAGE_SOFTW1 (_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1) #define _PAGE_IOMAP (_AT(pteval_t, 1) << _PAGE_BIT_IOMAP) #define _PAGE_PAT (_AT(pteval_t, 1) << _PAGE_BIT_PAT) #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) @@ -61,8 +72,6 @@ * they do not conflict with each other. */ -#define _PAGE_BIT_SOFT_DIRTY _PAGE_BIT_HIDDEN - #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY (_AT(pteval_t, 1) << _PAGE_BIT_SOFT_DIRTY) #else @@ -70,6 +79,21 @@ #endif /* + * _PAGE_NUMA distinguishes between a numa hinting minor fault and a page + * that is not present. The hinting fault gathers numa placement statistics + * (see pte_numa()). The bit is always zero when the PTE is not present. + * + * The bit picked must be always zero when the pmd is present and not + * present, so that we don't lose information when we set it while + * atomically clearing the present bit. + */ +#ifdef CONFIG_NUMA_BALANCING +#define _PAGE_NUMA (_AT(pteval_t, 1) << _PAGE_BIT_NUMA) +#else +#define _PAGE_NUMA (_AT(pteval_t, 0)) +#endif + +/* * Tracking soft dirty bit when a page goes to a swap is tricky. * We need a bit which can be stored in pte _and_ not conflict * with swap entry format. On x86 bits 6 and 7 are *not* involved @@ -94,26 +118,6 @@ #define _PAGE_FILE (_AT(pteval_t, 1) << _PAGE_BIT_FILE) #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -/* - * _PAGE_NUMA indicates that this page will trigger a numa hinting - * minor page fault to gather numa placement statistics (see - * pte_numa()). 
The bit picked (8) is within the range between - * _PAGE_FILE (6) and _PAGE_PROTNONE (8) bits. Therefore, it doesn't - * require changes to the swp entry format because that bit is always - * zero when the pte is not present. - * - * The bit picked must be always zero when the pmd is present and not - * present, so that we don't lose information when we set it while - * atomically clearing the present bit. - * - * Because we shared the same bit (8) with _PAGE_PROTNONE this can be - * interpreted as _PAGE_NUMA only in places that _PAGE_PROTNONE - * couldn't reach, like handle_mm_fault() (see access_error in - * arch/x86/mm/fault.c, the vma protection must not be PROT_NONE for - * handle_mm_fault() to be invoked). - */ -#define _PAGE_NUMA _PAGE_PROTNONE - #define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ @@ -122,8 +126,8 @@ /* Set of bits not changed in pte_modify */ #define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY | \ - _PAGE_SOFT_DIRTY) -#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE) + _PAGE_SOFT_DIRTY | _PAGE_NUMA) +#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE | _PAGE_NUMA) #define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) #define _PAGE_CACHE_WB (0) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 461bc82..6629f39 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -35,7 +35,7 @@ enum { static int pte_testbit(pte_t pte) { - return pte_flags(pte) & _PAGE_UNUSED1; + return pte_flags(pte) & _PAGE_SOFTW1; } struct split_state { diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index a8015a7..53b2acc 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -233,6 +233,10 @@ static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) # define pte_accessible(mm, pte) ((void)(pte), 1) #endif 
+#ifndef pte_present_nonuma +#define pte_present_nonuma(pte) pte_present(pte) +#endif + #ifndef flush_tlb_fix_spurious_fault #define flush_tlb_fix_spurious_fault(vma, address) flush_tlb_page(vma, address) #endif @@ -670,7 +674,7 @@ static inline int pmd_trans_unstable(pmd_t *pmd) static inline int pte_numa(pte_t pte) { return (pte_flags(pte) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; + (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; } #endif @@ -678,7 +682,7 @@ static inline int pte_numa(pte_t pte) static inline int pmd_numa(pmd_t pmd) { return (pmd_flags(pmd) & - (_PAGE_NUMA|_PAGE_PRESENT)) == _PAGE_NUMA; + (_PAGE_NUMA|_PAGE_PROTNONE|_PAGE_PRESENT)) == _PAGE_NUMA; } #endif diff --git a/include/linux/swapops.h b/include/linux/swapops.h index c0f7526..6adfb7b 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry) /* check whether a pte points to a swap entry */ static inline int is_swap_pte(pte_t pte) { - return !pte_none(pte) && !pte_present(pte) && !pte_file(pte); + return !pte_none(pte) && !pte_present_nonuma(pte) && !pte_file(pte); } #endif diff --git a/mm/memory.c b/mm/memory.c index e302ae1..0897830 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -756,7 +756,7 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn = pte_pfn(pte); if (HAVE_PTE_SPECIAL) { - if (likely(!pte_special(pte))) + if (likely(!pte_special(pte) || pte_numa(pte))) goto check_pfn; if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) return NULL; @@ -782,14 +782,15 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, } } - if (is_zero_pfn(pfn)) - return NULL; check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); return NULL; } + if (is_zero_pfn(pfn)) + return NULL; + /* * NOTE! We still have PageReserved() pages in the page tables. * eg. VDSO mappings can cause them to exist. 
@@ -1722,13 +1723,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); /* - * If FOLL_FORCE and FOLL_NUMA are both set, handle_mm_fault - * would be called on PROT_NONE ranges. We must never invoke - * handle_mm_fault on PROT_NONE ranges or the NUMA hinting - * page faults would unprotect the PROT_NONE ranges if - * _PAGE_NUMA and _PAGE_PROTNONE are sharing the same pte/pmd - * bitflag. So to avoid that, don't set FOLL_NUMA if - * FOLL_FORCE is set. + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space */ if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA; -- cgit v0.10.2 From 982792c782ef337381e982fd2047391886f89693 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 4 Jun 2014 16:06:32 -0700 Subject: x86, mm: probe memory block size for generic x86 64bit On a system with 2TiB of RAM, current x86_64 has 128M as the section size, and one memory_block only includes one section. So there will be 16400 entries under /sys/devices/system/memory/. Current code tries to use the block id to find the block pointer in /sys for any section, and reuses that block pointer. That lookup will take some time even after commit 7c243c7168dc ("mm: speedup in __early_pfn_to_nid") that will skip the search in that case during booting up. So a solution could be to increase the block size just like the SGI UV system did (hard-coded to 2g). This patch probes the block size to make it match the mmio remap size. For example, Intel Nehalem and later systems will have memory ranges [0, TOML), [4g, TOMH]. If the memory hole is 2g and total is 128g, TOM will be 2g, and TOM2 will be 130g. We could use 2g as the block size instead of the default 128M. That will reduce the number of entries in /sys/devices/system/memory/. On a 6TiB system this will reduce boot time by 35 seconds. Signed-off-by: Yinghai Lu Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index f35c66c..b92591f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1230,17 +1230,43 @@ const char *arch_vma_name(struct vm_area_struct *vma) return NULL; } -#ifdef CONFIG_X86_UV -unsigned long memory_block_size_bytes(void) +static unsigned long probe_memory_block_size(void) { + /* start from 2g */ + unsigned long bz = 1UL<<31; + +#ifdef CONFIG_X86_UV if (is_uv_system()) { printk(KERN_INFO "UV: memory block size 2GB\n"); return 2UL * 1024 * 1024 * 1024; } - return MIN_MEMORY_BLOCK_SIZE; -} #endif + /* less than 64g installed */ + if ((max_pfn << PAGE_SHIFT) < (16UL << 32)) + return MIN_MEMORY_BLOCK_SIZE; + + /* get the tail size */ + while (bz > MIN_MEMORY_BLOCK_SIZE) { + if (!((max_pfn << PAGE_SHIFT) & (bz - 1))) + break; + bz >>= 1; + } + + printk(KERN_DEBUG "memory block size : %ldMB\n", bz >> 20); + + return bz; +} + +static unsigned long memory_block_size_probed; +unsigned long memory_block_size_bytes(void) +{ + if (!memory_block_size_probed) + memory_block_size_probed = probe_memory_block_size(); + + return memory_block_size_probed; +} + #ifdef CONFIG_SPARSEMEM_VMEMMAP /* * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 
-- cgit v0.10.2 From f9f58285947d9c88079bfb7b7666c987011e3377 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:34 -0700 Subject: mm/slub.c: convert printk to pr_foo() All printk(KERN_foo converted to pr_foo() Default printk converted to pr_warn() Coalesce format fragments Signed-off-by: Fabian Frederick Acked-by: Christoph Lameter Cc: Joe Perches Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index 2b1ce69..1594b14 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -403,7 +403,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif return 0; @@ -444,7 +444,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, stat(s, CMPXCHG_DOUBLE_FAIL); #ifdef SLUB_DEBUG_CMPXCHG - printk(KERN_INFO "%s %s: cmpxchg double redo ", n, s->name); + pr_info("%s %s: cmpxchg double redo ", n, s->name); #endif return 0; @@ -546,14 +546,14 @@ static void print_track(const char *s, struct track *t) if (!t->addr) return; - printk(KERN_ERR "INFO: %s in %pS age=%lu cpu=%u pid=%d\n", - s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); + pr_err("INFO: %s in %pS age=%lu cpu=%u pid=%d\n", + s, (void *)t->addr, jiffies - t->when, t->cpu, t->pid); #ifdef CONFIG_STACKTRACE { int i; for (i = 0; i < TRACK_ADDRS_COUNT; i++) if (t->addrs[i]) - printk(KERN_ERR "\t%pS\n", (void *)t->addrs[i]); + pr_err("\t%pS\n", (void *)t->addrs[i]); else break; } @@ -571,8 +571,7 @@ static void print_tracking(struct kmem_cache *s, void *object) static void print_page_info(struct page *page) { - printk(KERN_ERR - "INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", + pr_err("INFO: Slab 0x%p objects=%u used=%u fp=0x%p flags=0x%04lx\n", page, page->objects, page->inuse, page->freelist, 
page->flags); } @@ -585,11 +584,9 @@ static void slab_bug(struct kmem_cache *s, char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_ERR "========================================" - "=====================================\n"); - printk(KERN_ERR "BUG %s (%s): %s\n", s->name, print_tainted(), buf); - printk(KERN_ERR "----------------------------------------" - "-------------------------------------\n\n"); + pr_err("=============================================================================\n"); + pr_err("BUG %s (%s): %s\n", s->name, print_tainted(), buf); + pr_err("-----------------------------------------------------------------------------\n\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } @@ -602,7 +599,7 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...) va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); va_end(args); - printk(KERN_ERR "FIX %s: %s\n", s->name, buf); + pr_err("FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) @@ -614,8 +611,8 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) print_page_info(page); - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", - p, p - addr, get_freepointer(s, p)); + pr_err("INFO: Object 0x%p @offset=%tu fp=0x%p\n\n", + p, p - addr, get_freepointer(s, p)); if (p > addr + 16) print_section("Bytes b4 ", p - 16, 16); @@ -698,7 +695,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page, end--; slab_bug(s, "%s overwritten", what); - printk(KERN_ERR "INFO: 0x%p-0x%p. First byte 0x%x instead of 0x%x\n", + pr_err("INFO: 0x%p-0x%p. 
First byte 0x%x instead of 0x%x\n", fault, end - 1, fault[0], value); print_trailer(s, page, object); @@ -931,7 +928,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object, int alloc) { if (s->flags & SLAB_TRACE) { - printk(KERN_INFO "TRACE %s %s 0x%p inuse=%d fp=0x%p\n", + pr_info("TRACE %s %s 0x%p inuse=%d fp=0x%p\n", s->name, alloc ? "alloc" : "free", object, page->inuse, @@ -1134,9 +1131,8 @@ static noinline struct kmem_cache_node *free_debug_processing( slab_err(s, page, "Attempt to free object(0x%p) " "outside of slab", object); } else if (!page->slab_cache) { - printk(KERN_ERR - "SLUB : no slab for object 0x%p.\n", - object); + pr_err("SLUB : no slab for object 0x%p.\n", + object); dump_stack(); } else object_err(s, page, object, @@ -1219,8 +1215,8 @@ static int __init setup_slub_debug(char *str) slub_debug |= SLAB_FAILSLAB; break; default: - printk(KERN_ERR "slub_debug option '%c' " - "unknown. skipped\n", *str); + pr_err("slub_debug option '%c' unknown. skipped\n", + *str); } } @@ -1770,19 +1766,19 @@ static inline void note_cmpxchg_failure(const char *n, #ifdef SLUB_DEBUG_CMPXCHG unsigned long actual_tid = __this_cpu_read(s->cpu_slab->tid); - printk(KERN_INFO "%s %s: cmpxchg redo ", n, s->name); + pr_info("%s %s: cmpxchg redo ", n, s->name); #ifdef CONFIG_PREEMPT if (tid_to_cpu(tid) != tid_to_cpu(actual_tid)) - printk("due to cpu change %d -> %d\n", + pr_warn("due to cpu change %d -> %d\n", tid_to_cpu(tid), tid_to_cpu(actual_tid)); else #endif if (tid_to_event(tid) != tid_to_event(actual_tid)) - printk("due to cpu running other code. Event %ld->%ld\n", + pr_warn("due to cpu running other code. 
Event %ld->%ld\n", tid_to_event(tid), tid_to_event(actual_tid)); else - printk("for unknown reason: actual=%lx was=%lx target=%lx\n", + pr_warn("for unknown reason: actual=%lx was=%lx target=%lx\n", actual_tid, tid, next_tid(tid)); #endif stat(s, CMPXCHG_DOUBLE_CPU_FAIL); @@ -2154,16 +2150,15 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { int node; - printk(KERN_WARNING - "SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); - printk(KERN_WARNING " cache: %s, object size: %d, buffer size: %d, " - "default order: %d, min order: %d\n", s->name, s->object_size, - s->size, oo_order(s->oo), oo_order(s->min)); + pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", + s->name, s->object_size, s->size, oo_order(s->oo), + oo_order(s->min)); if (oo_order(s->min) > get_order(s->object_size)) - printk(KERN_WARNING " %s debugging increased min order, use " - "slub_debug=O to disable.\n", s->name); + pr_warn(" %s debugging increased min order, use slub_debug=O to disable.\n", + s->name); for_each_online_node(node) { struct kmem_cache_node *n = get_node(s, node); @@ -2178,8 +2173,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) nr_slabs = node_nr_slabs(n); nr_objs = node_nr_objs(n); - printk(KERN_WARNING - " node %d: slabs: %ld, objs: %ld, free: %ld\n", + pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } } @@ -2894,10 +2888,8 @@ static void early_kmem_cache_node_alloc(int node) BUG_ON(!page); if (page_to_nid(page) != node) { - printk(KERN_ERR "SLUB: Unable to allocate memory from " - "node %d\n", node); - printk(KERN_ERR "SLUB: Allocating a useless per node structure " - "in order to be able to continue\n"); + pr_err("SLUB: Unable to allocate memory from node %d\n", node); + pr_err("SLUB: Allocating a useless per node structure in order to be able to continue\n"); } 
n = page->freelist; @@ -3182,8 +3174,7 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page, for_each_object(p, s, addr, page->objects) { if (!test_bit(slab_index(p, s, addr), map)) { - printk(KERN_ERR "INFO: Object 0x%p @offset=%tu\n", - p, p - addr); + pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr); print_tracking(s, p); } } @@ -3650,9 +3641,7 @@ void __init kmem_cache_init(void) register_cpu_notifier(&slab_notifier); #endif - printk(KERN_INFO - "SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d," - " CPUs=%d, Nodes=%d\n", + pr_info("SLUB: HWalign=%d, Order=%d-%d, MinObjects=%d, CPUs=%d, Nodes=%d\n", cache_line_size(), slub_min_order, slub_max_order, slub_min_objects, nr_cpu_ids, nr_node_ids); @@ -3934,8 +3923,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != n->nr_partial) - printk(KERN_ERR "SLUB %s: %ld partial slabs counted but " - "counter=%ld\n", s->name, count, n->nr_partial); + pr_err("SLUB %s: %ld partial slabs counted but counter=%ld\n", + s->name, count, n->nr_partial); if (!(s->flags & SLAB_STORE_USER)) goto out; @@ -3945,9 +3934,8 @@ static int validate_slab_node(struct kmem_cache *s, count++; } if (count != atomic_long_read(&n->nr_slabs)) - printk(KERN_ERR "SLUB: %s %ld slabs counted but " - "counter=%ld\n", s->name, count, - atomic_long_read(&n->nr_slabs)); + pr_err("SLUB: %s %ld slabs counted but counter=%ld\n", + s->name, count, atomic_long_read(&n->nr_slabs)); out: spin_unlock_irqrestore(&n->list_lock, flags); @@ -4211,53 +4199,50 @@ static void resiliency_test(void) BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || KMALLOC_SHIFT_HIGH < 10); - printk(KERN_ERR "SLUB resiliency testing\n"); - printk(KERN_ERR "-----------------------\n"); - printk(KERN_ERR "A. Corruption after allocation\n"); + pr_err("SLUB resiliency testing\n"); + pr_err("-----------------------\n"); + pr_err("A. Corruption after allocation\n"); p = kzalloc(16, GFP_KERNEL); p[16] = 0x12; - printk(KERN_ERR "\n1. 
kmalloc-16: Clobber Redzone/next pointer" - " 0x12->0x%p\n\n", p + 16); + pr_err("\n1. kmalloc-16: Clobber Redzone/next pointer 0x12->0x%p\n\n", + p + 16); validate_slab_cache(kmalloc_caches[4]); /* Hmmm... The next two are dangerous */ p = kzalloc(32, GFP_KERNEL); p[32 + sizeof(void *)] = 0x34; - printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab" - " 0x34 -> -0x%p\n", p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n2. kmalloc-32: Clobber next pointer/next slab 0x34 -> -0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[5]); p = kzalloc(64, GFP_KERNEL); p += 64 + (get_cycles() & 0xff) * sizeof(void *); *p = 0x56; - printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", - p); - printk(KERN_ERR - "If allocated object is overwritten then not detectable\n\n"); + pr_err("\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n", + p); + pr_err("If allocated object is overwritten then not detectable\n\n"); validate_slab_cache(kmalloc_caches[6]); - printk(KERN_ERR "\nB. Corruption after free\n"); + pr_err("\nB. Corruption after free\n"); p = kzalloc(128, GFP_KERNEL); kfree(p); *p = 0x78; - printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); + pr_err("1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[7]); p = kzalloc(256, GFP_KERNEL); kfree(p); p[50] = 0x9a; - printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", - p); + pr_err("\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[8]); p = kzalloc(512, GFP_KERNEL); kfree(p); p[512] = 0xab; - printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); + pr_err("\n3. 
kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p); validate_slab_cache(kmalloc_caches[9]); } #else @@ -5303,7 +5288,7 @@ static int __init slab_sysfs_init(void) slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj); if (!slab_kset) { mutex_unlock(&slab_mutex); - printk(KERN_ERR "Cannot register slab subsystem.\n"); + pr_err("Cannot register slab subsystem.\n"); return -ENOSYS; } @@ -5312,8 +5297,8 @@ static int __init slab_sysfs_init(void) list_for_each_entry(s, &slab_caches, list) { err = sysfs_slab_add(s); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab %s" - " to sysfs\n", s->name); + pr_err("SLUB: Unable to add boot slab %s to sysfs\n", + s->name); } while (alias_list) { @@ -5322,8 +5307,8 @@ static int __init slab_sysfs_init(void) alias_list = alias_list->next; err = sysfs_slab_alias(al->s, al->name); if (err) - printk(KERN_ERR "SLUB: Unable to add boot slab alias" - " %s to sysfs\n", al->name); + pr_err("SLUB: Unable to add boot slab alias %s to sysfs\n", + al->name); kfree(al); } -- cgit v0.10.2 From ecc42fbe952fa4aae88c2413e21912b1d665fb93 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:06:35 -0700 Subject: mm/slub.c: convert vnsprintf-static to va_format Inspired by Joe Perches suggestion in ntfs logging clean-up. Signed-off-by: Fabian Frederick Acked-by: Christoph Lameter Cc: Joe Perches Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index 1594b14..de99d50 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -578,28 +578,30 @@ static void print_page_info(struct page *page) static void slab_bug(struct kmem_cache *s, char *fmt, ...) 
{ + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); - va_end(args); + vaf.fmt = fmt; + vaf.va = &args; pr_err("=============================================================================\n"); - pr_err("BUG %s (%s): %s\n", s->name, print_tainted(), buf); + pr_err("BUG %s (%s): %pV\n", s->name, print_tainted(), &vaf); pr_err("-----------------------------------------------------------------------------\n\n"); add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + va_end(args); } static void slab_fix(struct kmem_cache *s, char *fmt, ...) { + struct va_format vaf; va_list args; - char buf[100]; va_start(args, fmt); - vsnprintf(buf, sizeof(buf), fmt, args); + vaf.fmt = fmt; + vaf.va = &args; + pr_err("FIX %s: %pV\n", s->name, &vaf); va_end(args); - pr_err("FIX %s: %s\n", s->name, buf); } static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) -- cgit v0.10.2 From 9a02d699935c9acdfefe431bbc33771d1d87da7f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:06:36 -0700 Subject: mm, slab: suppress out of memory warning unless debug is enabled When the slab or slub allocators cannot allocate additional slab pages, they emit diagnostic information to the kernel log such as current number of slabs, number of objects, active objects, etc. This is always coupled with a page allocation failure warning since it is controlled by !__GFP_NOWARN. Suppress this out of memory warning if the allocator is configured without debug supported. The page allocation failure warning will indicate it is a failed slab allocation, the order, and the gfp mask, so this is only useful to diagnose allocator issues. Since CONFIG_SLUB_DEBUG is already enabled by default for the slub allocator, there is no functional change with this patch. If debug is disabled, however, the warnings are now suppressed. 
Signed-off-by: David Rientjes Cc: Pekka Enberg Acked-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 19d9218..5c846d2 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1621,10 +1621,16 @@ __initcall(cpucache_init); static noinline void slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) { +#if DEBUG struct kmem_cache_node *n; struct page *page; unsigned long flags; int node; + static DEFINE_RATELIMIT_STATE(slab_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slab_oom_rs)) + return; printk(KERN_WARNING "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", @@ -1662,6 +1668,7 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) node, active_slabs, num_slabs, active_objs, num_objs, free_objects); } +#endif } /* @@ -1683,8 +1690,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { - if (!(flags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(cachep, flags, nodeid); + slab_out_of_memory(cachep, flags, nodeid); return NULL; } diff --git a/mm/slub.c b/mm/slub.c index de99d50..65a0a5c 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2119,11 +2119,19 @@ static inline int node_match(struct page *page, int node) return 1; } +#ifdef CONFIG_SLUB_DEBUG static int count_free(struct page *page) { return page->objects - page->inuse; } +static inline unsigned long node_nr_objs(struct kmem_cache_node *n) +{ + return atomic_long_read(&n->total_objects); +} +#endif /* CONFIG_SLUB_DEBUG */ + +#if defined(CONFIG_SLUB_DEBUG) || defined(CONFIG_SYSFS) static unsigned long count_partial(struct kmem_cache_node *n, int (*get_count)(struct page *)) { @@ -2137,21 +2145,19 @@ static unsigned long count_partial(struct kmem_cache_node *n, spin_unlock_irqrestore(&n->list_lock, flags); return x; } 
- -static inline unsigned long node_nr_objs(struct kmem_cache_node *n) -{ -#ifdef CONFIG_SLUB_DEBUG - return atomic_long_read(&n->total_objects); -#else - return 0; -#endif -} +#endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ static noinline void slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) { +#ifdef CONFIG_SLUB_DEBUG + static DEFINE_RATELIMIT_STATE(slub_oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); int node; + if ((gfpflags & __GFP_NOWARN) || !__ratelimit(&slub_oom_rs)) + return; + pr_warn("SLUB: Unable to allocate memory on node %d (gfp=0x%x)\n", nid, gfpflags); pr_warn(" cache: %s, object size: %d, buffer size: %d, default order: %d, min order: %d\n", @@ -2178,6 +2184,7 @@ slab_out_of_memory(struct kmem_cache *s, gfp_t gfpflags, int nid) pr_warn(" node %d: slabs: %ld, objs: %ld, free: %ld\n", node, nr_slabs, nr_objs, nr_free); } +#endif } static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, @@ -2356,9 +2363,7 @@ new_slab: freelist = new_slab_objects(s, gfpflags, node, &c); if (unlikely(!freelist)) { - if (!(gfpflags & __GFP_NOWARN) && printk_ratelimit()) - slab_out_of_memory(s, gfpflags, node); - + slab_out_of_memory(s, gfpflags, node); local_irq_restore(flags); return NULL; } -- cgit v0.10.2 From 8eae1492675d0ffc12189f8db573624413232e15 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:06:37 -0700 Subject: mm: slub: fix ALLOC_SLOWPATH stat There used to be only one path out of __slab_alloc(), and ALLOC_SLOWPATH got bumped in that exit path. Now there are two, and a bunch of gotos. ALLOC_SLOWPATH can now get set more than once during a single call to __slab_alloc() which is pretty bogus. Here's the sequence: 1. Enter __slab_alloc(), fall through all the way to the stat(s, ALLOC_SLOWPATH); 2. hit 'if (!freelist)', and bump DEACTIVATE_BYPASS, jump to new_slab (goto #1) 3. Hit 'if (c->partial)', bump CPU_PARTIAL_ALLOC, goto redo (goto #2) 4. 
Fall through in the same path we did before all the way to stat(s, ALLOC_SLOWPATH) 5. bump ALLOC_REFILL stat, then return Doing this is obviously bogus. It keeps us from being able to accurately compare ALLOC_SLOWPATH vs. ALLOC_FASTPATH. It also means that the total number of allocs always exceeds the total number of frees. This patch moves stat(s, ALLOC_SLOWPATH) to be called from the same place that __slab_alloc() is. This makes it much less likely that ALLOC_SLOWPATH will get botched again in the spaghetti-code inside __slab_alloc(). Signed-off-by: Dave Hansen Acked-by: Christoph Lameter Acked-by: David Rientjes Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slub.c b/mm/slub.c index 65a0a5c..d05a548 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2326,8 +2326,6 @@ redo: if (freelist) goto load_freelist; - stat(s, ALLOC_SLOWPATH); - freelist = get_freelist(s, page); if (!freelist) { @@ -2432,10 +2430,10 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) + if (unlikely(!object || !node_match(page, node))) { object = __slab_alloc(s, gfpflags, node, addr, c); - - else { + stat(s, ALLOC_SLOWPATH); + } else { void *next_object = get_freepointer_safe(s, object); /* -- cgit v0.10.2 From 5dfb417509921eb90ee123a4d1525e8916b4ace4 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:38 -0700 Subject: sl[au]b: charge slabs to kmemcg explicitly We have only a few places where we actually want to charge kmem so instead of intruding into the general page allocation path with __GFP_KMEMCG it's better to explicitly charge kmem there. All kmem charges will be easier to follow that way. This is a step towards removing __GFP_KMEMCG. It removes __GFP_KMEMCG from memcg caches' allocflags. Instead it makes slab allocation path call memcg_charge_kmem directly getting memcg to charge from the cache's memcg params. 
This also eliminates any possibility of misaccounting an allocation going from one memcg's cache to another memcg, because now we always charge slabs against the memcg the cache belongs to. That's why this patch removes the big comment to memcg_kmem_get_cache. Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index b569b8b..96e5d25 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -506,6 +506,9 @@ void memcg_update_array_size(int num_groups); struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size); +void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size); + void mem_cgroup_destroy_cache(struct kmem_cache *cachep); int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); @@ -583,17 +586,7 @@ memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order) * @cachep: the original global kmem cache * @gfp: allocation flags. * - * This function assumes that the task allocating, which determines the memcg - * in the page allocator, belongs to the same cgroup throughout the whole - * process. Misacounting can happen if the task calls memcg_kmem_get_cache() - * while belonging to a cgroup, and later on changes. This is considered - * acceptable, and should only happen upon task migration. - * - * Before the cache is created by the memcg core, there is also a possible - * imbalance: the task belongs to a memcg, but the cache being allocated from - * is the global cache, since the child cache is not yet guaranteed to be - * ready. This case is also fine, since in this case the GFP_KMEMCG will not be - * passed and the page allocator will not attempt any cgroup accounting. 
+ * All memory allocated from a per-memcg cache is charged to the owner memcg. */ static __always_inline struct kmem_cache * memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5177c6d..56a768b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2953,7 +2953,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) } #endif -static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; int ret = 0; @@ -2991,7 +2991,7 @@ static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) return ret; } -static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { res_counter_uncharge(&memcg->res, size); if (do_swap_account) diff --git a/mm/slab.c b/mm/slab.c index 5c846d2..944ac58 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1688,8 +1688,12 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, if (cachep->flags & SLAB_RECLAIM_ACCOUNT) flags |= __GFP_RECLAIMABLE; + if (memcg_charge_slab(cachep, flags, cachep->gfporder)) + return NULL; + page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); if (!page) { + memcg_uncharge_slab(cachep, cachep->gfporder); slab_out_of_memory(cachep, flags, nodeid); return NULL; } @@ -1747,7 +1751,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; - __free_memcg_kmem_pages(page, cachep->gfporder); + __free_pages(page, cachep->gfporder); + memcg_uncharge_slab(cachep, cachep->gfporder); } static void kmem_rcu_free(struct rcu_head *head) diff --git a/mm/slab.h b/mm/slab.h index 6bd4c35..863e67b 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -192,6 +192,26 @@ static inline struct 
kmem_cache *memcg_root_cache(struct kmem_cache *s) return s; return s->memcg_params->root_cache; } + +static __always_inline int memcg_charge_slab(struct kmem_cache *s, + gfp_t gfp, int order) +{ + if (!memcg_kmem_enabled()) + return 0; + if (is_root_cache(s)) + return 0; + return memcg_charge_kmem(s->memcg_params->memcg, gfp, + PAGE_SIZE << order); +} + +static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ + if (!memcg_kmem_enabled()) + return; + if (is_root_cache(s)) + return; + memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order); +} #else static inline bool is_root_cache(struct kmem_cache *s) { @@ -227,6 +247,15 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s) { return s; } + +static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order) +{ + return 0; +} + +static inline void memcg_uncharge_slab(struct kmem_cache *s, int order) +{ +} #endif static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) diff --git a/mm/slab_common.c b/mm/slab_common.c index 102cc6f..06f0c61 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -290,12 +290,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); - if (IS_ERR(s)) { + if (IS_ERR(s)) kfree(cache_name); - goto out_unlock; - } - - s->allocflags |= __GFP_KMEMCG; out_unlock: mutex_unlock(&slab_mutex); diff --git a/mm/slub.c b/mm/slub.c index d05a548..fc98318 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1312,17 +1312,26 @@ static inline void slab_free_hook(struct kmem_cache *s, void *x) /* * Slab allocation and freeing */ -static inline struct page *alloc_slab_page(gfp_t flags, int node, - struct kmem_cache_order_objects oo) +static inline struct page *alloc_slab_page(struct kmem_cache *s, + gfp_t flags, int node, struct kmem_cache_order_objects oo) { + struct page *page; int order = oo_order(oo); 
flags |= __GFP_NOTRACK; + if (memcg_charge_slab(s, flags, order)) + return NULL; + if (node == NUMA_NO_NODE) - return alloc_pages(flags, order); + page = alloc_pages(flags, order); else - return alloc_pages_exact_node(node, flags, order); + page = alloc_pages_exact_node(node, flags, order); + + if (!page) + memcg_uncharge_slab(s, order); + + return page; } static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) @@ -1344,7 +1353,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) */ alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL; - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (unlikely(!page)) { oo = s->min; alloc_gfp = flags; @@ -1352,7 +1361,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) * Allocation may have failed due to fragmentation. * Try a lower order alloc if possible */ - page = alloc_slab_page(alloc_gfp, node, oo); + page = alloc_slab_page(s, alloc_gfp, node, oo); if (page) stat(s, ORDER_FALLBACK); @@ -1468,7 +1477,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page) page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; - __free_memcg_kmem_pages(page, order); + __free_pages(page, order); + memcg_uncharge_slab(s, order); } #define need_reserve_slab_rcu \ -- cgit v0.10.2 From 52383431b37cdbec63944e953ffc2698a7ad9722 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:06:39 -0700 Subject: mm: get rid of __GFP_KMEMCG Currently to allocate a page that should be charged to kmemcg (e.g. threadinfo), we pass __GFP_KMEMCG flag to the page allocator. The page allocated is then to be freed by free_memcg_kmem_pages. Apart from looking asymmetrical, this also requires intrusion to the general allocation path. So let's introduce separate functions that will alloc/free pages charged to kmemcg. 
The new functions are called alloc_kmem_pages and free_kmem_pages. They should be used when the caller actually would like to use kmalloc, but has to fall back to the page allocator for the allocation is large. They only differ from alloc_pages and free_pages in that besides allocating or freeing pages they also charge them to the kmem resource counter of the current memory cgroup. [sfr@canb.auug.org.au: export kmalloc_order() to modules] Signed-off-by: Vladimir Davydov Acked-by: Greg Thelen Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Glauber Costa Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 39b81dc..d382db7 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -31,7 +31,6 @@ struct vm_area_struct; #define ___GFP_HARDWALL 0x20000u #define ___GFP_THISNODE 0x40000u #define ___GFP_RECLAIMABLE 0x80000u -#define ___GFP_KMEMCG 0x100000u #define ___GFP_NOTRACK 0x200000u #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u @@ -91,7 +90,6 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ -#define __GFP_KMEMCG ((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */ #define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* @@ -353,6 +351,10 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, #define alloc_page_vma_node(gfp_mask, vma, addr, node) \ alloc_pages_vma(gfp_mask, 0, vma, addr, node) +extern struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order); +extern struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, + unsigned int order); + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); @@ -372,8 +374,8 @@ 
extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); extern void free_hot_cold_page_list(struct list_head *list, int cold); -extern void __free_memcg_kmem_pages(struct page *page, unsigned int order); -extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order); +extern void __free_kmem_pages(struct page *page, unsigned int order); +extern void free_kmem_pages(unsigned long addr, unsigned int order); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 96e5d25..5155d09 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -537,7 +537,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order) * res_counter_charge_nofail, but we hope those allocations are rare, * and won't be worth the trouble. */ - if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL)) + if (gfp & __GFP_NOFAIL) return true; if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD)) return true; diff --git a/include/linux/slab.h b/include/linux/slab.h index 307bfbe..a6aab2c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -369,16 +369,7 @@ kmem_cache_alloc_node_trace(struct kmem_cache *s, #include #endif -static __always_inline void * -kmalloc_order(size_t size, gfp_t flags, unsigned int order) -{ - void *ret; - - flags |= (__GFP_COMP | __GFP_KMEMCG); - ret = (void *) __get_free_pages(flags, order); - kmemleak_alloc(ret, size, 1, flags); - return ret; -} +extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order); #ifdef CONFIG_TRACING extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order); diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index cb0cec9..ff307b5 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -61,8 +61,6 @@ extern long 
do_no_restart_syscall(struct restart_block *parm); # define THREADINFO_GFP (GFP_KERNEL | __GFP_NOTRACK) #endif -#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG) - /* * flag set/clear/test wrappers * - pass TIF_xxxx constants to these functions diff --git a/include/trace/events/gfpflags.h b/include/trace/events/gfpflags.h index 1eddbf1..d6fd8e5 100644 --- a/include/trace/events/gfpflags.h +++ b/include/trace/events/gfpflags.h @@ -34,7 +34,6 @@ {(unsigned long)__GFP_HARDWALL, "GFP_HARDWALL"}, \ {(unsigned long)__GFP_THISNODE, "GFP_THISNODE"}, \ {(unsigned long)__GFP_RECLAIMABLE, "GFP_RECLAIMABLE"}, \ - {(unsigned long)__GFP_KMEMCG, "GFP_KMEMCG"}, \ {(unsigned long)__GFP_MOVABLE, "GFP_MOVABLE"}, \ {(unsigned long)__GFP_NOTRACK, "GFP_NOTRACK"}, \ {(unsigned long)__GFP_NO_KSWAPD, "GFP_NO_KSWAPD"}, \ diff --git a/kernel/fork.c b/kernel/fork.c index 54a8d26..59e3dcc 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -150,15 +150,15 @@ void __weak arch_release_thread_info(struct thread_info *ti) static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, int node) { - struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED, - THREAD_SIZE_ORDER); + struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, + THREAD_SIZE_ORDER); return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 56a768b..7bab1de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3540,11 +3540,12 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) /* * Disabling accounting is only relevant for some specific memcg * internal allocations. 
Therefore we would initially not have such - * check here, since direct calls to the page allocator that are marked - * with GFP_KMEMCG only happen outside memcg core. We are mostly - * concerned with cache allocations, and by having this test at - * memcg_kmem_get_cache, we are already able to relay the allocation to - * the root cache and bypass the memcg cache altogether. + * check here, since direct calls to the page allocator that are + * accounted to kmemcg (alloc_kmem_pages and friends) only happen + * outside memcg core. We are mostly concerned with cache allocations, + * and by having this test at memcg_kmem_get_cache, we are already able + * to relay the allocation to the root cache and bypass the memcg cache + * altogether. * * There is one exception, though: the SLUB allocator does not create * large order caches, but rather service large kmallocs directly from diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5dba293..7cfdcd8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2697,7 +2697,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; - struct mem_cgroup *memcg = NULL; gfp_mask &= gfp_allowed_mask; @@ -2716,13 +2715,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (unlikely(!zonelist->_zonerefs->zone)) return NULL; - /* - * Will only have any effect when __GFP_KMEMCG is set. 
This is - * verified in the (always inline) callee - */ - if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) - return NULL; - retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); @@ -2782,8 +2774,6 @@ out: if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie))) goto retry_cpuset; - memcg_kmem_commit_charge(page, memcg, order); - return page; } EXPORT_SYMBOL(__alloc_pages_nodemask); @@ -2837,27 +2827,51 @@ void free_pages(unsigned long addr, unsigned int order) EXPORT_SYMBOL(free_pages); /* - * __free_memcg_kmem_pages and free_memcg_kmem_pages will free - * pages allocated with __GFP_KMEMCG. + * alloc_kmem_pages charges newly allocated pages to the kmem resource counter + * of the current memory cgroup. * - * Those pages are accounted to a particular memcg, embedded in the - * corresponding page_cgroup. To avoid adding a hit in the allocator to search - * for that information only to find out that it is NULL for users who have no - * interest in that whatsoever, we provide these functions. - * - * The caller knows better which flags it relies on. + * It should be used when the caller would like to use kmalloc, but since the + * allocation is large, it has to fall back to the page allocator. + */ +struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages(gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order) +{ + struct page *page; + struct mem_cgroup *memcg = NULL; + + if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order)) + return NULL; + page = alloc_pages_node(nid, gfp_mask, order); + memcg_kmem_commit_charge(page, memcg, order); + return page; +} + +/* + * __free_kmem_pages and free_kmem_pages will free pages allocated with + * alloc_kmem_pages. 
*/ -void __free_memcg_kmem_pages(struct page *page, unsigned int order) +void __free_kmem_pages(struct page *page, unsigned int order) { memcg_kmem_uncharge_pages(page, order); __free_pages(page, order); } -void free_memcg_kmem_pages(unsigned long addr, unsigned int order) +void free_kmem_pages(unsigned long addr, unsigned int order) { if (addr != 0) { VM_BUG_ON(!virt_addr_valid((void *)addr)); - __free_memcg_kmem_pages(virt_to_page((void *)addr), order); + __free_kmem_pages(virt_to_page((void *)addr), order); } } diff --git a/mm/slab_common.c b/mm/slab_common.c index 06f0c61..1950c8f 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,6 +582,19 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) +{ + void *ret; + struct page *page; + + flags |= __GFP_COMP; + page = alloc_kmem_pages(flags, order); + ret = page ? page_address(page) : NULL; + kmemleak_alloc(ret, size, 1, flags); + return ret; +} +EXPORT_SYMBOL(kmalloc_order); + #ifdef CONFIG_TRACING void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order) { diff --git a/mm/slub.c b/mm/slub.c index fc98318..ddb6079 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3311,8 +3311,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node) struct page *page; void *ptr = NULL; - flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG; - page = alloc_pages_node(node, flags, get_order(size)); + flags |= __GFP_COMP | __GFP_NOTRACK; + page = alloc_kmem_pages_node(node, flags, get_order(size)); if (page) ptr = page_address(page); @@ -3381,7 +3381,7 @@ void kfree(const void *x) if (unlikely(!PageSlab(page))) { BUG_ON(!PageCompound(page)); kfree_hook(x); - __free_memcg_kmem_pages(page, compound_order(page)); + __free_kmem_pages(page, compound_order(page)); return; } slab_free(page->slab_cache, page, object, _RET_IP_); -- cgit v0.10.2 From 0bf073315cb29d2e9e68b6c5da97862a519e3320 Mon Sep 17 00:00:00 2001 
From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:06:41 -0700 Subject: mm: softdirty: make freshly remapped file pages being softdirty unconditionally Hugh reported: | I noticed your soft_dirty work in install_file_pte(): which looked | good at first, until I realized that it's propagating the soft_dirty | of a pte it's about to zap completely, to the unrelated entry it's | about to insert in its place. Which seems very odd to me. Indeed, this code ends up being a no-op -- pte_file_mksoft_dirty() takes a pte_t argument by value and returns a new pte_t, which was never used afterwards. After looking more I think what we need is to soft-dirtify all newly remapped file pages because it should look like a new mapping for the memory tracker. Signed-off-by: Cyrill Gorcunov Reported-by: Hugh Dickins Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/fremap.c b/mm/fremap.c index 34feba6..2c5646f 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -82,13 +82,10 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, ptfile = pgoff_to_pte(pgoff); - if (!pte_none(*pte)) { - if (pte_present(*pte) && pte_soft_dirty(*pte)) - pte_file_mksoft_dirty(ptfile); + if (!pte_none(*pte)) zap_pte(mm, vma, addr, pte); - } - set_pte_at(mm, addr, pte, ptfile); + set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile)); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a -- cgit v0.10.2 From b43790eedd31e9535b89bbfa45793919e9504c34 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:06:42 -0700 Subject: mm: softdirty: don't forget to save file map softdiry bit on unmap pte_file_mksoft_dirty() operates on an argument passed by value and returns the modified result, thus we need to assign @ptfile here; otherwise it is a no-op which may lead to loss of the softdirty bit. 
Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/rmap.c b/mm/rmap.c index 10aef96..7da400d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1361,7 +1361,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (page->index != linear_page_index(vma, address)) { pte_t ptfile = pgoff_to_pte(page->index); if (pte_soft_dirty(pteval)) - pte_file_mksoft_dirty(ptfile); + ptfile = pte_file_mksoft_dirty(ptfile); set_pte_at(mm, address, pte, ptfile); } -- cgit v0.10.2 From c86c97ff42cd6c6d1bd29eca4dfabeaf2b7f1020 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:06:43 -0700 Subject: mm: softdirty: clear VM_SOFTDIRTY flag inside clear_refs_write() instead of clear_soft_dirty() clear_refs_write() is called earlier than clear_soft_dirty() and it is more natural to clear VM_SOFTDIRTY (which belongs to the VMA entry, not to PTEs) that early instead of clearing it way deeper inside the call chain. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Hugh Dickins Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 442177b..2101ce4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -737,9 +737,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_file_clear_soft_dirty(ptent); } - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - set_pte_at(vma->vm_mm, addr, pte, ptent); #endif } @@ -807,8 +804,9 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, if (type == CLEAR_REFS_SOFT_DIRTY) { soft_dirty_cleared = true; - pr_warn_once("The pagemap bits 55-60 has changed their meaning! " - "See the linux/Documentation/vm/pagemap.txt for details.\n"); + pr_warn_once("The pagemap bits 55-60 has changed their meaning!" 
+ " See the linux/Documentation/vm/pagemap.txt for " + "details.\n"); } task = get_proc_task(file_inode(file)); @@ -839,11 +837,17 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, * * Writing 3 to /proc/pid/clear_refs only affects file * mapped pages. + * + * Writing 4 to /proc/pid/clear_refs affects all pages. */ if (type == CLEAR_REFS_ANON && vma->vm_file) continue; if (type == CLEAR_REFS_MAPPED && !vma->vm_file) continue; + if (type == CLEAR_REFS_SOFT_DIRTY) { + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + } walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } -- cgit v0.10.2 From 6f04f48dc9c0433e2bb687f5f7f7af1aba97b04d Mon Sep 17 00:00:00 2001 From: Suleiman Souhlal Date: Wed, 4 Jun 2014 16:06:44 -0700 Subject: mm: only force scan in reclaim when none of the LRUs are big enough. Prior to this change, we would decide whether to force scan a LRU during reclaim if that LRU itself was too small for the current priority. However, this can lead to the file LRU getting force scanned even if there are a lot of anonymous pages we can reclaim, leading to hot file pages getting needlessly reclaimed. To address this, we instead only force scan when none of the reclaimable LRUs are big enough. Gives huge improvements with zswap. For example, when doing -j20 kernel build in a 500MB container with zswap enabled, runtime (in seconds) is greatly reduced: x without this change + with this change N Min Max Median Avg Stddev x 5 700.997 790.076 763.928 754.05 39.59493 + 5 141.634 197.899 155.706 161.9 21.270224 Difference at 95.0% confidence -592.15 +/- 46.3521 -78.5293% +/- 6.14709% (Student's t, pooled s = 31.7819) Should also give some improvements in regular (non-zswap) swap cases. Yes, hughd found significant speedup using regular swap, with several memcgs under pressure; and it should also be effective in the non-memcg case, whenever one or another zone LRU is forced too small. 
Signed-off-by: Suleiman Souhlal Signed-off-by: Hugh Dickins Cc: Suleiman Souhlal Cc: Mel Gorman Acked-by: Rik van Riel Acked-by: Rafael Aquini Cc: Michal Hocko Cc: Yuanhan Liu Cc: Seth Jennings Cc: Bob Liu Cc: Minchan Kim Cc: Luigi Semenzato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 32c661d..7901cb7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1866,6 +1866,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, bool force_scan = false; unsigned long ap, fp; enum lru_list lru; + bool some_scanned; + int pass; /* * If the zone or memcg is small, nr[l] can be 0. This @@ -1989,39 +1991,49 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, fraction[1] = fp; denominator = ap + fp + 1; out: - for_each_evictable_lru(lru) { - int file = is_file_lru(lru); - unsigned long size; - unsigned long scan; + some_scanned = false; + /* Only use force_scan on second pass. */ + for (pass = 0; !some_scanned && pass < 2; pass++) { + for_each_evictable_lru(lru) { + int file = is_file_lru(lru); + unsigned long size; + unsigned long scan; - size = get_lru_size(lruvec, lru); - scan = size >> sc->priority; + size = get_lru_size(lruvec, lru); + scan = size >> sc->priority; - if (!scan && force_scan) - scan = min(size, SWAP_CLUSTER_MAX); + if (!scan && pass && force_scan) + scan = min(size, SWAP_CLUSTER_MAX); - switch (scan_balance) { - case SCAN_EQUAL: - /* Scan lists relative to size */ - break; - case SCAN_FRACT: + switch (scan_balance) { + case SCAN_EQUAL: + /* Scan lists relative to size */ + break; + case SCAN_FRACT: + /* + * Scan types proportional to swappiness and + * their relative recent reclaim efficiency. 
+ */ + scan = div64_u64(scan * fraction[file], + denominator); + break; + case SCAN_FILE: + case SCAN_ANON: + /* Scan one type exclusively */ + if ((scan_balance == SCAN_FILE) != file) + scan = 0; + break; + default: + /* Look ma, no brain */ + BUG(); + } + nr[lru] = scan; /* - * Scan types proportional to swappiness and - * their relative recent reclaim efficiency. + * Skip the second pass and don't force_scan, + * if we found something to scan. */ - scan = div64_u64(scan * fraction[file], denominator); - break; - case SCAN_FILE: - case SCAN_ANON: - /* Scan one type exclusively */ - if ((scan_balance == SCAN_FILE) != file) - scan = 0; - break; - default: - /* Look ma, no brain */ - BUG(); + some_scanned |= !!scan; } - nr[lru] = scan; } } -- cgit v0.10.2 From 4f115147ff802267d0aa41e361c5aa5bd933d896 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 4 Jun 2014 16:06:46 -0700 Subject: mm,vmacache: add debug data Introduce a CONFIG_DEBUG_VM_VMACACHE option to enable counting the cache hit rate -- exported in /proc/vmstat. Any updates to the caching scheme needs this kind of data, thus it can save some work re-implementing the counting all the time. 
Signed-off-by: Davidlohr Bueso Cc: Aswin Chandramouleeswaran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 486c397..ced9234 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -80,6 +80,10 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, #endif /* CONFIG_DEBUG_TLBFLUSH */ +#ifdef CONFIG_DEBUG_VM_VMACACHE + VMACACHE_FIND_CALLS, + VMACACHE_FIND_HITS, +#endif NR_VM_EVENT_ITEMS }; diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 45c9cd1..82e7db7 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -95,6 +95,12 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif +#ifdef CONFIG_DEBUG_VM_VMACACHE +#define count_vm_vmacache_event(x) count_vm_event(x) +#else +#define count_vm_vmacache_event(x) do {} while (0) +#endif + #define __count_zone_vm_events(item, zone, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + \ zone_idx(zone), delta) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 99c8bfe..c2de650 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -501,6 +501,16 @@ config DEBUG_VM If unsure, say N. +config DEBUG_VM_VMACACHE + bool "Debug VMA caching" + depends on DEBUG_VM + help + Enable this to turn on VMA caching debug information. Doing so + can cause significant overhead, so only enable it in non-production + environments. + + If unsure, say N. 
+ config DEBUG_VM_RB bool "Debug VM red-black trees" depends on DEBUG_VM diff --git a/mm/vmacache.c b/mm/vmacache.c index 1037a3ba..658ed3b 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -78,6 +78,8 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) if (!vmacache_valid(mm)) return NULL; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; @@ -85,8 +87,10 @@ struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) continue; if (WARN_ON_ONCE(vma->vm_mm != mm)) break; - if (vma->vm_start <= addr && vma->vm_end > addr) + if (vma->vm_start <= addr && vma->vm_end > addr) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; + } } return NULL; @@ -102,11 +106,15 @@ struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, if (!vmacache_valid(mm)) return NULL; + count_vm_vmacache_event(VMACACHE_FIND_CALLS); + for (i = 0; i < VMACACHE_SIZE; i++) { struct vm_area_struct *vma = current->vmacache[i]; - if (vma && vma->vm_start == start && vma->vm_end == end) + if (vma && vma->vm_start == start && vma->vm_end == end) { + count_vm_vmacache_event(VMACACHE_FIND_HITS); return vma; + } } return NULL; diff --git a/mm/vmstat.c b/mm/vmstat.c index 302dd07..82ce17c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -866,6 +866,10 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ +#ifdef CONFIG_DEBUG_VM_VMACACHE + "vmacache_find_calls", + "vmacache_find_hits", +#endif #endif /* CONFIG_VM_EVENTS_COUNTERS */ }; #endif /* CONFIG_PROC_FS || CONFIG_SYSFS || CONFIG_NUMA */ -- cgit v0.10.2 From 6b4ebc3a9078c5b7b8c4cf495a0b1d2d0e0bfe7a Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 4 Jun 2014 16:06:47 -0700 Subject: mm,vmacache: optimize overflow system-wide flushing For single threaded workloads, we can avoid flushing and iterating through the entire list of tasks, making the whole 
function a lot faster, requiring only a single atomic read for the mm_users. Signed-off-by: Davidlohr Bueso Suggested-by: Oleg Nesterov Cc: Aswin Chandramouleeswaran Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmacache.c b/mm/vmacache.c index 658ed3b..9f25af8 100644 --- a/mm/vmacache.c +++ b/mm/vmacache.c @@ -17,6 +17,16 @@ void vmacache_flush_all(struct mm_struct *mm) { struct task_struct *g, *p; + /* + * Single threaded tasks need not iterate the entire + * list of process. We can avoid the flushing as well + * since the mm's seqnum was increased and don't have + * to worry about other threads' seqnum. Current's + * flush will occur upon the next lookup. + */ + if (atomic_read(&mm->mm_users) == 1) + return; + rcu_read_lock(); for_each_process_thread(g, p) { /* -- cgit v0.10.2 From d92ef66c4f8fdf7a24736b1ab6c48d32de9bfc07 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:48 -0700 Subject: x86: make dma_alloc_coherent() return zeroed memory if CMA is enabled This patchset enhances the DMA Contiguous Memory Allocator on x86. Currently the DMA CMA is only supported with pci-nommu dma_map_ops and furthermore it can't be enabled on x86_64. But I would like to allocate big contiguous memory with dma_alloc_coherent() and tell it to the device that requires it, regardless of which dma mapping implementation is actually used in the system. So this makes it work with swiotlb and intel-iommu dma_map_ops, too. And this also extends "cma=" kernel parameter to specify placement constraint by the physical address range of memory allocations. For example, CMA allocates memory below 4GB by "cma=64M@0-4G", it is required for the devices only supporting 32-bit addressing on 64-bit systems without iommu. This patch (of 5): Calling dma_alloc_coherent() with __GFP_ZERO must return zeroed memory. 
But when the contiguous memory allocator (CMA) is enabled on x86 and the memory region is allocated by dma_alloc_from_contiguous(), it doesn't return zeroed memory, because dma_generic_alloc_coherent() forgot to fill the memory region with zero if it was allocated by dma_alloc_from_contiguous(). Most implementations of dma_alloc_coherent() return zeroed memory regardless of whether __GFP_ZERO is specified. So this fixes it by unconditionally zeroing the allocated memory region. Alternatively, we could fix dma_alloc_from_contiguous() to return zeroed out memory and remove memset() from all callers of it. But we can't simply remove the memset on arm because __dma_clear_buffer() is used there for ensuring cache flushing and it is used in many places. Of course we can do a redundant memset in dma_alloc_from_contiguous(), but I think this patch has less impact as a fix for this problem. Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index f7d0672..e5f4e96 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -97,7 +97,7 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, dma_mask = dma_alloc_coherent_mask(dev, flag); - flag |= __GFP_ZERO; + flag &= ~__GFP_ZERO; again: page = NULL; /* CMA can be used only in the context which permits sleeping */ @@ -120,7 +120,7 @@ again: return NULL; } - + memset(page_address(page), 0, size); *dma_addr = addr; return page_address(page); } -- cgit v0.10.2 From 9c5a3621427da68afe6a078cadf807d2c8cc1d12 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:50 -0700 Subject: x86: enable DMA CMA with swiotlb The DMA Contiguous Memory Allocator support on x86 is disabled when swiotlb config option is enabled.
So DMA CMA is always disabled on x86_64 because swiotlb is always enabled. This attempts to support DMA CMA with the swiotlb config option enabled. The contiguous memory allocator on x86 is integrated in the function dma_generic_alloc_coherent(), which is the .alloc callback in nommu_dma_ops for dma_alloc_coherent(). x86_swiotlb_alloc_coherent(), which is the .alloc callback in swiotlb_dma_ops, first tries to allocate with dma_generic_alloc_coherent() and then calls swiotlb_alloc_coherent() as a fallback. The main part of supporting DMA CMA with swiotlb is changing x86_swiotlb_free_coherent(), which is the .free callback in swiotlb_dma_ops for dma_free_coherent(), so that it can distinguish memory allocated by dma_generic_alloc_coherent() from memory allocated by swiotlb_alloc_coherent() and release the former with dma_generic_free_coherent(), which can handle contiguous memory. This change requires making is_swiotlb_buffer() a global function. The .free callback in the dma_map_ops for amd_gart and sta2x11 also needs to change, because these dma_ops also use dma_generic_alloc_coherent(). Signed-off-by: Akinobu Mita Acked-by: Marek Szyprowski Acked-by: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 896a411a..4a0137f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -41,7 +41,7 @@ config X86 select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS - select HAVE_DMA_CONTIGUOUS if !SWIOTLB + select HAVE_DMA_CONTIGUOUS select HAVE_KRETPROBES select GENERIC_EARLY_IOREMAP select HAVE_OPTPROBES diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h index 977f176..ab05d73 100644 --- a/arch/x86/include/asm/swiotlb.h +++ b/arch/x86/include/asm/swiotlb.h @@ -29,4 +29,11 @@ static inline void pci_swiotlb_late_init(void) static inline void dma_mark_clean(void *addr, size_t size) {} +extern void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, + dma_addr_t *dma_handle, gfp_t flags, + struct dma_attrs *attrs); +extern void x86_swiotlb_free_coherent(struct device *dev, size_t size, + void *vaddr, dma_addr_t dma_addr, + struct dma_attrs *attrs); + #endif /* _ASM_X86_SWIOTLB_H */ diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index b574b29..8e3842f 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -512,7 +512,7 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, struct dma_attrs *attrs) { gart_unmap_page(dev, dma_addr, size, DMA_BIDIRECTIONAL, NULL); - free_pages((unsigned long)vaddr, get_order(size)); + dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 6c483ba..77dd0ad 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -14,7 +14,7 @@ #include int swiotlb __read_mostly; -static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, +void 
*x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags, struct dma_attrs *attrs) { @@ -28,11 +28,14 @@ static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, return swiotlb_alloc_coherent(hwdev, size, dma_handle, flags); } -static void x86_swiotlb_free_coherent(struct device *dev, size_t size, +void x86_swiotlb_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_addr, struct dma_attrs *attrs) { - swiotlb_free_coherent(dev, size, vaddr, dma_addr); + if (is_swiotlb_buffer(dma_to_phys(dev, dma_addr))) + swiotlb_free_coherent(dev, size, vaddr, dma_addr); + else + dma_generic_free_coherent(dev, size, vaddr, dma_addr, attrs); } static struct dma_map_ops swiotlb_dma_ops = { diff --git a/arch/x86/pci/sta2x11-fixup.c b/arch/x86/pci/sta2x11-fixup.c index 9d8a509..5ceda85 100644 --- a/arch/x86/pci/sta2x11-fixup.c +++ b/arch/x86/pci/sta2x11-fixup.c @@ -173,9 +173,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, { void *vaddr; - vaddr = dma_generic_alloc_coherent(dev, size, dma_handle, flags, attrs); - if (!vaddr) - vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, flags); + vaddr = x86_swiotlb_alloc_coherent(dev, size, dma_handle, flags, attrs); *dma_handle = p2a(*dma_handle, to_pci_dev(dev)); return vaddr; } @@ -183,7 +181,7 @@ static void *sta2x11_swiotlb_alloc_coherent(struct device *dev, /* We have our own dma_ops: the same as swiotlb but from alloc (above) */ static struct dma_map_ops sta2x11_dma_ops = { .alloc = sta2x11_swiotlb_alloc_coherent, - .free = swiotlb_free_coherent, + .free = x86_swiotlb_free_coherent, .map_page = swiotlb_map_page, .unmap_page = swiotlb_unmap_page, .map_sg = swiotlb_map_sg_attrs, diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index a5ffd32..e7a018e 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -116,4 +116,6 @@ static inline void swiotlb_free(void) { } #endif extern void swiotlb_print_info(void); 
+extern int is_swiotlb_buffer(phys_addr_t paddr); + #endif /* __LINUX_SWIOTLB_H */ diff --git a/lib/swiotlb.c b/lib/swiotlb.c index b604b83..649d097 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -374,7 +374,7 @@ void __init swiotlb_free(void) io_tlb_nslabs = 0; } -static int is_swiotlb_buffer(phys_addr_t paddr) +int is_swiotlb_buffer(phys_addr_t paddr) { return paddr >= io_tlb_start && paddr < io_tlb_end; } -- cgit v0.10.2 From 367464362591d89b371e2a690638e9bc899d8ebb Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:51 -0700 Subject: intel-iommu: integrate DMA CMA This adds support for the DMA Contiguous Memory Allocator for intel-iommu. This change enables dma_alloc_coherent() to allocate big contiguous memory. It is achieved in the same way as nommu_dma_ops currently does, i.e. trying to allocate memory by dma_alloc_from_contiguous() and alloc_pages() is used as a fallback. Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/iommu/intel-iommu.c b/drivers/iommu/intel-iommu.c index f256ffc..6bb3277 100644 --- a/drivers/iommu/intel-iommu.c +++ b/drivers/iommu/intel-iommu.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -3193,7 +3194,7 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_handle, gfp_t flags, struct dma_attrs *attrs) { - void *vaddr; + struct page *page = NULL; int order; size = PAGE_ALIGN(size); @@ -3208,17 +3209,31 @@ static void *intel_alloc_coherent(struct device *dev, size_t size, flags |= GFP_DMA32; } - vaddr = (void *)__get_free_pages(flags, order); - if (!vaddr) + if (flags & __GFP_WAIT) { + unsigned int count = size >> PAGE_SHIFT; + + page = dma_alloc_from_contiguous(dev, count, order); + if (page && iommu_no_mapping(dev) && + page_to_phys(page) + size > dev->coherent_dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } + } + + if (!page) + page = alloc_pages(flags, order); + if (!page) return NULL; - memset(vaddr, 0, size); + memset(page_address(page), 0, size); - *dma_handle = __intel_map_single(dev, virt_to_bus(vaddr), size, + *dma_handle = __intel_map_single(dev, page_to_phys(page), size, DMA_BIDIRECTIONAL, dev->coherent_dma_mask); if (*dma_handle) - return vaddr; - free_pages((unsigned long)vaddr, order); + return page_address(page); + if (!dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT)) + __free_pages(page, order); + return NULL; } @@ -3226,12 +3241,14 @@ static void intel_free_coherent(struct device *dev, size_t size, void *vaddr, dma_addr_t dma_handle, struct dma_attrs *attrs) { int order; + struct page *page = virt_to_page(vaddr); size = PAGE_ALIGN(size); order = get_order(size); intel_unmap_page(dev, dma_handle, size, DMA_BIDIRECTIONAL, NULL); - free_pages((unsigned long)vaddr, order); + if (!dma_release_from_contiguous(dev, 
page, size >> PAGE_SHIFT)) + __free_pages(page, order); } static void intel_unmap_sg(struct device *dev, struct scatterlist *sglist, -- cgit v0.10.2 From 2bfc2862c4fe38379a2fb2cfba33fad32ccb4ff4 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:53 -0700 Subject: memblock: introduce memblock_alloc_range() This introduces memblock_alloc_range() which allocates memblock from the specified range of physical address. I would like to use this function to specify the location of CMA. Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memblock.h b/include/linux/memblock.h index 73dc382..b660e05 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -272,6 +272,8 @@ static inline bool memblock_bottom_up(void) { return false; } #define MEMBLOCK_ALLOC_ANYWHERE (~(phys_addr_t)0) #define MEMBLOCK_ALLOC_ACCESSIBLE 0 +phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, + phys_addr_t start, phys_addr_t end); phys_addr_t memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr); phys_addr_t __memblock_alloc_base(phys_addr_t size, phys_addr_t align, diff --git a/mm/memblock.c b/mm/memblock.c index a810ba9..1467364 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1033,22 +1033,35 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size, } #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, - phys_addr_t align, phys_addr_t max_addr, - int nid) +static phys_addr_t __init memblock_alloc_range_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t start, + phys_addr_t end, int nid) { phys_addr_t found; if (!align) align = SMP_CACHE_BYTES; - found = memblock_find_in_range_node(size, align, 0, 
max_addr, nid); + found = memblock_find_in_range_node(size, align, start, end, nid); if (found && !memblock_reserve(found, size)) return found; return 0; } +phys_addr_t __init memblock_alloc_range(phys_addr_t size, phys_addr_t align, + phys_addr_t start, phys_addr_t end) +{ + return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE); +} + +static phys_addr_t __init memblock_alloc_base_nid(phys_addr_t size, + phys_addr_t align, phys_addr_t max_addr, + int nid) +{ + return memblock_alloc_range_nid(size, align, 0, max_addr, nid); +} + phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid) { return memblock_alloc_base_nid(size, align, MEMBLOCK_ALLOC_ACCESSIBLE, nid); -- cgit v0.10.2 From 5ea3b1b2f8ad9162684431ce6188102ca4c64b7a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:54 -0700 Subject: cma: add placement specifier for "cma=" kernel parameter Currently, "cma=" kernel parameter is used to specify the size of CMA, but we can't specify where it is located. We want to locate CMA below 4GB for devices only supporting 32-bit addressing on 64-bit systems without iommu. This makes it possible to specify the placement of CMA by extending the "cma=" kernel parameter. Examples: 1. locate 64MB CMA below 4GB by "cma=64M@0-4G" 2. locate 64MB CMA exactly at 512MB by "cma=64M@512M" Note that the DMA contiguous memory allocator on x86 assumes that page_address() works for the pages to allocate. So this change requires limiting the end address of the contiguous memory area to max_pfn_mapped, via the argument of dma_contiguous_reserve(), to prevent it from being located in the highmem area. Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Andi Kleen Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index af55e13..adea3a2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -630,8 +630,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted. Also note the kernel might malfunction if you disable some critical bits. - cma=nn[MG] [ARM,KNL] - Sets the size of kernel global memory area for contiguous + cma=nn[MG]@[start[MG][-end[MG]]] + [ARM,X86,KNL] + Sets the size of kernel global memory area for + contiguous memory allocations and optionally the + placement constraint by the physical address range of memory allocations. For more information, see include/linux/dma-contiguous.h diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 09c76d2..78a0e62 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1119,7 +1119,7 @@ void __init setup_arch(char **cmdline_p) setup_real_mode(); memblock_set_current_limit(get_max_mapped()); - dma_contiguous_reserve(0); + dma_contiguous_reserve(max_pfn_mapped << PAGE_SHIFT); /* * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index c34ec33..83969f8 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -60,11 +60,22 @@ struct cma *dma_contiguous_default_area; */ static const phys_addr_t size_bytes = CMA_SIZE_MBYTES * SZ_1M; static phys_addr_t size_cmdline = -1; +static phys_addr_t base_cmdline; +static phys_addr_t limit_cmdline; static int __init early_cma(char *p) { pr_debug("%s(%s)\n", __func__, p); size_cmdline = memparse(p, &p); + if (*p != '@') + return 0; + base_cmdline = memparse(p + 1, &p); + if (*p != '-') { + limit_cmdline = base_cmdline + size_cmdline; + return 0; + } + limit_cmdline = memparse(p + 1, &p); + return 0; } early_param("cma", early_cma); @@ -108,11 +119,18 @@ static inline __maybe_unused phys_addr_t cma_early_percent_memory(void) void __init dma_contiguous_reserve(phys_addr_t limit) { phys_addr_t selected_size = 0; + phys_addr_t selected_base = 0; + phys_addr_t selected_limit = limit; + bool fixed = false; pr_debug("%s(limit %08lx)\n", __func__, (unsigned long)limit); if (size_cmdline != -1) { selected_size = size_cmdline; + selected_base = base_cmdline; + selected_limit = min_not_zero(limit_cmdline, limit); + if (base_cmdline + size_cmdline == limit_cmdline) + fixed = true; } else { #ifdef CONFIG_CMA_SIZE_SEL_MBYTES selected_size = size_bytes; @@ -129,10 +147,12 @@ void __init dma_contiguous_reserve(phys_addr_t limit) pr_debug("%s: reserving %ld MiB for global area\n", __func__, (unsigned long)selected_size / SZ_1M); - dma_contiguous_reserve_area(selected_size, 0, limit, - &dma_contiguous_default_area); + dma_contiguous_reserve_area(selected_size, selected_base, + selected_limit, + &dma_contiguous_default_area, + fixed); } -}; +} static DEFINE_MUTEX(cma_mutex); @@ -189,15 +209,20 @@ core_initcall(cma_init_reserved_areas); * @base: Base address of the reserved area optional, use 0 for any * @limit: End address of the reserved memory (optional, 0 for any). 
* @res_cma: Pointer to store the created cma region. + * @fixed: hint about where to place the reserved area * * This function reserves memory from early allocator. It should be * called by arch specific code once the early allocator (memblock or bootmem) * has been activated and all other subsystems have already allocated/reserved * memory. This function allows to create custom reserved areas for specific * devices. + * + * If @fixed is true, reserve contiguous area at exactly @base. If false, + * reserve in range from @base to @limit. */ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma) + phys_addr_t limit, struct cma **res_cma, + bool fixed) { struct cma *cma = &cma_areas[cma_area_count]; phys_addr_t alignment; @@ -223,18 +248,15 @@ int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, limit &= ~(alignment - 1); /* Reserve memory */ - if (base) { + if (base && fixed) { if (memblock_is_region_reserved(base, size) || memblock_reserve(base, size) < 0) { ret = -EBUSY; goto err; } } else { - /* - * Use __memblock_alloc_base() since - * memblock_alloc_base() panic()s. 
- */ - phys_addr_t addr = __memblock_alloc_base(size, alignment, limit); + phys_addr_t addr = memblock_alloc_range(size, alignment, base, + limit); if (!addr) { ret = -ENOMEM; goto err; diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 3b28f93..772eab5 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -88,7 +88,8 @@ static inline void dma_contiguous_set_default(struct cma *cma) void dma_contiguous_reserve(phys_addr_t addr_limit); int __init dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma); + phys_addr_t limit, struct cma **res_cma, + bool fixed); /** * dma_declare_contiguous() - reserve area for contiguous memory handling @@ -108,7 +109,7 @@ static inline int dma_declare_contiguous(struct device *dev, phys_addr_t size, { struct cma *cma; int ret; - ret = dma_contiguous_reserve_area(size, base, limit, &cma); + ret = dma_contiguous_reserve_area(size, base, limit, &cma, true); if (ret == 0) dev_set_cma_area(dev, cma); @@ -136,7 +137,9 @@ static inline void dma_contiguous_set_default(struct cma *cma) { } static inline void dma_contiguous_reserve(phys_addr_t limit) { } static inline int dma_contiguous_reserve_area(phys_addr_t size, phys_addr_t base, - phys_addr_t limit, struct cma **res_cma) { + phys_addr_t limit, struct cma **res_cma, + bool fixed) +{ return -ENOSYS; } -- cgit v0.10.2 From 38f7ea5a082bbde9e64b7ece389f20e71a9806f4 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Wed, 4 Jun 2014 16:06:56 -0700 Subject: arch/x86/kernel/pci-dma.c: fix dma_generic_alloc_coherent() when CONFIG_DMA_CMA is enabled dma_generic_alloc_coherent() firstly attempts to allocate by dma_alloc_from_contiguous() if CONFIG_DMA_CMA is enabled. But the memory region allocated by it may not fit within the device's DMA mask. This change makes it fall back to usual alloc_pages_node() allocation for such cases. 
Signed-off-by: Akinobu Mita Cc: Marek Szyprowski Cc: Konrad Rzeszutek Wilk Cc: David Woodhouse Cc: Don Dutile Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index e5f4e96..a25e202 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -101,8 +101,13 @@ void *dma_generic_alloc_coherent(struct device *dev, size_t size, again: page = NULL; /* CMA can be used only in the context which permits sleeping */ - if (flag & __GFP_WAIT) + if (flag & __GFP_WAIT) { page = dma_alloc_from_contiguous(dev, count, get_order(size)); + if (page && page_to_phys(page) + size > dma_mask) { + dma_release_from_contiguous(dev, page, count); + page = NULL; + } + } /* fallback */ if (!page) page = alloc_pages_node(dev_to_node(dev), flag, get_order(size)); -- cgit v0.10.2 From ff9e43eb4f2eb78067d7b783cc893773b3e129b1 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:06:57 -0700 Subject: thp: consolidate assert checks in __split_huge_page() It doesn't make sense to have two assert checks for each invariant: one for printing and one for BUG(). Let's trigger BUG() if we print error message. Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/huge_memory.c b/mm/huge_memory.c index d199d2d..2434d90 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1830,10 +1830,11 @@ static void __split_huge_page(struct page *page, * the newly established pmd of the child later during the * walk, to be able to set it as pmd_trans_splitting too. 
*/ - if (mapcount != page_mapcount(page)) + if (mapcount != page_mapcount(page)) { printk(KERN_ERR "mapcount %d page_mapcount %d\n", mapcount, page_mapcount(page)); - BUG_ON(mapcount != page_mapcount(page)); + BUG(); + } __split_huge_page_refcount(page, list); @@ -1844,10 +1845,11 @@ static void __split_huge_page(struct page *page, BUG_ON(is_vma_temporary_stack(vma)); mapcount2 += __split_huge_page_map(page, vma, addr); } - if (mapcount != mapcount2) + if (mapcount != mapcount2) { printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", mapcount, mapcount2, page_mapcount(page)); - BUG_ON(mapcount != mapcount2); + BUG(); + } } /* -- cgit v0.10.2 From ae3a8c1c235345dfeb9b4b8c9e118802e3e84533 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 16:06:58 -0700 Subject: mm/huge_memory.c: complete conversion to pr_foo() It was using a mix of pr_foo() and printk(KERN_ERR ...). Cc: Rik van Riel Cc: Mel Gorman Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2434d90..e60837d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -5,6 +5,8 @@ * the COPYING file in the top-level directory. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -151,8 +153,7 @@ static int start_khugepaged(void) khugepaged_thread = kthread_run(khugepaged, NULL, "khugepaged"); if (unlikely(IS_ERR(khugepaged_thread))) { - printk(KERN_ERR - "khugepaged: kthread_run(khugepaged) failed\n"); + pr_err("khugepaged: kthread_run(khugepaged) failed\n"); err = PTR_ERR(khugepaged_thread); khugepaged_thread = NULL; } @@ -584,19 +585,19 @@ static int __init hugepage_init_sysfs(struct kobject **hugepage_kobj) *hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!*hugepage_kobj)) { - printk(KERN_ERR "hugepage: failed to create transparent hugepage kobject\n"); + pr_err("failed to create transparent hugepage kobject\n"); return -ENOMEM; } err = sysfs_create_group(*hugepage_kobj, &hugepage_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); + pr_err("failed to register transparent hugepage group\n"); goto delete_obj; } err = sysfs_create_group(*hugepage_kobj, &khugepaged_attr_group); if (err) { - printk(KERN_ERR "hugepage: failed to register transparent hugepage group\n"); + pr_err("failed to register transparent hugepage group\n"); goto remove_hp_group; } @@ -689,8 +690,7 @@ static int __init setup_transparent_hugepage(char *str) } out: if (!ret) - printk(KERN_WARNING - "transparent_hugepage= cannot parse, ignored\n"); + pr_warn("transparent_hugepage= cannot parse, ignored\n"); return ret; } __setup("transparent_hugepage=", setup_transparent_hugepage); @@ -1831,8 +1831,8 @@ static void __split_huge_page(struct page *page, * walk, to be able to set it as pmd_trans_splitting too. 
*/ if (mapcount != page_mapcount(page)) { - printk(KERN_ERR "mapcount %d page_mapcount %d\n", - mapcount, page_mapcount(page)); + pr_err("mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); BUG(); } @@ -1846,8 +1846,8 @@ static void __split_huge_page(struct page *page, mapcount2 += __split_huge_page_map(page, vma, addr); } if (mapcount != mapcount2) { - printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", - mapcount, mapcount2, page_mapcount(page)); + pr_err("mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); BUG(); } } -- cgit v0.10.2 From 02a8efeda894d3541c7143ed818b25b299504190 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 16:06:59 -0700 Subject: include/linux/mmdebug.h: add VM_WARN_ON() and VM_WARN_ON_ONCE() WARN_ON() and WARN_ON_ONCE(), dependent on CONFIG_DEBUG_VM Cc: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index 2d57efa..a3499d7 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -11,9 +11,13 @@ extern void dump_page_badflags(struct page *page, const char *reason, #define VM_BUG_ON(cond) BUG_ON(cond) #define VM_BUG_ON_PAGE(cond, page) \ do { if (unlikely(cond)) { dump_page(page, NULL); BUG(); } } while (0) +#define VM_WARN_ON(cond) WARN_ON(cond) +#define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) #else #define VM_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) #define VM_BUG_ON_PAGE(cond, page) VM_BUG_ON(cond) +#define VM_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VM_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) #endif #ifdef CONFIG_DEBUG_VIRTUAL -- cgit v0.10.2 From 8bf8fcb07653fbaea74f96bba1e4ed0f851675ab Mon Sep 17 00:00:00 2001 From: Sebastian Ott Date: Wed, 4 Jun 2014 16:07:00 -0700 Subject: mm/mempool: warn about __GFP_ZERO usage Memory obtained via mempool_alloc is not always zeroed even when called with __GFP_ZERO. 
Add a note and VM_BUG_ON statement to make that clear. [akpm@linux-foundation.org: use VM_WARN_ON_ONCE] Signed-off-by: Sebastian Ott Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempool.c b/mm/mempool.c index 905434f..455d468 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -192,6 +192,7 @@ EXPORT_SYMBOL(mempool_resize); * returns NULL. Note that due to preallocation, this function * *never* fails when called from process contexts. (it might * fail if called from an IRQ context.) + * Note: using __GFP_ZERO is not supported. */ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) { @@ -200,6 +201,7 @@ void * mempool_alloc(mempool_t *pool, gfp_t gfp_mask) wait_queue_t wait; gfp_t gfp_temp; + VM_WARN_ON_ONCE(gfp_mask & __GFP_ZERO); might_sleep_if(gfp_mask & __GFP_WAIT); gfp_mask |= __GFP_NOMEMALLOC; /* don't allocate emergency reserves */ -- cgit v0.10.2 From 3dae7fec5e884a4e72e5416db0894de66f586201 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 4 Jun 2014 16:07:01 -0700 Subject: mm: memcontrol: remove hierarchy restrictions for swappiness and oom_control Per-memcg swappiness and oom killing can currently not be tweaked on a memcg that is part of a hierarchy, but not the root of that hierarchy. Users have complained that they can't configure this when they turned on hierarchy mode. In fact, with hierarchy mode becoming the default, this restriction disables the tunables entirely. But there is no good reason for this restriction. The settings for swappiness and OOM killing are taken from whatever memcg whose limit triggered reclaim and OOM invocation, regardless of its position in the hierarchy tree. Allow setting swappiness on any group. The knob on the root memcg already reads the global VM swappiness, make it writable as well. Allow disabling the OOM killer on any non-root memcg. 
Signed-off-by: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 2622115..1829c65 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -535,17 +535,15 @@ Note: 5.3 swappiness -Similar to /proc/sys/vm/swappiness, but affecting a hierarchy of groups only. +Similar to /proc/sys/vm/swappiness, but only affecting reclaim that is +triggered by this cgroup's hard limit. The tunable in the root cgroup +corresponds to the global swappiness setting. + Please note that unlike the global swappiness, memcg knob set to 0 really prevents from any swapping even if there is a swap storage available. This might lead to memcg OOM killer if there are no file pages to reclaim. -Following cgroups' swappiness can't be changed. -- root cgroup (uses /proc/sys/vm/swappiness). -- a cgroup which uses hierarchy and it has other cgroup(s) below it. -- a cgroup which uses hierarchy and not the root of hierarchy. - 5.4 failcnt A memory cgroup provides memory.failcnt and memory.memsw.failcnt files. @@ -754,7 +752,6 @@ You can disable the OOM-killer by writing "1" to memory.oom_control file, as: #echo 1 > memory.oom_control -This operation is only allowed to the top cgroup of a sub-hierarchy. If OOM-killer is disabled, tasks under cgroup will hang/sleep in memory cgroup's OOM-waitqueue when they request accountable memory. 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7bab1de..20f47d9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5444,22 +5444,14 @@ static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); - if (val > 100 || !parent) + if (val > 100) return -EINVAL; - mutex_lock(&memcg_create_mutex); - - /* If under hierarchy, only empty-root can set this value */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } - - memcg->swappiness = val; - - mutex_unlock(&memcg_create_mutex); + if (css_parent(css)) + memcg->swappiness = val; + else + vm_swappiness = val; return 0; } @@ -5791,22 +5783,15 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, struct cftype *cft, u64 val) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); /* cannot set to root cgroup and only 0 and 1 are allowed */ - if (!parent || !((val == 0) || (val == 1))) + if (!css_parent(css) || !((val == 0) || (val == 1))) return -EINVAL; - mutex_lock(&memcg_create_mutex); - /* oom-kill-disable is a flag for subhierarchy. */ - if ((parent->use_hierarchy) || memcg_has_children(memcg)) { - mutex_unlock(&memcg_create_mutex); - return -EINVAL; - } memcg->oom_kill_disable = val; if (!val) memcg_oom_recover(memcg); - mutex_unlock(&memcg_create_mutex); + return 0; } -- cgit v0.10.2 From e4f674229ce63dac60be0c4ddfb5ef8d1225d30d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:07:02 -0700 Subject: mm: pass VM_BUG_ON() reason to dump_page() I recently added a patch to let folks pass a "reason" string dump_page() which gets dumped out along with the page's data. This essentially saves the bug-reader a trip in to the source to figure out why we BUG_ON()'d. 
The new VM_BUG_ON_PAGE() passes in NULL for "reason". It seems like we might as well pass the BUG_ON() condition if we have it. This will bloat kernels a bit with ~160 new strings, but this is all under a debugging option anyway. page:ffffea0008560280 count:1 mapcount:0 mapping:(null) index:0x0 page flags: 0xbfffc0000000001(locked) page dumped because: VM_BUG_ON_PAGE(PageLocked(page)) ------------[ cut here ]------------ kernel BUG at /home/davehans/linux.git/mm/filemap.c:464! invalid opcode: 0000 [#1] SMP CPU: 0 PID: 1 Comm: swapper/0 Not tainted 3.14.0+ #251 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 ... [akpm@linux-foundation.org: include stringify.h] Signed-off-by: Dave Hansen Acked-by: Kirill A. Shutemov Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmdebug.h b/include/linux/mmdebug.h index a3499d7..edd82a1 100644 --- a/include/linux/mmdebug.h +++ b/include/linux/mmdebug.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_DEBUG_H #define LINUX_MM_DEBUG_H 1 +#include + struct page; extern void dump_page(struct page *page, const char *reason); @@ -9,8 +11,13 @@ extern void dump_page_badflags(struct page *page, const char *reason, #ifdef CONFIG_DEBUG_VM #define VM_BUG_ON(cond) BUG_ON(cond) -#define VM_BUG_ON_PAGE(cond, page) \ - do { if (unlikely(cond)) { dump_page(page, NULL); BUG(); } } while (0) +#define VM_BUG_ON_PAGE(cond, page) \ + do { \ + if (unlikely(cond)) { \ + dump_page(page, "VM_BUG_ON_PAGE(" __stringify(cond)")");\ + BUG(); \ + } \ + } while (0) #define VM_WARN_ON(cond) WARN_ON(cond) #define VM_WARN_ON_ONCE(cond) WARN_ON_ONCE(cond) #else -- cgit v0.10.2 From 56a3c655a3d31cb1afef25b530b5ef6a1e7ddefd Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Wed, 4 Jun 2014 16:07:03 -0700 Subject: memory-hotplug: update documentation to hide information about SECTIONS and remove end_phys_index Seems we all agree that information about SECTION, e.g. 
section size, sections per memory block should be kept as kernel internals, and not exposed to userspace. This patch updates Documentation/memory-hotplug.txt to refer to memory blocks instead of memory sections where appropriate and added a paragraph to explain that memory blocks are made of memory sections. The documentation update is mostly provided by Nathan. Also, as end_phys_index in code is actually not the end section id, but the end memory block id, which should always be the same as phys_index. So it is removed here. Signed-off-by: Li Zhong Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt index 58340d5..f304edb 100644 --- a/Documentation/memory-hotplug.txt +++ b/Documentation/memory-hotplug.txt @@ -88,16 +88,21 @@ phase by hand. 1.3. Unit of Memory online/offline operation ------------ -Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory -into chunks of the same size. The chunk is called a "section". The size of -a section is architecture dependent. For example, power uses 16MiB, ia64 uses -1GiB. The unit of online/offline operation is "one section". (see Section 3.) +Memory hotplug uses SPARSEMEM memory model which allows memory to be divided +into chunks of the same size. These chunks are called "sections". The size of +a memory section is architecture dependent. For example, power uses 16MiB, ia64 +uses 1GiB. -To determine the size of sections, please read this file: +Memory sections are combined into chunks referred to as "memory blocks". The +size of a memory block is architecture dependent and represents the logical +unit upon which memory online/offline operations are to be performed. The +default size of a memory block is the same as memory section size unless an +architecture specifies otherwise. (see Section 3.) 
+ +To determine the size (in bytes) of a memory block please read this file: /sys/devices/system/memory/block_size_bytes -This file shows the size of sections in byte. ----------------------- 2. Kernel Configuration @@ -123,42 +128,35 @@ config options. (CONFIG_ACPI_CONTAINER). This option can be kernel module too. + -------------------------------- -4 sysfs files for memory hotplug +3 sysfs files for memory hotplug -------------------------------- -All sections have their device information in sysfs. Each section is part of -a memory block under /sys/devices/system/memory as +All memory blocks have their device information in sysfs. Each memory block +is described under /sys/devices/system/memory as /sys/devices/system/memory/memoryXXX -(XXX is the section id.) +(XXX is the memory block id.) -Now, XXX is defined as (start_address_of_section / section_size) of the first -section contained in the memory block. The files 'phys_index' and -'end_phys_index' under each directory report the beginning and end section id's -for the memory block covered by the sysfs directory. It is expected that all +For the memory block covered by the sysfs directory. It is expected that all memory sections in this range are present and no memory holes exist in the range. Currently there is no way to determine if there is a memory hole, but the existence of one should not affect the hotplug capabilities of the memory block. -For example, assume 1GiB section size. A device for a memory starting at +For example, assume 1GiB memory block size. A device for a memory starting at 0x100000000 is /sys/device/system/memory/memory4 (0x100000000 / 1Gib = 4) This device covers address range [0x100000000 ... 0x140000000) -Under each section, you can see 4 or 5 files, the end_phys_index file being -a recent addition and not present on older kernels. 
+Under each memory block, you can see 4 files: -/sys/devices/system/memory/memoryXXX/start_phys_index -/sys/devices/system/memory/memoryXXX/end_phys_index +/sys/devices/system/memory/memoryXXX/phys_index /sys/devices/system/memory/memoryXXX/phys_device /sys/devices/system/memory/memoryXXX/state /sys/devices/system/memory/memoryXXX/removable -'phys_index' : read-only and contains section id of the first section - in the memory block, same as XXX. -'end_phys_index' : read-only and contains section id of the last section - in the memory block. +'phys_index' : read-only and contains memory block id, same as XXX. 'state' : read-write at read: contains online/offline state of memory. at write: user can specify "online_kernel", @@ -185,6 +183,7 @@ For example: A backlink will also be created: /sys/devices/system/memory/memory9/node0 -> ../../node/node0 + -------------------------------- 4. Physical memory hot-add phase -------------------------------- @@ -227,11 +226,10 @@ You can tell the physical address of new memory to the kernel by % echo start_address_of_new_memory > /sys/devices/system/memory/probe -Then, [start_address_of_new_memory, start_address_of_new_memory + section_size) -memory range is hot-added. In this case, hotplug script is not called (in -current implementation). You'll have to online memory by yourself. -Please see "How to online memory" in this text. - +Then, [start_address_of_new_memory, start_address_of_new_memory + +memory_block_size] memory range is hot-added. In this case, hotplug script is +not called (in current implementation). You'll have to online memory by +yourself. Please see "How to online memory" in this text. ------------------------------ @@ -240,36 +238,36 @@ Please see "How to online memory" in this text. 5.1. State of memory ------------ -To see (online/offline) state of memory section, read 'state' file. +To see (online/offline) state of a memory block, read 'state' file. 
% cat /sys/device/system/memory/memoryXXX/state -If the memory section is online, you'll read "online". -If the memory section is offline, you'll read "offline". +If the memory block is online, you'll read "online". +If the memory block is offline, you'll read "offline". 5.2. How to online memory ------------ Even if the memory is hot-added, it is not at ready-to-use state. -For using newly added memory, you have to "online" the memory section. +For using newly added memory, you have to "online" the memory block. -For onlining, you have to write "online" to the section's state file as: +For onlining, you have to write "online" to the memory block's state file as: % echo online > /sys/devices/system/memory/memoryXXX/state -This onlining will not change the ZONE type of the target memory section, -If the memory section is in ZONE_NORMAL, you can change it to ZONE_MOVABLE: +This onlining will not change the ZONE type of the target memory block, +If the memory block is in ZONE_NORMAL, you can change it to ZONE_MOVABLE: % echo online_movable > /sys/devices/system/memory/memoryXXX/state -(NOTE: current limit: this memory section must be adjacent to ZONE_MOVABLE) +(NOTE: current limit: this memory block must be adjacent to ZONE_MOVABLE) -And if the memory section is in ZONE_MOVABLE, you can change it to ZONE_NORMAL: +And if the memory block is in ZONE_MOVABLE, you can change it to ZONE_NORMAL: % echo online_kernel > /sys/devices/system/memory/memoryXXX/state -(NOTE: current limit: this memory section must be adjacent to ZONE_NORMAL) +(NOTE: current limit: this memory block must be adjacent to ZONE_NORMAL) -After this, section memoryXXX's state will be 'online' and the amount of +After this, memory block XXX's state will be 'online' and the amount of available memory will be increased. Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA). @@ -284,22 +282,22 @@ This may be changed in future. 
6.1 Memory offline and ZONE_MOVABLE ------------ Memory offlining is more complicated than memory online. Because memory offline -has to make the whole memory section be unused, memory offline can fail if -the section includes memory which cannot be freed. +has to make the whole memory block be unused, memory offline can fail if +the memory block includes memory which cannot be freed. In general, memory offline can use 2 techniques. -(1) reclaim and free all memory in the section. -(2) migrate all pages in the section. +(1) reclaim and free all memory in the memory block. +(2) migrate all pages in the memory block. In the current implementation, Linux's memory offline uses method (2), freeing -all pages in the section by page migration. But not all pages are +all pages in the memory block by page migration. But not all pages are migratable. Under current Linux, migratable pages are anonymous pages and -page caches. For offlining a section by migration, the kernel has to guarantee -that the section contains only migratable pages. +page caches. For offlining a memory block by migration, the kernel has to +guarantee that the memory block contains only migratable pages. -Now, a boot option for making a section which consists of migratable pages is -supported. By specifying "kernelcore=" or "movablecore=" boot option, you can +Now, a boot option for making a memory block which consists of migratable pages +is supported. By specifying "kernelcore=" or "movablecore=" boot option, you can create ZONE_MOVABLE...a zone which is just used for movable pages. (See also Documentation/kernel-parameters.txt) @@ -315,28 +313,27 @@ creates ZONE_MOVABLE as following. Size of memory for movable pages (for offline) is ZZZZ. -Note) Unfortunately, there is no information to show which section belongs +Note: Unfortunately, there is no information to show which memory block belongs to ZONE_MOVABLE. This is TBD. 6.2. 
How to offline memory ------------ -You can offline a section by using the same sysfs interface that was used in -memory onlining. +You can offline a memory block by using the same sysfs interface that was used +in memory onlining. % echo offline > /sys/devices/system/memory/memoryXXX/state -If offline succeeds, the state of the memory section is changed to be "offline". +If offline succeeds, the state of the memory block is changed to be "offline". If it fails, some error core (like -EBUSY) will be returned by the kernel. -Even if a section does not belong to ZONE_MOVABLE, you can try to offline it. -If it doesn't contain 'unmovable' memory, you'll get success. +Even if a memory block does not belong to ZONE_MOVABLE, you can try to offline +it. If it doesn't contain 'unmovable' memory, you'll get success. -A section under ZONE_MOVABLE is considered to be able to be offlined easily. -But under some busy state, it may return -EBUSY. Even if a memory section -cannot be offlined due to -EBUSY, you can retry offlining it and may be able to -offline it (or not). -(For example, a page is referred to by some kernel internal call and released - soon.) +A memory block under ZONE_MOVABLE is considered to be able to be offlined +easily. But under some busy state, it may return -EBUSY. Even if a memory +block cannot be offlined due to -EBUSY, you can retry offlining it and may be +able to offline it (or not). (For example, a page is referred to by some kernel +internal call and released soon.) Consideration: Memory hotplug's design direction is to make the possibility of memory offlining @@ -373,11 +370,11 @@ MEMORY_GOING_OFFLINE Generated to begin the process of offlining memory. Allocations are no longer possible from the memory but some of the memory to be offlined is still in use. The callback can be used to free memory known to a - subsystem from the indicated memory section. + subsystem from the indicated memory block. 
MEMORY_CANCEL_OFFLINE Generated if MEMORY_GOING_OFFLINE fails. Memory is available again from - the section that we attempted to offline. + the memory block that we attempted to offline. MEMORY_OFFLINE Generated after offlining memory is complete. @@ -413,8 +410,8 @@ node if necessary. -------------- - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like sysctl or new control file. - - showing memory section and physical device relationship. - - showing memory section is under ZONE_MOVABLE or not + - showing memory block and physical device relationship. + - showing memory block is under ZONE_MOVABLE or not - test and make it better memory offlining. - support HugeTLB page migration and offlining. - memmap removing at memory offline. diff --git a/drivers/base/memory.c b/drivers/base/memory.c index bece691..89f752d 100644 --- a/drivers/base/memory.c +++ b/drivers/base/memory.c @@ -118,16 +118,6 @@ static ssize_t show_mem_start_phys_index(struct device *dev, return sprintf(buf, "%08lx\n", phys_index); } -static ssize_t show_mem_end_phys_index(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct memory_block *mem = to_memory_block(dev); - unsigned long phys_index; - - phys_index = mem->end_section_nr / sections_per_block; - return sprintf(buf, "%08lx\n", phys_index); -} - /* * Show whether the section of memory is likely to be hot-removable */ @@ -384,7 +374,6 @@ static ssize_t show_phys_device(struct device *dev, } static DEVICE_ATTR(phys_index, 0444, show_mem_start_phys_index, NULL); -static DEVICE_ATTR(end_phys_index, 0444, show_mem_end_phys_index, NULL); static DEVICE_ATTR(state, 0644, show_mem_state, store_mem_state); static DEVICE_ATTR(phys_device, 0444, show_phys_device, NULL); static DEVICE_ATTR(removable, 0444, show_mem_removable, NULL); @@ -529,7 +518,6 @@ struct memory_block *find_memory_block(struct mem_section *section) static struct attribute *memory_memblk_attrs[] = { &dev_attr_phys_index.attr, - 
&dev_attr_end_phys_index.attr, &dev_attr_state.attr, &dev_attr_phys_device.attr, &dev_attr_removable.attr, -- cgit v0.10.2 From cea371f4f39ced101d27264eddb8cf8c749fdd00 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:04 -0700 Subject: slab: document kmalloc_order Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab_common.c b/mm/slab_common.c index 1950c8f..2834bc2 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -582,6 +582,11 @@ void __init create_kmalloc_caches(unsigned long flags) } #endif /* !CONFIG_SLOB */ +/* + * To avoid unnecessary overhead, we pass through large allocation requests + * directly to the page allocator. We use __GFP_COMP, because we will need to + * know the allocation order to free the pages properly in kfree. + */ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order) { void *ret; -- cgit v0.10.2 From 14bd5b458bf62f84b2639ae288fd83d1da7a9af6 Mon Sep 17 00:00:00 2001 From: Duan Jiong Date: Wed, 4 Jun 2014 16:07:05 -0700 Subject: mm/mmap.c: replace IS_ERR and PTR_ERR with PTR_ERR_OR_ZERO Fix a coccinelle error regarding usage of IS_ERR and PTR_ERR instead of PTR_ERR_OR_ZERO. 
Signed-off-by: Duan Jiong Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index b1202cf..6cdec3a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2965,9 +2965,7 @@ int install_special_mapping(struct mm_struct *mm, struct vm_area_struct *vma = _install_special_mapping(mm, addr, len, vm_flags, pages); - if (IS_ERR(vma)) - return PTR_ERR(vma); - return 0; + return PTR_ERR_OR_ZERO(vma); } static DEFINE_MUTEX(mm_all_locks_mutex); -- cgit v0.10.2 From 2906dd52831b6049e1d4d9b12f6f234bf2f64a03 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:06 -0700 Subject: hugetlb: prep_compound_gigantic_page(): drop __init marker The HugeTLB subsystem uses the buddy allocator to allocate hugepages during runtime. This means that hugepages allocation during runtime is limited to MAX_ORDER order. For archs supporting gigantic pages (that is, page sizes greater than MAX_ORDER), this in turn means that those pages can't be allocated at runtime. HugeTLB supports gigantic page allocation during boottime, via the boot allocator. To this end the kernel provides the command-line options hugepagesz= and hugepages=, which can be used to instruct the kernel to allocate N gigantic pages during boot. For example, x86_64 supports 2M and 1G hugepages, but only 2M hugepages can be allocated and freed at runtime. If one wants to allocate 1G gigantic pages, this has to be done at boot via the hugepagesz= and hugepages= command-line options. Now, gigantic page allocation at boottime has two serious problems: 1. Boottime allocation is not NUMA aware. On a NUMA machine the kernel evenly distributes boottime allocated hugepages among nodes. For example, suppose you have a four-node NUMA machine and want to allocate four 1G gigantic pages at boottime. The kernel will allocate one gigantic page per node. On the other hand, we do have users who want to be able to specify which NUMA node gigantic pages should be allocated from.
So that they can place virtual machines on a specific NUMA node. 2. Gigantic pages allocated at boottime can't be freed At this point it's important to observe that regular hugepages allocated at runtime don't have those problems. This is so because HugeTLB interface for runtime allocation in sysfs supports NUMA and runtime allocated pages can be freed just fine via the buddy allocator. This series adds support for allocating gigantic pages at runtime. It does so by allocating gigantic pages via CMA instead of the buddy allocator. Releasing gigantic pages is also supported via CMA. As this series builds on top of the existing HugeTLB interface, it makes gigantic page allocation and releasing just like regular sized hugepages. This also means that NUMA support just works. For example, to allocate two 1G gigantic pages on node 1, one can do: # echo 2 > \ /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages And, to release all gigantic pages on the same node: # echo 0 > \ /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages Please, refer to patch 5/5 for full technical details. Finally, please note that this series is a follow up for a previous series that tried to extend the command-line options set to be NUMA aware: http://marc.info/?l=linux-mm&m=139593335312191&w=2 During the discussion of that series it was agreed that having runtime allocation support for gigantic pages was a better solution. This patch (of 5): This function is going to be used by non-init code in a future commit. Signed-off-by: Luiz Capitulino Reviewed-by: Davidlohr Bueso Acked-by: Kirill A. 
Shutemov Reviewed-by: Zhang Yanfei Cc: Marcelo Tosatti Cc: Andrea Arcangeli Cc: Davidlohr Bueso Cc: David Rientjes Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Cc: Rik van Riel Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c82290b..5d54d4b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -690,8 +690,7 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } -static void __init prep_compound_gigantic_page(struct page *page, - unsigned long order) +static void prep_compound_gigantic_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; -- cgit v0.10.2 From bae7f4ae14d47008a11b4358b167cb0ae186c06a Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:08 -0700 Subject: hugetlb: add hstate_is_gigantic() Signed-off-by: Luiz Capitulino Reviewed-by: Andrea Arcangeli Reviewed-by: Naoya Horiguchi Reviewed-by: Yasuaki Ishimatsu Reviewed-by: Davidlohr Bueso Acked-by: Kirill A. 
Shutemov Reviewed-by: Zhang Yanfei Cc: David Rientjes Cc: Marcelo Tosatti Cc: Rik van Riel Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d0bad1a..35786ee 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -343,6 +343,11 @@ static inline unsigned huge_page_shift(struct hstate *h) return h->order + PAGE_SHIFT; } +static inline bool hstate_is_gigantic(struct hstate *h) +{ + return huge_page_order(h) >= MAX_ORDER; +} + static inline unsigned int pages_per_huge_page(struct hstate *h) { return 1 << h->order; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5d54d4b..a663105 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -611,7 +611,7 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; - VM_BUG_ON(h->order >= MAX_ORDER); + VM_BUG_ON(hstate_is_gigantic(h)); h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; @@ -664,7 +664,7 @@ static void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { + if (h->surplus_huge_pages_node[nid] && !hstate_is_gigantic(h)) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -768,7 +768,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return NULL; page = alloc_pages_exact_node(nid, @@ -962,7 +962,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int nid) struct page *page; unsigned int r_nid; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return NULL; /* @@ -1155,7 +1155,7 @@ static void return_unused_surplus_pages(struct hstate *h, h->resv_huge_pages -= unused_resv_pages; /* Cannot return gigantic pages currently */ - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return; nr_pages = min(unused_resv_pages, 
h->surplus_huge_pages); @@ -1355,7 +1355,7 @@ static void __init gather_bootmem_prealloc(void) * fix confusing memory reports from free(1) and another * side-effects, like CommitLimit going negative. */ - if (h->order > (MAX_ORDER - 1)) + if (hstate_is_gigantic(h)) adjust_managed_page_count(page, 1 << h->order); } } @@ -1365,7 +1365,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) unsigned long i; for (i = 0; i < h->max_huge_pages; ++i) { - if (h->order >= MAX_ORDER) { + if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; } else if (!alloc_fresh_huge_page(h, @@ -1381,7 +1381,7 @@ static void __init hugetlb_init_hstates(void) for_each_hstate(h) { /* oversize hugepages were init'ed in early boot */ - if (h->order < MAX_ORDER) + if (!hstate_is_gigantic(h)) hugetlb_hstate_alloc_pages(h); } } @@ -1415,7 +1415,7 @@ static void try_to_free_low(struct hstate *h, unsigned long count, { int i; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return; for_each_node_mask(i, *nodes_allowed) { @@ -1478,7 +1478,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, { unsigned long min_count, ret; - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return h->max_huge_pages; /* @@ -1605,7 +1605,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, goto out; h = kobj_to_hstate(kobj, &nid); - if (h->order >= MAX_ORDER) { + if (hstate_is_gigantic(h)) { err = -EINVAL; goto out; } @@ -1688,7 +1688,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); - if (h->order >= MAX_ORDER) + if (hstate_is_gigantic(h)) return -EINVAL; err = kstrtoul(buf, 10, &input); @@ -2112,7 +2112,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, tmp = h->max_huge_pages; - if (write && h->order >= MAX_ORDER) + if (write && hstate_is_gigantic(h)) return -EINVAL; table->data = &tmp; @@ -2168,7 +2168,7 @@ int 
hugetlb_overcommit_handler(struct ctl_table *table, int write, tmp = h->nr_overcommit_huge_pages; - if (write && h->order >= MAX_ORDER) + if (write && hstate_is_gigantic(h)) return -EINVAL; table->data = &tmp; -- cgit v0.10.2 From a7407a27c2bba3711d272d72d2d63ea147a929df Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:09 -0700 Subject: hugetlb: update_and_free_page(): don't clear PG_reserved bit Hugepages pages never get the PG_reserved bit set, so don't clear it. However, note that if the bit gets mistakenly set free_pages_check() will catch it. Signed-off-by: Luiz Capitulino Reviewed-by: Davidlohr Bueso Acked-by: Kirill A. Shutemov Reviewed-by: Zhang Yanfei Cc: Andrea Arcangeli Cc: David Rientjes Cc: Marcelo Tosatti Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Yasuaki Ishimatsu Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a663105..c148eb2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -618,8 +618,8 @@ static void update_and_free_page(struct hstate *h, struct page *page) for (i = 0; i < pages_per_huge_page(h); i++) { page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | - 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1 << PG_writeback); + 1 << PG_active | 1 << PG_private | + 1 << PG_writeback); } VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); set_compound_page_dtor(page, NULL); -- cgit v0.10.2 From 1cac6f2c072abe2510f56fec6729a892aa827f62 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:11 -0700 Subject: hugetlb: move helpers up in the file Next commit will add new code which will want to call for_each_node_mask_to_alloc() macro. Move it, its buddy for_each_node_mask_to_free() and their dependencies up in the file so the new code can use them. This is just code movement, no logic change. 
Signed-off-by: Luiz Capitulino Reviewed-by: Andrea Arcangeli Reviewed-by: Naoya Horiguchi Reviewed-by: Yasuaki Ishimatsu Reviewed-by: Davidlohr Bueso Acked-by: Kirill A. Shutemov Reviewed-by: Zhang Yanfei Cc: David Rientjes Cc: Marcelo Tosatti Cc: Rik van Riel Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index c148eb2..5964d0d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -607,6 +607,79 @@ err: return NULL; } +/* + * common helper functions for hstate_next_node_to_{alloc|free}. + * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. + */ +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + nid = next_node(nid, *nodes_allowed); + if (nid == MAX_NUMNODES) + nid = first_node(*nodes_allowed); + VM_BUG_ON(nid >= MAX_NUMNODES); + + return nid; +} + +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + +/* + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. + */ +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +/* + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. 
+ */ +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) +{ + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); + + return nid; +} + +#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ + nr_nodes--) + +#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ + for (nr_nodes = nodes_weight(*mask); \ + nr_nodes > 0 && \ + ((node = hstate_next_node_to_free(hs, mask)) || 1); \ + nr_nodes--) + static void update_and_free_page(struct hstate *h, struct page *page) { int i; @@ -786,79 +859,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return page; } -/* - * common helper functions for hstate_next_node_to_{alloc|free}. - * We may have allocated or freed a huge page based on a different - * nodes_allowed previously, so h->next_node_to_{alloc|free} might - * be outside of *nodes_allowed. Ensure that we use an allowed - * node for alloc or free. - */ -static int next_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - nid = next_node(nid, *nodes_allowed); - if (nid == MAX_NUMNODES) - nid = first_node(*nodes_allowed); - VM_BUG_ON(nid >= MAX_NUMNODES); - - return nid; -} - -static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) -{ - if (!node_isset(nid, *nodes_allowed)) - nid = next_node_allowed(nid, nodes_allowed); - return nid; -} - -/* - * returns the previously saved node ["this node"] from which to - * allocate a persistent huge page for the pool and advance the - * next node from which to allocate, handling wrap at end of node - * mask. 
- */ -static int hstate_next_node_to_alloc(struct hstate *h, - nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); - h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -/* - * helper for free_pool_huge_page() - return the previously saved - * node ["this node"] from which to free a huge page. Advance the - * next node id whether or not we find a free huge page to free so - * that the next attempt to free addresses the next node. - */ -static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) -{ - int nid; - - VM_BUG_ON(!nodes_allowed); - - nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); - h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); - - return nid; -} - -#define for_each_node_mask_to_alloc(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_alloc(hs, mask)) || 1); \ - nr_nodes--) - -#define for_each_node_mask_to_free(hs, nr_nodes, node, mask) \ - for (nr_nodes = nodes_weight(*mask); \ - nr_nodes > 0 && \ - ((node = hstate_next_node_to_free(hs, mask)) || 1); \ - nr_nodes--) - static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; -- cgit v0.10.2 From 944d9fec8d7aee3f2e16573e9b6a16634b33f403 Mon Sep 17 00:00:00 2001 From: Luiz Capitulino Date: Wed, 4 Jun 2014 16:07:13 -0700 Subject: hugetlb: add support for gigantic page allocation at runtime HugeTLB is limited to allocating hugepages whose size are less than MAX_ORDER order. This is so because HugeTLB allocates hugepages via the buddy allocator. Gigantic pages (that is, pages whose size is greater than MAX_ORDER order) have to be allocated at boottime. However, boottime allocation has at least two serious problems. First, it doesn't support NUMA and second, gigantic pages allocated at boottime can't be freed. 
This commit solves both issues by adding support for allocating gigantic pages during runtime. It works just like regular sized hugepages, meaning that the interface in sysfs is the same, it supports NUMA, and gigantic pages can be freed. For example, on x86_64 gigantic pages are 1GB big. To allocate two 1G gigantic pages on node 1, one can do: # echo 2 > \ /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages And to free them all: # echo 0 > \ /sys/devices/system/node/node1/hugepages/hugepages-1048576kB/nr_hugepages The one problem with gigantic page allocation at runtime is that it can't be serviced by the buddy allocator. To overcome that problem, this commit scans all zones from a node looking for a large enough contiguous region. When one is found, it's allocated by using CMA, that is, we call alloc_contig_range() to do the actual allocation. For example, on x86_64 we scan all zones looking for a 1GB contiguous region. When one is found, it's allocated by alloc_contig_range(). One expected issue with that approach is that such gigantic contiguous regions tend to vanish as runtime goes by. The best way to avoid this for now is to make gigantic page allocations very early during system boot, say from an init script. Other possible optimizations include using compaction, which is supported by CMA but is not explicitly used by this commit. It's also important to note the following: 1. Gigantic pages allocated at boottime by the hugepages= command-line option can be freed at runtime just fine 2. This commit adds support for gigantic pages only to x86_64. The reason is that I don't have access to nor experience with other archs. The code is arch independent though, so it should be simple to add support to different archs 3. I didn't add support for hugepage overcommit, that is allocating a gigantic page on demand when /proc/sys/vm/nr_overcommit_hugepages > 0.
The reason is that I don't think it's reasonable to do the hard and long work required for allocating a gigantic page at fault time. But it should be simple to add this if wanted [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Luiz Capitulino Reviewed-by: Davidlohr Bueso Acked-by: Kirill A. Shutemov Reviewed-by: Zhang Yanfei Reviewed-by: Yasuaki Ishimatsu Cc: Andrea Arcangeli Cc: David Rientjes Cc: Marcelo Tosatti Cc: Naoya Horiguchi Cc: Rik van Riel Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 5964d0d..98f0bc1 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -680,11 +680,150 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) ((node = hstate_next_node_to_free(hs, mask)) || 1); \ nr_nodes--) +#if defined(CONFIG_CMA) && defined(CONFIG_X86_64) +static void destroy_compound_gigantic_page(struct page *page, + unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __ClearPageTail(p); + set_page_refcounted(p); + p->first_page = NULL; + } + + set_compound_order(page, 0); + __ClearPageHead(page); +} + +static void free_gigantic_page(struct page *page, unsigned order) +{ + free_contig_range(page_to_pfn(page), 1 << order); +} + +static int __alloc_gigantic_page(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long end_pfn = start_pfn + nr_pages; + return alloc_contig_range(start_pfn, end_pfn, MIGRATE_MOVABLE); +} + +static bool pfn_range_valid_gigantic(unsigned long start_pfn, + unsigned long nr_pages) +{ + unsigned long i, end_pfn = start_pfn + nr_pages; + struct page *page; + + for (i = start_pfn; i < end_pfn; i++) { + if (!pfn_valid(i)) + return false; + + page = pfn_to_page(i); + + if (PageReserved(page)) + return false; + + if (page_count(page) > 0) + return false; + + if (PageHuge(page)) + return false; + } + + return true; +} + 
+static bool zone_spans_last_pfn(const struct zone *zone, + unsigned long start_pfn, unsigned long nr_pages) +{ + unsigned long last_pfn = start_pfn + nr_pages - 1; + return zone_spans_pfn(zone, last_pfn); +} + +static struct page *alloc_gigantic_page(int nid, unsigned order) +{ + unsigned long nr_pages = 1 << order; + unsigned long ret, pfn, flags; + struct zone *z; + + z = NODE_DATA(nid)->node_zones; + for (; z - NODE_DATA(nid)->node_zones < MAX_NR_ZONES; z++) { + spin_lock_irqsave(&z->lock, flags); + + pfn = ALIGN(z->zone_start_pfn, nr_pages); + while (zone_spans_last_pfn(z, pfn, nr_pages)) { + if (pfn_range_valid_gigantic(pfn, nr_pages)) { + /* + * We release the zone lock here because + * alloc_contig_range() will also lock the zone + * at some point. If there's an allocation + * spinning on this lock, it may win the race + * and cause alloc_contig_range() to fail... + */ + spin_unlock_irqrestore(&z->lock, flags); + ret = __alloc_gigantic_page(pfn, nr_pages); + if (!ret) + return pfn_to_page(pfn); + spin_lock_irqsave(&z->lock, flags); + } + pfn += nr_pages; + } + + spin_unlock_irqrestore(&z->lock, flags); + } + + return NULL; +} + +static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); +static void prep_compound_gigantic_page(struct page *page, unsigned long order); + +static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) +{ + struct page *page; + + page = alloc_gigantic_page(nid, huge_page_order(h)); + if (page) { + prep_compound_gigantic_page(page, huge_page_order(h)); + prep_new_huge_page(h, page, nid); + } + + return page; +} + +static int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) +{ + struct page *page = NULL; + int nr_nodes, node; + + for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { + page = alloc_fresh_gigantic_page_node(h, node); + if (page) + return 1; + } + + return 0; +} + +static inline bool gigantic_page_supported(void) { return true; } +#else +static 
inline bool gigantic_page_supported(void) { return false; } +static inline void free_gigantic_page(struct page *page, unsigned order) { } +static inline void destroy_compound_gigantic_page(struct page *page, + unsigned long order) { } +static inline int alloc_fresh_gigantic_page(struct hstate *h, + nodemask_t *nodes_allowed) { return 0; } +#endif + static void update_and_free_page(struct hstate *h, struct page *page) { int i; - VM_BUG_ON(hstate_is_gigantic(h)); + if (hstate_is_gigantic(h) && !gigantic_page_supported()) + return; h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; @@ -697,8 +836,13 @@ static void update_and_free_page(struct hstate *h, struct page *page) VM_BUG_ON_PAGE(hugetlb_cgroup_from_page(page), page); set_compound_page_dtor(page, NULL); set_page_refcounted(page); - arch_release_hugepage(page); - __free_pages(page, huge_page_order(h)); + if (hstate_is_gigantic(h)) { + destroy_compound_gigantic_page(page, huge_page_order(h)); + free_gigantic_page(page, huge_page_order(h)); + } else { + arch_release_hugepage(page); + __free_pages(page, huge_page_order(h)); + } } struct hstate *size_to_hstate(unsigned long size) @@ -737,7 +881,7 @@ static void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid] && !hstate_is_gigantic(h)) { + if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -841,9 +985,6 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; - if (hstate_is_gigantic(h)) - return NULL; - page = alloc_pages_exact_node(nid, htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, @@ -1478,7 +1619,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, { unsigned long min_count, ret; - if (hstate_is_gigantic(h)) + if (hstate_is_gigantic(h) && !gigantic_page_supported()) return h->max_huge_pages; /* @@ -1505,7 
+1646,10 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * and reducing the surplus. */ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(h, nodes_allowed); + if (hstate_is_gigantic(h)) + ret = alloc_fresh_gigantic_page(h, nodes_allowed); + else + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -1605,7 +1749,7 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, goto out; h = kobj_to_hstate(kobj, &nid); - if (hstate_is_gigantic(h)) { + if (hstate_is_gigantic(h) && !gigantic_page_supported()) { err = -EINVAL; goto out; } @@ -2112,7 +2256,7 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, tmp = h->max_huge_pages; - if (write && hstate_is_gigantic(h)) + if (write && hstate_is_gigantic(h) && !gigantic_page_supported()) return -EINVAL; table->data = &tmp; -- cgit v0.10.2 From 4f9b16a64753d0bb607454347036dc997fd03b82 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:14 -0700 Subject: mm: disable zone_reclaim_mode by default When it was introduced, zone_reclaim_mode made sense as NUMA distances punished and workloads were generally partitioned to fit into a NUMA node. NUMA machines are now common but few of the workloads are NUMA-aware and it's routine to see major performance degradation due to zone_reclaim_mode being enabled but relatively few can identify the problem. Those that require zone_reclaim_mode are likely to be able to detect when it needs to be enabled and tune appropriately so lets have a sensible default for the bulk of users. This patch (of 2): zone_reclaim_mode causes processes to prefer reclaiming memory from local node instead of spilling over to other nodes. This made sense initially when NUMA machines were almost exclusively HPC and the workload was partitioned into nodes. The NUMA penalties were sufficiently high to justify reclaiming the memory. 
On current machines and workloads it is often the case that zone_reclaim_mode destroys performance but not all users know how to detect this. Favour the common case and disable it by default. Users that are sophisticated enough to know they need zone_reclaim_mode will detect it. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Zhang Yanfei Acked-by: Michal Hocko Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index dd9d0e3..5b6da0f 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -772,16 +772,17 @@ This is value ORed together of 2 = Zone reclaim writes dirty pages out 4 = Zone reclaim swaps pages -zone_reclaim_mode is set during bootup to 1 if it is determined that pages -from remote zones will cause a measurable performance reduction. The -page allocator will then reclaim easily reusable pages (those page -cache pages that are currently not used) before allocating off node pages. - -It may be beneficial to switch off zone reclaim if the system is -used for a file server and all of memory should be used for caching files -from disk. In that case the caching effect is more important than +zone_reclaim_mode is disabled by default. For file servers or workloads +that benefit from having their data cached, zone_reclaim_mode should be +left disabled as the caching effect is likely to be more important than data locality. +zone_reclaim may be enabled if it's known that the workload is partitioned +such that each partition fits within a NUMA node and that accessing remote +memory would cause a measurable performance reduction. The page allocator +will then reclaim easily reusable pages (those page cache pages that are +currently not used) before allocating off node pages. + Allowing zone reclaim to write out pages stops processes that are writing large amounts of data from dirtying pages on other nodes. 
Zone reclaim will write out dirty pages if a zone fills up and so effectively diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h index 3202aa7..6437ca2 100644 --- a/arch/ia64/include/asm/topology.h +++ b/arch/ia64/include/asm/topology.h @@ -21,7 +21,8 @@ #define PENALTY_FOR_NODE_WITH_CPUS 255 /* - * Distance above which we begin to use zone reclaim + * Nodes within this distance are eligible for reclaim by zone_reclaim() when + * zone_reclaim_mode is enabled. */ #define RECLAIM_DISTANCE 15 diff --git a/arch/powerpc/include/asm/topology.h b/arch/powerpc/include/asm/topology.h index c920215..6c8a8c5 100644 --- a/arch/powerpc/include/asm/topology.h +++ b/arch/powerpc/include/asm/topology.h @@ -9,12 +9,8 @@ struct device_node; #ifdef CONFIG_NUMA /* - * Before going off node we want the VM to try and reclaim from the local - * node. It does this if the remote distance is larger than RECLAIM_DISTANCE. - * With the default REMOTE_DISTANCE of 20 and the default RECLAIM_DISTANCE of - * 20, we never reclaim and go off node straight away. - * - * To fix this we choose a smaller value of RECLAIM_DISTANCE. + * If zone_reclaim_mode is enabled, a RECLAIM_DISTANCE of 10 will mean that + * all zones on all nodes will be eligible for zone_reclaim(). */ #define RECLAIM_DISTANCE 10 diff --git a/include/linux/topology.h b/include/linux/topology.h index 973671f..dda6ee5 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -58,7 +58,8 @@ int arch_update_cpu_topology(void); /* * If the distance between nodes in a system is larger than RECLAIM_DISTANCE * (in whatever arch specific measurement units returned by node_distance()) - * then switch on zone reclaim on boot. + * and zone_reclaim_mode is enabled then the VM will only call zone_reclaim() + * on nodes within this distance. 
*/ #define RECLAIM_DISTANCE 30 #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cfdcd8..dfe954f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1860,8 +1860,6 @@ static void __paginginit init_zone_allows_reclaim(int nid) for_each_node_state(i, N_MEMORY) if (node_distance(nid, i) <= RECLAIM_DISTANCE) node_set(i, NODE_DATA(nid)->reclaim_nodes); - else - zone_reclaim_mode = 1; } #else /* CONFIG_NUMA */ -- cgit v0.10.2 From 5f7a75acdb24c7b9c436b3a0a66eec12e101d19c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:15 -0700 Subject: mm: page_alloc: do not cache reclaim distances pgdat->reclaim_nodes tracks if a remote node is allowed to be reclaimed by zone_reclaim due to its distance. As it is expected that zone_reclaim_mode will be rarely enabled it is unreasonable for all machines to take a penalty. Fortunately, the zone_reclaim_mode() path is already slow and it is the path that takes the hit. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Zhang Yanfei Acked-by: Michal Hocko Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index fac5509..c1dbe0b 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -763,7 +763,6 @@ typedef struct pglist_data { unsigned long node_spanned_pages; /* total size of physical page range, including holes */ int node_id; - nodemask_t reclaim_nodes; /* Nodes allowed to reclaim from */ wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dfe954f..9f13bcf 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1850,16 +1850,8 @@ static bool zone_local(struct zone *local_zone, struct zone *zone) static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { - return node_isset(local_zone->node, zone->zone_pgdat->reclaim_nodes); -} 
- -static void __paginginit init_zone_allows_reclaim(int nid) -{ - int i; - - for_each_node_state(i, N_MEMORY) - if (node_distance(nid, i) <= RECLAIM_DISTANCE) - node_set(i, NODE_DATA(nid)->reclaim_nodes); + return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) < + RECLAIM_DISTANCE; } #else /* CONFIG_NUMA */ @@ -1893,9 +1885,6 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) return true; } -static inline void init_zone_allows_reclaim(int nid) -{ -} #endif /* CONFIG_NUMA */ /* @@ -4933,8 +4922,6 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; - if (node_state(nid, N_MEMORY)) - init_zone_allows_reclaim(nid); #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); #endif -- cgit v0.10.2 From e8d9df3abac5d02dd4e6a0041cb62e69189b2c8e Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:17 -0700 Subject: memcg: un-export __memcg_kmem_get_cache It is only used in slab and should not be used anywhere else so there is no need in exporting it. Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 20f47d9..c1b816f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3513,7 +3513,6 @@ out: rcu_read_unlock(); return cachep; } -EXPORT_SYMBOL(__memcg_kmem_get_cache); /* * We need to verify if the allocation against current->mm->owner's memcg is -- cgit v0.10.2 From bfc8c90139ebd049b9801a951db3b9a4a00bed9c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:18 -0700 Subject: mem-hotplug: implement get/put_online_mems kmem_cache_{create,destroy,shrink} need to get a stable value of cpu/node online mask, because they init/destroy/access per-cpu/node kmem_cache parts, which can be allocated or destroyed on cpu/mem hotplug. 
To protect against cpu hotplug, these functions use {get,put}_online_cpus. However, they do nothing to synchronize with memory hotplug - taking the slab_mutex does not eliminate the possibility of race as described in patch 2. What we need there is something like get_online_cpus, but for memory. We already have lock_memory_hotplug, which serves the purpose, but it's a bit of a hammer right now, because it's backed by a mutex. As a result, it imposes some limitations to locking order, which are not desirable, and can't be used just like get_online_cpus. That's why in patch 1 I substitute it with get/put_online_mems, which work exactly like get/put_online_cpus except they block not cpu, but memory hotplug. [ v1 can be found at https://lkml.org/lkml/2014/4/6/68. I NAK'ed it by myself, because it used an rw semaphore for get/put_online_mems, making them deadlock prone. ] This patch (of 2): {un}lock_memory_hotplug, which is used to synchronize against memory hotplug, is currently backed by a mutex, which makes it a bit of a hammer - threads that only want to get a stable value of online nodes mask won't be able to proceed concurrently. Also, it imposes some strong locking ordering rules on it, which narrows down the set of its usage scenarios. This patch introduces get/put_online_mems, which are the same as get/put_online_cpus, but for memory hotplug, i.e. executing code inside a get/put_online_mems section will guarantee a stable value of online nodes, present pages, etc. lock_memory_hotplug()/unlock_memory_hotplug() are removed altogether. Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tang Chen Cc: Zhang Yanfei Cc: Toshi Kani Cc: Xishi Qiu Cc: Jiang Liu Cc: Rafael J.
Wysocki Cc: David Rientjes Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4ca3d95..010d125 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -187,14 +187,8 @@ extern void put_page_bootmem(struct page *page); extern void get_page_bootmem(unsigned long ingo, struct page *page, unsigned long type); -/* - * Lock for memory hotplug guarantees 1) all callbacks for memory hotplug - * notifier will be called under this. 2) offline/online/add/remove memory - * will not run simultaneously. - */ - -void lock_memory_hotplug(void); -void unlock_memory_hotplug(void); +void get_online_mems(void); +void put_online_mems(void); #else /* ! CONFIG_MEMORY_HOTPLUG */ /* @@ -232,8 +226,8 @@ static inline int try_online_node(int nid) return 0; } -static inline void lock_memory_hotplug(void) {} -static inline void unlock_memory_hotplug(void) {} +static inline void get_online_mems(void) {} +static inline void put_online_mems(void) {} #endif /* ! CONFIG_MEMORY_HOTPLUG */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c1dbe0b..ae693e1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -481,9 +481,8 @@ struct zone { * give them a chance of being in the same cacheline. * * Write access to present_pages at runtime should be protected by - * lock_memory_hotplug()/unlock_memory_hotplug(). Any reader who can't - * tolerant drift of present_pages should hold memory hotplug lock to - * get a stable value. + * mem_hotplug_begin/end(). Any reader who can't tolerant drift of + * present_pages should get_online_mems() to get a stable value. * * Read access to managed_pages should be safe because it's unsigned * long. 
Write access to zone->managed_pages and totalram_pages are @@ -765,7 +764,8 @@ typedef struct pglist_data { int node_id; wait_queue_head_t kswapd_wait; wait_queue_head_t pfmemalloc_wait; - struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ + struct task_struct *kswapd; /* Protected by + mem_hotplug_begin/end() */ int kswapd_max_order; enum zone_type classzone_idx; #ifdef CONFIG_NUMA_BALANCING diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 8d2fcdf..736ade3 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1300,7 +1300,7 @@ static void kmemleak_scan(void) /* * Struct page scanning for each node. */ - lock_memory_hotplug(); + get_online_mems(); for_each_online_node(i) { unsigned long start_pfn = node_start_pfn(i); unsigned long end_pfn = node_end_pfn(i); @@ -1318,7 +1318,7 @@ static void kmemleak_scan(void) scan_block(page, page + 1, NULL, 1); } } - unlock_memory_hotplug(); + put_online_mems(); /* * Scanning the task stacks (may introduce false negatives). diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9ccef39..6917f79 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1664,11 +1664,7 @@ int soft_offline_page(struct page *page, int flags) } } - /* - * The lock_memory_hotplug prevents a race with memory hotplug. - * This is a big hammer, a better would be nicer. 
- */ - lock_memory_hotplug(); + get_online_mems(); /* * Isolate the page, so that it doesn't get reallocated if it @@ -1679,7 +1675,7 @@ int soft_offline_page(struct page *page, int flags) set_migratetype_isolate(page, true); ret = get_any_page(page, pfn, flags); - unlock_memory_hotplug(); + put_online_mems(); if (ret > 0) { /* for in-use pages */ if (PageHuge(page)) ret = soft_offline_huge_page(page, flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a650db2..2906873 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -46,19 +46,84 @@ static void generic_online_page(struct page *page); static online_page_callback_t online_page_callback = generic_online_page; +static DEFINE_MUTEX(online_page_callback_lock); -DEFINE_MUTEX(mem_hotplug_mutex); +/* The same as the cpu_hotplug lock, but for memory hotplug. */ +static struct { + struct task_struct *active_writer; + struct mutex lock; /* Synchronizes accesses to refcount, */ + /* + * Also blocks the new readers during + * an ongoing mem hotplug operation. 
+ */ + int refcount; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} mem_hotplug = { + .active_writer = NULL, + .lock = __MUTEX_INITIALIZER(mem_hotplug.lock), + .refcount = 0, +#ifdef CONFIG_DEBUG_LOCK_ALLOC + .dep_map = {.name = "mem_hotplug.lock" }, +#endif +}; + +/* Lockdep annotations for get/put_online_mems() and mem_hotplug_begin/end() */ +#define memhp_lock_acquire_read() lock_map_acquire_read(&mem_hotplug.dep_map) +#define memhp_lock_acquire() lock_map_acquire(&mem_hotplug.dep_map) +#define memhp_lock_release() lock_map_release(&mem_hotplug.dep_map) + +void get_online_mems(void) +{ + might_sleep(); + if (mem_hotplug.active_writer == current) + return; + memhp_lock_acquire_read(); + mutex_lock(&mem_hotplug.lock); + mem_hotplug.refcount++; + mutex_unlock(&mem_hotplug.lock); + +} -void lock_memory_hotplug(void) +void put_online_mems(void) { - mutex_lock(&mem_hotplug_mutex); + if (mem_hotplug.active_writer == current) + return; + mutex_lock(&mem_hotplug.lock); + + if (WARN_ON(!mem_hotplug.refcount)) + mem_hotplug.refcount++; /* try to fix things up */ + + if (!--mem_hotplug.refcount && unlikely(mem_hotplug.active_writer)) + wake_up_process(mem_hotplug.active_writer); + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); + } -void unlock_memory_hotplug(void) +static void mem_hotplug_begin(void) { - mutex_unlock(&mem_hotplug_mutex); + mem_hotplug.active_writer = current; + + memhp_lock_acquire(); + for (;;) { + mutex_lock(&mem_hotplug.lock); + if (likely(!mem_hotplug.refcount)) + break; + __set_current_state(TASK_UNINTERRUPTIBLE); + mutex_unlock(&mem_hotplug.lock); + schedule(); + } } +static void mem_hotplug_done(void) +{ + mem_hotplug.active_writer = NULL; + mutex_unlock(&mem_hotplug.lock); + memhp_lock_release(); +} /* add this memory to iomem resource */ static struct resource *register_memory_resource(u64 start, u64 size) @@ -727,14 +792,16 @@ int set_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; - 
lock_memory_hotplug(); + get_online_mems(); + mutex_lock(&online_page_callback_lock); if (online_page_callback == generic_online_page) { online_page_callback = callback; rc = 0; } - unlock_memory_hotplug(); + mutex_unlock(&online_page_callback_lock); + put_online_mems(); return rc; } @@ -744,14 +811,16 @@ int restore_online_page_callback(online_page_callback_t callback) { int rc = -EINVAL; - lock_memory_hotplug(); + get_online_mems(); + mutex_lock(&online_page_callback_lock); if (online_page_callback == callback) { online_page_callback = generic_online_page; rc = 0; } - unlock_memory_hotplug(); + mutex_unlock(&online_page_callback_lock); + put_online_mems(); return rc; } @@ -899,7 +968,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ int ret; struct memory_notify arg; - lock_memory_hotplug(); + mem_hotplug_begin(); /* * This doesn't need a lock to do pfn_to_page(). * The section can't be removed here because of the @@ -907,23 +976,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); + ret = -EINVAL; if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && - !can_online_high_movable(zone)) { - unlock_memory_hotplug(); - return -EINVAL; - } + !can_online_high_movable(zone)) + goto out; if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { - if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { - unlock_memory_hotplug(); - return -EINVAL; - } + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) + goto out; } if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { - if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { - unlock_memory_hotplug(); - return -EINVAL; - } + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) + goto out; } /* Previous code may changed the zone of the pfn range */ @@ -939,8 +1003,7 @@ int __ref online_pages(unsigned long pfn, unsigned long 
nr_pages, int online_typ ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); - unlock_memory_hotplug(); - return ret; + goto out; } /* * If this zone is not populated, then it is not in zonelist. @@ -964,8 +1027,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ (((unsigned long long) pfn + nr_pages) << PAGE_SHIFT) - 1); memory_notify(MEM_CANCEL_ONLINE, &arg); - unlock_memory_hotplug(); - return ret; + goto out; } zone->present_pages += onlined_pages; @@ -995,9 +1057,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if (onlined_pages) memory_notify(MEM_ONLINE, &arg); - unlock_memory_hotplug(); - - return 0; +out: + mem_hotplug_done(); + return ret; } #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */ @@ -1055,7 +1117,7 @@ int try_online_node(int nid) if (node_online(nid)) return 0; - lock_memory_hotplug(); + mem_hotplug_begin(); pgdat = hotadd_new_pgdat(nid, 0); if (!pgdat) { pr_err("Cannot online node %d due to NULL pgdat\n", nid); @@ -1073,7 +1135,7 @@ int try_online_node(int nid) } out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } @@ -1117,7 +1179,7 @@ int __ref add_memory(int nid, u64 start, u64 size) new_pgdat = !p; } - lock_memory_hotplug(); + mem_hotplug_begin(); new_node = !node_online(nid); if (new_node) { @@ -1158,7 +1220,7 @@ error: release_memory_resource(res); out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } EXPORT_SYMBOL_GPL(add_memory); @@ -1565,7 +1627,7 @@ static int __ref __offline_pages(unsigned long start_pfn, if (!test_pages_in_a_zone(start_pfn, end_pfn)) return -EINVAL; - lock_memory_hotplug(); + mem_hotplug_begin(); zone = page_zone(pfn_to_page(start_pfn)); node = zone_to_nid(zone); @@ -1672,7 +1734,7 @@ repeat: writeback_set_ratelimit(); memory_notify(MEM_OFFLINE, &arg); - unlock_memory_hotplug(); + mem_hotplug_done(); return 0; failed_removal: @@ -1684,7 +1746,7 @@ failed_removal: undo_isolate_page_range(start_pfn, 
end_pfn, MIGRATE_MOVABLE); out: - unlock_memory_hotplug(); + mem_hotplug_done(); return ret; } @@ -1888,7 +1950,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) BUG_ON(check_hotplug_memory_range(start, size)); - lock_memory_hotplug(); + mem_hotplug_begin(); /* * All memory blocks must be offlined before removing memory. Check @@ -1897,10 +1959,8 @@ void __ref remove_memory(int nid, u64 start, u64 size) */ ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, check_memblock_offlined_cb); - if (ret) { - unlock_memory_hotplug(); + if (ret) BUG(); - } /* remove memmap entry */ firmware_map_remove(start, start + size, "System RAM"); @@ -1909,7 +1969,7 @@ void __ref remove_memory(int nid, u64 start, u64 size) try_offline_node(nid); - unlock_memory_hotplug(); + mem_hotplug_done(); } EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/slub.c b/mm/slub.c index ddb6079..9cb2501 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4332,7 +4332,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - lock_memory_hotplug(); + get_online_mems(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { @@ -4372,7 +4372,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - unlock_memory_hotplug(); + put_online_mems(); kfree(nodes); return x + sprintf(buf + x, "\n"); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 7901cb7..fbcf460 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3434,7 +3434,7 @@ int kswapd_run(int nid) /* * Called by memory hotplug when all memory in a node is offlined. Caller must - * hold lock_memory_hotplug(). + * hold mem_hotplug_begin/end(). 
*/ void kswapd_stop(int nid) { -- cgit v0.10.2 From 03afc0e25f7fc03537014a770f4c54ebbe63a24c Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:20 -0700 Subject: slab: get_online_mems for kmem_cache_{create,destroy,shrink} When we create a sl[au]b cache, we allocate kmem_cache_node structures for each online NUMA node. To handle nodes taken online/offline, we register memory hotplug notifier and allocate/free kmem_cache_node corresponding to the node that changes its state for each kmem cache. To synchronize between the two paths we hold the slab_mutex during both the cache creation/destruction path and while tuning per-node parts of kmem caches in memory hotplug handler, but that's not quite right, because it does not guarantee that a newly created cache will have all kmem_cache_nodes initialized in case it races with memory hotplug. For instance, in case of slub: CPU0 CPU1 ---- ---- kmem_cache_create: online_pages: __kmem_cache_create: slab_memory_callback: slab_mem_going_online_callback: lock slab_mutex for each slab_caches list entry allocate kmem_cache node unlock slab_mutex lock slab_mutex init_kmem_cache_nodes: for_each_node_state(node, N_NORMAL_MEMORY) allocate kmem_cache node add kmem_cache to slab_caches list unlock slab_mutex online_pages (continued): node_states_set_node As a result we'll get a kmem cache with not all kmem_cache_nodes allocated. To avoid issues like that we should hold get/put_online_mems() during the whole kmem cache creation/destruction/shrink paths, just like we deal with cpu hotplug. This patch does the trick. Note, that after it's applied, there is no need in taking the slab_mutex for kmem_cache_shrink any more, so it is removed from there. Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Cc: Tang Chen Cc: Zhang Yanfei Cc: Toshi Kani Cc: Xishi Qiu Cc: Jiang Liu Cc: Rafael J. 
Wysocki Cc: David Rientjes Cc: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab.c b/mm/slab.c index 944ac58..7067ea7 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2480,8 +2480,7 @@ out: return nr_freed; } -/* Called with slab_mutex held to protect against cpu hotplug */ -static int __cache_shrink(struct kmem_cache *cachep) +int __kmem_cache_shrink(struct kmem_cache *cachep) { int ret = 0, i = 0; struct kmem_cache_node *n; @@ -2502,32 +2501,11 @@ static int __cache_shrink(struct kmem_cache *cachep) return (ret ? 1 : 0); } -/** - * kmem_cache_shrink - Shrink a cache. - * @cachep: The cache to shrink. - * - * Releases as many slabs as possible for a cache. - * To help debugging, a zero exit status indicates all slabs were released. - */ -int kmem_cache_shrink(struct kmem_cache *cachep) -{ - int ret; - BUG_ON(!cachep || in_interrupt()); - - get_online_cpus(); - mutex_lock(&slab_mutex); - ret = __cache_shrink(cachep); - mutex_unlock(&slab_mutex); - put_online_cpus(); - return ret; -} -EXPORT_SYMBOL(kmem_cache_shrink); - int __kmem_cache_shutdown(struct kmem_cache *cachep) { int i; struct kmem_cache_node *n; - int rc = __cache_shrink(cachep); + int rc = __kmem_cache_shrink(cachep); if (rc) return rc; diff --git a/mm/slab.h b/mm/slab.h index 863e67b..d85d598 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -91,6 +91,7 @@ __kmem_cache_alias(const char *name, size_t size, size_t align, #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS) int __kmem_cache_shutdown(struct kmem_cache *); +int __kmem_cache_shrink(struct kmem_cache *); void slab_kmem_cache_release(struct kmem_cache *); struct seq_file; diff --git a/mm/slab_common.c b/mm/slab_common.c index 2834bc2..2dd920d 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -205,6 +205,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, int err; get_online_cpus(); + get_online_mems(); + 
mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); @@ -239,6 +241,8 @@ kmem_cache_create(const char *name, size_t size, size_t align, out_unlock: mutex_unlock(&slab_mutex); + + put_online_mems(); put_online_cpus(); if (err) { @@ -272,6 +276,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c char *cache_name; get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); /* @@ -295,6 +301,8 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c out_unlock: mutex_unlock(&slab_mutex); + + put_online_mems(); put_online_cpus(); } @@ -328,6 +336,8 @@ void slab_kmem_cache_release(struct kmem_cache *s) void kmem_cache_destroy(struct kmem_cache *s) { get_online_cpus(); + get_online_mems(); + mutex_lock(&slab_mutex); s->refcount--; @@ -359,15 +369,36 @@ void kmem_cache_destroy(struct kmem_cache *s) #else slab_kmem_cache_release(s); #endif - goto out_put_cpus; + goto out; out_unlock: mutex_unlock(&slab_mutex); -out_put_cpus: +out: + put_online_mems(); put_online_cpus(); } EXPORT_SYMBOL(kmem_cache_destroy); +/** + * kmem_cache_shrink - Shrink a cache. + * @cachep: The cache to shrink. + * + * Releases as many slabs as possible for a cache. + * To help debugging, a zero exit status indicates all slabs were released. 
+ */ +int kmem_cache_shrink(struct kmem_cache *cachep) +{ + int ret; + + get_online_cpus(); + get_online_mems(); + ret = __kmem_cache_shrink(cachep); + put_online_mems(); + put_online_cpus(); + return ret; +} +EXPORT_SYMBOL(kmem_cache_shrink); + int slab_is_available(void) { return slab_state >= UP; diff --git a/mm/slob.c b/mm/slob.c index 730cad4..21980e0 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -620,11 +620,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c) return 0; } -int kmem_cache_shrink(struct kmem_cache *d) +int __kmem_cache_shrink(struct kmem_cache *d) { return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); struct kmem_cache kmem_cache_boot = { .name = "kmem_cache", diff --git a/mm/slub.c b/mm/slub.c index 9cb2501..5d1b653 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3398,7 +3398,7 @@ EXPORT_SYMBOL(kfree); * being allocated from last increasing the chance that the last objects * are freed in them. */ -int kmem_cache_shrink(struct kmem_cache *s) +int __kmem_cache_shrink(struct kmem_cache *s) { int node; int i; @@ -3454,7 +3454,6 @@ int kmem_cache_shrink(struct kmem_cache *s) kfree(slabs_by_inuse); return 0; } -EXPORT_SYMBOL(kmem_cache_shrink); static int slab_mem_going_offline_callback(void *arg) { @@ -3462,7 +3461,7 @@ static int slab_mem_going_offline_callback(void *arg) mutex_lock(&slab_mutex); list_for_each_entry(s, &slab_caches, list) - kmem_cache_shrink(s); + __kmem_cache_shrink(s); mutex_unlock(&slab_mutex); return 0; -- cgit v0.10.2 From 9b857d26d08f00971997cd21aa491e27e0c84ce3 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 16:07:21 -0700 Subject: fs/hugetlbfs/inode.c: complete conversion to pr_foo() Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index e19d4c0..2611824 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -6,6 +6,8 @@ * Copyright (C) 2002 Linus Torvalds. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -823,8 +825,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) ps = memparse(args[0].from, &rest); pconfig->hstate = size_to_hstate(ps); if (!pconfig->hstate) { - printk(KERN_ERR - "hugetlbfs: Unsupported page size %lu MB\n", + pr_err("Unsupported page size %lu MB\n", ps >> 20); return -EINVAL; } @@ -832,8 +833,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) } default: - printk(KERN_ERR "hugetlbfs: Bad mount option: \"%s\"\n", - p); + pr_err("Bad mount option: \"%s\"\n", p); return -EINVAL; break; } @@ -853,8 +853,7 @@ hugetlbfs_parse_options(char *options, struct hugetlbfs_config *pconfig) return 0; bad_val: - printk(KERN_ERR "hugetlbfs: Bad value '%s' for mount option '%s'\n", - args[0].from, p); + pr_err("Bad value '%s' for mount option '%s'\n", args[0].from, p); return -EINVAL; } @@ -970,8 +969,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, *user = current_user(); if (user_shm_lock(size, *user)) { task_lock(current); - printk_once(KERN_WARNING - "%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", + pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", current->comm, current->pid); task_unlock(current); } else { @@ -1031,7 +1029,7 @@ static int __init init_hugetlbfs_fs(void) int i; if (!hugepages_supported()) { - pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n"); + pr_info("disabling because there are no supported hugepage sizes\n"); return -ENOTSUPP; } @@ -1060,7 +1058,7 @@ static int __init init_hugetlbfs_fs(void) buf); if (IS_ERR(hugetlbfs_vfsmount[i])) { - pr_err("hugetlb: Cannot mount internal hugetlbfs for " + pr_err("Cannot mount internal hugetlbfs for " "page size %uK", ps_kb); error = PTR_ERR(hugetlbfs_vfsmount[i]); hugetlbfs_vfsmount[i] = NULL; -- cgit v0.10.2 From 5bcc9f86ef09a933255ee66bd899d4601785dad5 Mon Sep 17 00:00:00 2001 
From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:07:22 -0700 Subject: mm/page_alloc: prevent MIGRATE_RESERVE pages from being misplaced For the MIGRATE_RESERVE pages, it is useful when they do not get misplaced on free_list of other migratetype, otherwise they might get allocated prematurely and e.g. fragment the MIGRATE_RESERVE pageblocks. While this cannot be avoided completely when allocating new MIGRATE_RESERVE pageblocks in min_free_kbytes sysctl handler, we should prevent the misplacement where possible. Currently, it is possible for the misplacement to happen when a MIGRATE_RESERVE page is allocated on pcplist through rmqueue_bulk() as a fallback for other desired migratetype, and then later freed back through free_pcppages_bulk() without being actually used. This happens because free_pcppages_bulk() uses get_freepage_migratetype() to choose the free_list, and rmqueue_bulk() calls set_freepage_migratetype() with the *desired* migratetype and not the page's original MIGRATE_RESERVE migratetype. This patch fixes the problem by moving the call to set_freepage_migratetype() from rmqueue_bulk() down to __rmqueue_smallest() and __rmqueue_fallback() where the actual page's migratetype (e.g. from which free_list the page is taken from) is used. Note that this migratetype might be different from the pageblock's migratetype due to freepage stealing decisions. This is OK, as page stealing never uses MIGRATE_RESERVE as a fallback, and also takes care to leave all MIGRATE_CMA pages on the correct freelist. Therefore, as an additional benefit, the call to get_pageblock_migratetype() from rmqueue_bulk() when CMA is enabled, can be removed completely. This relies on the fact that MIGRATE_CMA pageblocks are created only during system init, and the above. The related is_migrate_isolate() check is also unnecessary, as memory isolation has other ways to move pages between freelists, and drain pcp lists containing pages that should be isolated. 
The buffered_rmqueue() can also benefit from calling get_freepage_migratetype() instead of get_pageblock_migratetype(). Signed-off-by: Vlastimil Babka Reported-by: Yong-Taek Lee Reported-by: Bartlomiej Zolnierkiewicz Suggested-by: Joonsoo Kim Acked-by: Joonsoo Kim Suggested-by: Mel Gorman Acked-by: Minchan Kim Cc: KOSAKI Motohiro Cc: Marek Szyprowski Cc: Hugh Dickins Cc: Rik van Riel Cc: Michal Nazarewicz Cc: "Wang, Yalin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9f13bcf..ab46f79 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -931,6 +931,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, rmv_page_order(page); area->nr_free--; expand(zone, page, order, current_order, area, migratetype); + set_freepage_migratetype(page, migratetype); return page; } @@ -1057,7 +1058,9 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* * When borrowing from MIGRATE_CMA, we need to release the excess - * buddy pages to CMA itself. + * buddy pages to CMA itself. We also ensure the freepage_migratetype + * is set to CMA so it is returned to the correct freelist in case + * the page ends up being not actually allocated from the pcp lists. */ if (is_migrate_cma(fallback_type)) return fallback_type; @@ -1125,6 +1128,12 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) expand(zone, page, order, current_order, area, new_type); + /* The freepage_migratetype may differ from pageblock's + * migratetype depending on the decisions in + * try_to_steal_freepages. This is OK as long as it does + * not differ for MIGRATE_CMA type. 
+ */ + set_freepage_migratetype(page, new_type); trace_mm_page_alloc_extfrag(page, order, current_order, start_migratetype, migratetype, new_type); @@ -1175,7 +1184,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, int migratetype, int cold) { - int mt = migratetype, i; + int i; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { @@ -1196,14 +1205,8 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, list_add(&page->lru, list); else list_add_tail(&page->lru, list); - if (IS_ENABLED(CONFIG_CMA)) { - mt = get_pageblock_migratetype(page); - if (!is_migrate_cma(mt) && !is_migrate_isolate(mt)) - mt = migratetype; - } - set_freepage_migratetype(page, mt); list = &page->lru; - if (is_migrate_cma(mt)) + if (is_migrate_cma(get_freepage_migratetype(page))) __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, -(1 << order)); } @@ -1572,7 +1575,7 @@ again: if (!page) goto failed; __mod_zone_freepage_state(zone, -(1 << order), - get_pageblock_migratetype(page)); + get_freepage_migratetype(page)); } __mod_zone_page_state(zone, NR_ALLOC_BATCH, -(1 << order)); -- cgit v0.10.2 From 13fb44e4b0414d7e718433a49e6430d5b76bd46e Mon Sep 17 00:00:00 2001 From: Heesub Shin Date: Wed, 4 Jun 2014 16:07:24 -0700 Subject: mm/compaction: clean up unused code lines Remove code lines currently not in use or never called. 
Signed-off-by: Heesub Shin Acked-by: Vlastimil Babka Cc: Dongjun Shin Cc: Sunghwan Yun Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: Dongjun Shin Cc: Sunghwan Yun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 627dc2e..95f7531 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -208,12 +208,6 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } -static inline bool compact_trylock_irqsave(spinlock_t *lock, - unsigned long *flags, struct compact_control *cc) -{ - return compact_checklock_irqsave(lock, flags, false, cc); -} - /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { @@ -736,7 +730,6 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ - isolated = 0; /* * Take care when isolating in last pageblock of a zone which @@ -1165,9 +1158,6 @@ static void __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (zone_watermark_ok(zone, cc->order, low_wmark_pages(zone), 0, 0)) compaction_defer_reset(zone, cc->order, false); - /* Currently async compaction is never deferred. */ - else if (cc->sync) - defer_compaction(zone, cc->order); } VM_BUG_ON(!list_empty(&cc->freepages)); -- cgit v0.10.2 From c96b9e508f3d06ddb601dcc9792d62c044ab359e Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:07:26 -0700 Subject: mm/compaction: cleanup isolate_freepages() isolate_freepages() is currently somewhat hard to follow thanks to its many pfn variables. In particular, the name 'high_pfn' looks like it is related to the 'low_pfn' variable, but in fact it is not. This patch renames the 'high_pfn' variable to a hopefully less confusing name, and slightly changes its handling without a functional change. 
A comment made obsolete by recent changes is also updated. [akpm@linux-foundation.org: comment fixes, per Minchan] [iamjoonsoo.kim@lge.com: cleanups] Signed-off-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Cc: Michal Nazarewicz Cc: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Cc: Dongjun Shin Cc: Sunghwan Yun Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 95f7531..6010aab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -665,7 +665,10 @@ static void isolate_freepages(struct zone *zone, struct compact_control *cc) { struct page *page; - unsigned long high_pfn, low_pfn, pfn, z_end_pfn; + unsigned long block_start_pfn; /* start of current pageblock */ + unsigned long block_end_pfn; /* end of current pageblock */ + unsigned long low_pfn; /* lowest pfn scanner is able to scan */ + unsigned long next_free_pfn; /* start pfn for scaning at next round */ int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -673,32 +676,33 @@ static void isolate_freepages(struct zone *zone, * Initialise the free scanner. The starting point is where we last * successfully isolated from, zone-cached value, or the end of the * zone when isolating for the first time. We need this aligned to - * the pageblock boundary, because we do pfn -= pageblock_nr_pages - * in the for loop. + * the pageblock boundary, because we do + * block_start_pfn -= pageblock_nr_pages in the for loop. + * For ending point, take care when isolating in last pageblock of a + * a zone which ends in the middle of a pageblock. * The low boundary is the end of the pageblock the migration scanner * is using. 
*/ - pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_start_pfn = cc->free_pfn & ~(pageblock_nr_pages-1); + block_end_pfn = min(block_start_pfn + pageblock_nr_pages, + zone_end_pfn(zone)); low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* - * Take care that if the migration scanner is at the end of the zone - * that the free scanner does not accidentally move to the next zone - * in the next isolation cycle. + * If no pages are isolated, the block_start_pfn < low_pfn check + * will kick in. */ - high_pfn = min(low_pfn, pfn); - - z_end_pfn = zone_end_pfn(zone); + next_free_pfn = 0; /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. */ - for (; pfn >= low_pfn && cc->nr_migratepages > nr_freepages; - pfn -= pageblock_nr_pages) { + for (; block_start_pfn >= low_pfn && cc->nr_migratepages > nr_freepages; + block_end_pfn = block_start_pfn, + block_start_pfn -= pageblock_nr_pages) { unsigned long isolated; - unsigned long end_pfn; /* * This can iterate a massively long zone without finding any @@ -707,7 +711,7 @@ static void isolate_freepages(struct zone *zone, */ cond_resched(); - if (!pfn_valid(pfn)) + if (!pfn_valid(block_start_pfn)) continue; /* @@ -717,7 +721,7 @@ static void isolate_freepages(struct zone *zone, * i.e. it's possible that all pages within a zones range of * pages do not belong to a single zone. */ - page = pfn_to_page(pfn); + page = pfn_to_page(block_start_pfn); if (page_zone(page) != zone) continue; @@ -730,14 +734,8 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ - - /* - * Take care when isolating in last pageblock of a zone which - * ends in the middle of a pageblock. 
- */ - end_pfn = min(pfn + pageblock_nr_pages, z_end_pfn); - isolated = isolate_freepages_block(cc, pfn, end_pfn, - freelist, false); + isolated = isolate_freepages_block(cc, block_start_pfn, + block_end_pfn, freelist, false); nr_freepages += isolated; /* @@ -745,9 +743,9 @@ static void isolate_freepages(struct zone *zone, * looking for free pages, the search will restart here as * page migration may have returned some pages to the allocator */ - if (isolated) { + if (isolated && next_free_pfn == 0) { cc->finished_update_free = true; - high_pfn = max(high_pfn, pfn); + next_free_pfn = block_start_pfn; } } @@ -758,10 +756,10 @@ static void isolate_freepages(struct zone *zone, * If we crossed the migrate scanner, we want to keep it that way * so that compact_finished() may detect this */ - if (pfn < low_pfn) - cc->free_pfn = max(pfn, zone->zone_start_pfn); - else - cc->free_pfn = high_pfn; + if (block_start_pfn < low_pfn) + next_free_pfn = cc->migrate_pfn; + + cc->free_pfn = next_free_pfn; cc->nr_freepages = nr_freepages; } -- cgit v0.10.2 From 613813e8985bb76bd27937bfa54faf9e9be95a52 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:07:27 -0700 Subject: mm: debug: make bad_range() output more usable and readable Nobody outputs memory addresses in decimal. PFNs are essentially addresses, and they're gibberish in decimal. Output them in hex. Also, add the nid and zone name to give a little more context to the message. 
Signed-off-by: Dave Hansen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ab46f79..132c337 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -261,8 +261,9 @@ static int page_outside_zone_boundaries(struct zone *zone, struct page *page) } while (zone_span_seqretry(zone, seq)); if (ret) - pr_err("page %lu outside zone [ %lu - %lu ]\n", - pfn, start_pfn, start_pfn + sp); + pr_err("page 0x%lx outside node %d zone %s [ 0x%lx - 0x%lx ]\n", + pfn, zone_to_nid(zone), zone->name, + start_pfn, start_pfn + sp); return ret; } -- cgit v0.10.2 From 2ee06468702e0742114823a537510cd6f038cacc Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:28 -0700 Subject: Documentation/memcg: warn about incomplete kmemcg state Kmemcg is currently under development and lacks some important features. In particular, it does not have support of kmem reclaim on memory pressure inside cgroup, which practically makes it unusable in real life. Let's warn about it in both Kconfig and Documentation to prevent complaints arising. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/cgroups/memory.txt b/Documentation/cgroups/memory.txt index 1829c65..4937e6f 100644 --- a/Documentation/cgroups/memory.txt +++ b/Documentation/cgroups/memory.txt @@ -270,6 +270,11 @@ When oom event notifier is registered, event will be delivered. 2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +WARNING: Current implementation lacks reclaim support. That means allocation + attempts will fail when close to the limit even if there are plenty of + kmem available for reclaim. That makes this option unusable in real + life so DO NOT SELECT IT unless for development purposes. + With the Kernel memory extension, the Memory Controller is able to limit the amount of kernel memory used by the system. 
Kernel memory is fundamentally different than user memory, since it can't be swapped out, which makes it diff --git a/init/Kconfig b/init/Kconfig index 9d3585b..4a1822a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -996,6 +996,12 @@ config MEMCG_KMEM the kmem extension can use it to guarantee that no group of processes will ever exhaust kernel resources alone. + WARNING: Current implementation lacks reclaim support. That means + allocation attempts will fail when close to the limit even if there + are plenty of kmem available for reclaim. That makes this option + unusable in real life so DO NOT SELECT IT unless for development + purposes. + config CGROUP_HUGETLB bool "HugeTLB Resource Controller for Control Groups" depends on RESOURCE_COUNTERS && HUGETLB_PAGE -- cgit v0.10.2 From 5040573e49cc8f0e016a83544a0e552f2f44c897 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Wed, 4 Jun 2014 16:07:29 -0700 Subject: arc: call find_vma with the mmap_sem held Performing vma lookups without taking the mm->mmap_sem is asking for trouble. While doing the search, the vma in question can be modified or even removed before returning to the caller. Take the lock (shared) in order to avoid races while iterating through the vmacache and/or rbtree. 
[akpm@linux-foundation.org: CSE current->active_mm, per Vineet] Signed-off-by: Davidlohr Bueso Acked-by: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/arc/kernel/troubleshoot.c b/arch/arc/kernel/troubleshoot.c index 73a7450..1badf9b 100644 --- a/arch/arc/kernel/troubleshoot.c +++ b/arch/arc/kernel/troubleshoot.c @@ -86,12 +86,13 @@ static void show_faulting_vma(unsigned long address, char *buf) unsigned long ino = 0; dev_t dev = 0; char *nm = buf; + struct mm_struct *active_mm = current->active_mm; /* can't use print_vma_addr() yet as it doesn't check for * non-inclusive vma */ - - vma = find_vma(current->active_mm, address); + down_read(&active_mm->mmap_sem); + vma = find_vma(active_mm, address); /* check against the find_vma( ) behaviour which returns the next VMA * if the container VMA is not found @@ -110,9 +111,10 @@ static void show_faulting_vma(unsigned long address, char *buf) vma->vm_start < TASK_UNMAPPED_BASE ? address : address - vma->vm_start, nm, vma->vm_start, vma->vm_end); - } else { + } else pr_info(" @No matching VMA found\n"); - } + + up_read(&active_mm->mmap_sem); } static void show_ecr_verbose(struct pt_regs *regs) -- cgit v0.10.2 From cbe97414c24b3ceb9b8df9e45b798a88daae7f71 Mon Sep 17 00:00:00 2001 From: Jonathan Gonzalez V Date: Wed, 4 Jun 2014 16:07:30 -0700 Subject: drm/exynos: call find_vma with the mmap_sem held Performing vma lookups without taking the mm->mmap_sem is asking for trouble. While doing the search, the vma in question can be modified or even removed before returning to the caller. Take the lock (exclusively) in order to avoid races while iterating through the vmacache and/or rbtree. 
Signed-off-by: Jonathan Gonzalez V Signed-off-by: Davidlohr Bueso Cc: Inki Dae Cc: Joonyoung Shim Cc: David Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/gpu/drm/exynos/exynos_drm_g2d.c b/drivers/gpu/drm/exynos/exynos_drm_g2d.c index 6c1885e..8001587 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_g2d.c +++ b/drivers/gpu/drm/exynos/exynos_drm_g2d.c @@ -467,14 +467,17 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, goto err_free; } + down_read(&current->mm->mmap_sem); vma = find_vma(current->mm, userptr); if (!vma) { + up_read(&current->mm->mmap_sem); DRM_ERROR("failed to get vm region.\n"); ret = -EFAULT; goto err_free_pages; } if (vma->vm_end < userptr + size) { + up_read(&current->mm->mmap_sem); DRM_ERROR("vma is too small.\n"); ret = -EFAULT; goto err_free_pages; @@ -482,6 +485,7 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, g2d_userptr->vma = exynos_gem_get_vma(vma); if (!g2d_userptr->vma) { + up_read(&current->mm->mmap_sem); DRM_ERROR("failed to copy vma.\n"); ret = -ENOMEM; goto err_free_pages; @@ -492,10 +496,12 @@ static dma_addr_t *g2d_userptr_get_dma_addr(struct drm_device *drm_dev, ret = exynos_gem_get_pages_from_userptr(start & PAGE_MASK, npages, pages, vma); if (ret < 0) { + up_read(&current->mm->mmap_sem); DRM_ERROR("failed to get user pages from userptr.\n"); goto err_put_vma; } + up_read(&current->mm->mmap_sem); g2d_userptr->pages = pages; sgt = kzalloc(sizeof(*sgt), GFP_KERNEL); -- cgit v0.10.2 From 2329d3751b082b4fd354f334a88662d72abac52d Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:07:31 -0700 Subject: mm/swap.c: clean up *lru_cache_add* functions In mm/swap.c, __lru_cache_add() is exported, but actually there are no users outside this file. This patch unexports __lru_cache_add(), and makes it static. It also exports lru_cache_add_file(), as it is used by cifs and fuse, which can be loaded as modules. 
Signed-off-by: Jianyu Zhan Cc: Minchan Kim Cc: Johannes Weiner Cc: Shaohua Li Cc: Bob Liu Cc: Seth Jennings Cc: Joonsoo Kim Cc: Rafael Aquini Cc: Mel Gorman Acked-by: Rik van Riel Cc: Andrea Arcangeli Cc: Khalid Aziz Cc: Christoph Hellwig Reviewed-by: Zhang Yanfei Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index 3507115..5a14b92 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -308,8 +308,9 @@ extern unsigned long nr_free_pagecache_pages(void); /* linux/mm/swap.c */ -extern void __lru_cache_add(struct page *); extern void lru_cache_add(struct page *); +extern void lru_cache_add_anon(struct page *page); +extern void lru_cache_add_file(struct page *page); extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); @@ -323,22 +324,6 @@ extern void swap_setup(void); extern void add_page_to_unevictable_list(struct page *page); -/** - * lru_cache_add: add a page to the page lists - * @page: the page to add - */ -static inline void lru_cache_add_anon(struct page *page) -{ - ClearPageActive(page); - __lru_cache_add(page); -} - -static inline void lru_cache_add_file(struct page *page) -{ - ClearPageActive(page); - __lru_cache_add(page); -} - /* linux/mm/vmscan.c */ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); diff --git a/mm/swap.c b/mm/swap.c index 9ce43ba..c0ed4d6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -582,13 +582,7 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); -/* - * Queue the page for addition to the LRU via pagevec. The decision on whether - * to add the page to the [in]active [file|anon] list is deferred until the - * pagevec is drained. This gives a chance for the caller of __lru_cache_add() - * have the page added to the active list using mark_page_accessed(). 
- */ -void __lru_cache_add(struct page *page) +static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); @@ -598,11 +592,32 @@ void __lru_cache_add(struct page *page) pagevec_add(pvec, page); put_cpu_var(lru_add_pvec); } -EXPORT_SYMBOL(__lru_cache_add); + +/** + * lru_cache_add: add a page to the page lists + * @page: the page to add + */ +void lru_cache_add_anon(struct page *page) +{ + ClearPageActive(page); + __lru_cache_add(page); +} + +void lru_cache_add_file(struct page *page) +{ + ClearPageActive(page); + __lru_cache_add(page); +} +EXPORT_SYMBOL(lru_cache_add_file); /** * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. + * + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of lru_cache_add() + * have the page added to the active list using mark_page_accessed(). */ void lru_cache_add(struct page *page) { -- cgit v0.10.2 From 64ac4940d557df8caab602eaea679ec7eaf9a57f Mon Sep 17 00:00:00 2001 From: Huang Shijie Date: Wed, 4 Jun 2014 16:07:33 -0700 Subject: mm/mmap.c: remove the first mapping check Remove the first mapping check for vma_link. Move the mutex_lock into the braces when vma->vm_file is true. 
Signed-off-by: Huang Shijie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mmap.c b/mm/mmap.c index 6cdec3a..8a56d39 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -640,11 +640,10 @@ static void vma_link(struct mm_struct *mm, struct vm_area_struct *vma, { struct address_space *mapping = NULL; - if (vma->vm_file) + if (vma->vm_file) { mapping = vma->vm_file->f_mapping; - - if (mapping) mutex_lock(&mapping->i_mmap_mutex); + } __vma_link(mm, vma, prev, rb_link, rb_parent); __vma_link_file(vma); -- cgit v0.10.2 From f98bafa06a28fdfdd5c49f820f4d6560f636fc46 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:34 -0700 Subject: memcg: kill CONFIG_MM_OWNER CONFIG_MM_OWNER makes no sense. It is not user-selectable, it is only selected by CONFIG_MEMCG automatically. So we can kill this option in init/Kconfig and do s/CONFIG_MM_OWNER/CONFIG_MEMCG/ globally. Signed-off-by: Oleg Nesterov Acked-by: Michal Hocko Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8967e20..de16272 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -406,7 +406,7 @@ struct mm_struct { spinlock_t ioctx_lock; struct kioctx_table __rcu *ioctx_table; #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * "owner" points to a task that is regarded as the canonical * user/owner of this mm. 
All of the following must be true in diff --git a/include/linux/sched.h b/include/linux/sched.h index 70f67e4..2f2dd7d 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2967,7 +2967,7 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); #else @@ -2978,7 +2978,7 @@ static inline void mm_update_next_owner(struct mm_struct *mm) static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ static inline unsigned long task_rlimit(const struct task_struct *tsk, unsigned int limit) diff --git a/init/Kconfig b/init/Kconfig index 4a1822a..0a2f09a 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -933,7 +933,6 @@ config RESOURCE_COUNTERS config MEMCG bool "Memory Resource Controller for Control Groups" depends on RESOURCE_COUNTERS - select MM_OWNER select EVENTFD help Provides a memory resource controller that manages both anonymous @@ -951,9 +950,6 @@ config MEMCG disable memory resource controller and you can avoid overheads. (and lose benefits of memory resource controller) - This config option also selects MM_OWNER config option, which - could in turn add some fork/exit overhead. - config MEMCG_SWAP bool "Memory Resource Controller Swap Extension" depends on MEMCG && SWAP @@ -1179,9 +1175,6 @@ config SCHED_AUTOGROUP desktop applications. Task group autogeneration is currently based upon task session. 
-config MM_OWNER - bool - config SYSFS_DEPRECATED bool "Enable deprecated sysfs features to support old userspace tools" depends on SYSFS diff --git a/kernel/exit.c b/kernel/exit.c index 6ed6a1d..da1b838 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -352,7 +352,7 @@ int disallow_signal(int sig) EXPORT_SYMBOL(disallow_signal); -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG /* * A task is exiting. If it owned this mm, find a new owner for the mm. */ @@ -434,7 +434,7 @@ assign_new_owner: task_unlock(c); put_task_struct(c); } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Turn us into a lazy TLB process if we diff --git a/kernel/fork.c b/kernel/fork.c index 59e3dcc..0d53eb0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1099,12 +1099,12 @@ static void rt_mutex_init_task(struct task_struct *p) #endif } -#ifdef CONFIG_MM_OWNER +#ifdef CONFIG_MEMCG void mm_init_owner(struct mm_struct *mm, struct task_struct *p) { mm->owner = p; } -#endif /* CONFIG_MM_OWNER */ +#endif /* CONFIG_MEMCG */ /* * Initialize POSIX timer handling for a single task. -- cgit v0.10.2 From 675becce15f320337499bc1a9356260409a5ba29 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:35 -0700 Subject: mm: vmscan: do not throttle based on pfmemalloc reserves if node has no ZONE_NORMAL throttle_direct_reclaim() is meant to trigger during swap-over-network during which the min watermark is treated as a pfmemalloc reserve. It throttles on the first node in the zonelist but this is flawed. The user-visible impact is that a process running on CPU whose local memory node has no ZONE_NORMAL will stall for prolonged periods of time, possibly indefinitely. This is due to throttle_direct_reclaim thinking the pfmemalloc reserves are depleted when in fact they don't exist on that node. On a NUMA machine running a 32-bit kernel (I know) allocation requests from CPUs on node 1 would detect no pfmemalloc reserves and the process gets throttled.
This patch adjusts throttling of direct reclaim to throttle based on the first node in the zonelist that has a usable ZONE_NORMAL or lower zone. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mel Gorman Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index fbcf460..53e4534 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2537,10 +2537,17 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) for (i = 0; i <= ZONE_NORMAL; i++) { zone = &pgdat->node_zones[i]; + if (!populated_zone(zone)) + continue; + pfmemalloc_reserve += min_wmark_pages(zone); free_pages += zone_page_state(zone, NR_FREE_PAGES); } + /* If there are no reserves (unexpected config) then do not throttle */ + if (!pfmemalloc_reserve) + return true; + wmark_ok = free_pages > pfmemalloc_reserve / 2; /* kswapd must be awake if processes are being throttled */ @@ -2565,9 +2572,9 @@ static bool pfmemalloc_watermark_ok(pg_data_t *pgdat) static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, nodemask_t *nodemask) { + struct zoneref *z; struct zone *zone; - int high_zoneidx = gfp_zone(gfp_mask); - pg_data_t *pgdat; + pg_data_t *pgdat = NULL; /* * Kernel threads should not be throttled as they may be indirectly @@ -2586,10 +2593,34 @@ static bool throttle_direct_reclaim(gfp_t gfp_mask, struct zonelist *zonelist, if (fatal_signal_pending(current)) goto out; - /* Check if the pfmemalloc reserves are ok */ - first_zones_zonelist(zonelist, high_zoneidx, NULL, &zone); - pgdat = zone->zone_pgdat; - if (pfmemalloc_watermark_ok(pgdat)) + /* + * Check if the pfmemalloc reserves are ok by finding the first node + * with a usable ZONE_NORMAL or lower zone. The expectation is that + * GFP_KERNEL will be required for allocating network buffers when + * swapping over the network so ZONE_HIGHMEM is unusable. 
+ * + * Throttling is based on the first usable node and throttled processes + * wait on a queue until kswapd makes progress and wakes them. There + * is an affinity then between processes waking up and where reclaim + * progress has been made assuming the process wakes on the same node. + * More importantly, processes running on remote nodes will not compete + * for remote pfmemalloc reserves and processes on different nodes + * should make reasonable progress. + */ + for_each_zone_zonelist_nodemask(zone, z, zonelist, + gfp_mask, nodemask) { + if (zone_idx(zone) > ZONE_NORMAL) + continue; + + /* Throttle based on the first usable node */ + pgdat = zone->zone_pgdat; + if (pfmemalloc_watermark_ok(pgdat)) + goto out; + break; + } + + /* If no zone was usable by the allocation flags then do not throttle */ + if (!pgdat) goto out; /* Account for the throttling */ -- cgit v0.10.2 From d8dc595ce3909fbc131bdf5ab8c9808fe624b18d Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 4 Jun 2014 16:07:36 -0700 Subject: memcg: do not hang on OOM when killed by userspace OOM access to memory reserves Eric has reported that he can see task(s) stuck in memcg OOM handler regularly. The only way out is to echo 0 > $GROUP/memory.oom_control His usecase is: - Setup a hierarchy with memory and the freezer (disable kernel oom and have a process watch for oom). - In that memory cgroup add a process with one thread per cpu. - In one thread slowly allocate once per second I think it is 16M of ram and mlock and dirty it (just to force the pages into ram and stay there). - When oom is achieved loop: * attempt to freeze all of the tasks. * if frozen send every task SIGKILL, unfreeze, remove the directory in cgroupfs. Eric has then pinpointed the issue to be memcg specific. All tasks are sitting on the memcg_oom_waitq when memcg oom is disabled. Those that have received fatal signal will bypass the charge and should continue on their way out. 
The tricky part is that the exit path might trigger a page fault (e.g. exit_robust_list), thus the memcg charge, while its memcg is still under OOM because nobody has released any charges yet. Unlike with the in-kernel OOM handler the exiting task doesn't get TIF_MEMDIE set so it doesn't shortcut further charges of the killed task and falls to the memcg OOM again without any way out of it as there are no fatal signals pending anymore. This patch fixes the issue by checking PF_EXITING early in mem_cgroup_try_charge and bypass the charge same as if it had fatal signal pending or TIF_MEMDIE set. Normally exiting tasks (aka not killed) will bypass the charge now but this should be OK as the task is leaving and will release memory and increasing the memory pressure just to release it in a moment seems dubious wasting of cycles. Besides that charges after exit_signals should be rare. I am bringing this patch again (rebased on the current mmotm tree). I hope we can move forward finally. If there is still an opposition then I would really appreciate a concurrent approach so that we can discuss alternatives. http://comments.gmane.org/gmane.linux.kernel.stable/77650 is a reference to the followup discussion when the patch has been dropped from the mmotm last time. Reported-by: Eric W. Biederman Signed-off-by: Michal Hocko Acked-by: David Rientjes Acked-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c1b816f..9f4ff49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2684,7 +2684,8 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, * free their memory. 
*/ if (unlikely(test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current))) + fatal_signal_pending(current) || + current->flags & PF_EXITING)) goto bypass; if (unlikely(task_in_memcg_oom(current))) -- cgit v0.10.2 From 1e32e77f95d60b121b6072e3e3a650a7f93068f9 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:37 -0700 Subject: memcg, slab: do not schedule cache destruction when last page goes away This patchset is a part of preparations for kmemcg re-parenting. It targets at simplifying kmemcg work-flows and synchronization. First, it removes async per memcg cache destruction (see patches 1, 2). Now caches are only destroyed on memcg offline. That means the caches that are not empty on memcg offline will be leaked. However, they are already leaked, because memcg_cache_params::nr_pages normally never drops to 0 so the destruction work is never scheduled except kmem_cache_shrink is called explicitly. In the future I'm planning reaping such dead caches on vmpressure or periodically. Second, it substitutes per memcg slab_caches_mutex's with the global memcg_slab_mutex, which should be taken during the whole per memcg cache creation/destruction path before the slab_mutex (see patch 3). This greatly simplifies synchronization among various per memcg cache creation/destruction paths. I'm still not quite sure about the end picture, in particular I don't know whether we should reap dead memcgs' kmem caches periodically or try to merge them with their parents (see https://lkml.org/lkml/2014/4/20/38 for more details), but whichever way we choose, this set looks like a reasonable change to me, because it greatly simplifies kmemcg work-flows and eases further development. 
This patch (of 3): After a memcg is offlined, we mark its kmem caches that cannot be deleted right now due to pending objects as dead by setting the memcg_cache_params::dead flag, so that memcg_release_pages will schedule cache destruction (memcg_cache_params::destroy) as soon as the last slab of the cache is freed (memcg_cache_params::nr_pages drops to zero). I guess the idea was to destroy the caches as soon as possible, i.e. immediately after freeing the last object. However, it just doesn't work that way, because kmem caches always preserve some pages for the sake of performance, so that nr_pages never gets to zero unless the cache is shrunk explicitly using kmem_cache_shrink. Of course, we could account the total number of objects on the cache or check if all the slabs allocated for the cache are empty on kmem_cache_free and schedule destruction if so, but that would be too costly. Thus we have a piece of code that works only when we explicitly call kmem_cache_shrink, but complicates the whole picture a lot. Moreover, it's racy in fact. For instance, kmem_cache_shrink may free the last slab and thus schedule cache destruction before it finishes checking that the cache is empty, which can lead to use-after-free. So I propose to remove this async cache destruction from memcg_release_pages, and check if the cache is empty explicitly after calling kmem_cache_shrink instead. This will simplify things a lot w/o introducing any functional changes. And regarding dead memcg caches (i.e. those that are left hanging around after memcg offline for they have objects), I suppose we should reap them either periodically or on vmpressure as Glauber suggested initially. I'm going to implement this later. 
Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 5155d09..087a453 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -509,7 +509,6 @@ __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size); void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size); -void mem_cgroup_destroy_cache(struct kmem_cache *cachep); int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); /** diff --git a/include/linux/slab.h b/include/linux/slab.h index a6aab2c..905541d 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -524,7 +524,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @memcg: pointer to the memcg this cache belongs to * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from - * @dead: set to true after the memcg dies; the cache may still be around. * @nr_pages: number of pages that belongs to this cache. * @destroy: worker to be called whenever we are ready, or believe we may be * ready, to destroy this cache. @@ -540,7 +539,6 @@ struct memcg_cache_params { struct mem_cgroup *memcg; struct list_head list; struct kmem_cache *root_cache; - bool dead; atomic_t nr_pages; struct work_struct destroy; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9f4ff49..6b1c45c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3277,60 +3277,11 @@ static void kmem_cache_destroy_work_func(struct work_struct *w) cachep = memcg_params_to_cache(p); - /* - * If we get down to 0 after shrink, we could delete right away. - * However, memcg_release_pages() already puts us back in the workqueue - * in that case. 
If we proceed deleting, we'll get a dangling - * reference, and removing the object from the workqueue in that case - * is unnecessary complication. We are not a fast path. - * - * Note that this case is fundamentally different from racing with - * shrink_slab(): if memcg_cgroup_destroy_cache() is called in - * kmem_cache_shrink, not only we would be reinserting a dead cache - * into the queue, but doing so from inside the worker racing to - * destroy it. - * - * So if we aren't down to zero, we'll just schedule a worker and try - * again - */ - if (atomic_read(&cachep->memcg_params->nr_pages) != 0) - kmem_cache_shrink(cachep); - else + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) kmem_cache_destroy(cachep); } -void mem_cgroup_destroy_cache(struct kmem_cache *cachep) -{ - if (!cachep->memcg_params->dead) - return; - - /* - * There are many ways in which we can get here. - * - * We can get to a memory-pressure situation while the delayed work is - * still pending to run. The vmscan shrinkers can then release all - * cache memory and get us to destruction. If this is the case, we'll - * be executed twice, which is a bug (the second time will execute over - * bogus data). In this case, cancelling the work should be fine. - * - * But we can also get here from the worker itself, if - * kmem_cache_shrink is enough to shake all the remaining objects and - * get the page count to 0. In this case, we'll deadlock if we try to - * cancel the work (the worker runs with an internal lock held, which - * is the same lock we would hold for cancel_work_sync().) - * - * Since we can't possibly know who got us here, just refrain from - * running if there is already work pending - */ - if (work_pending(&cachep->memcg_params->destroy)) - return; - /* - * We have to defer the actual destroying to a workqueue, because - * we might currently be in a context that cannot sleep. 
- */ - schedule_work(&cachep->memcg_params->destroy); -} - int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; @@ -3356,16 +3307,7 @@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) * We will now manually delete the caches, so to avoid races * we need to cancel all pending destruction workers and * proceed with destruction ourselves. - * - * kmem_cache_destroy() will call kmem_cache_shrink internally, - * and that could spawn the workers again: it is likely that - * the cache still have active pages until this very moment. - * This would lead us back to mem_cgroup_destroy_cache. - * - * But that will not execute at all if the "dead" flag is not - * set, so flip it down to guarantee we are in control. */ - c->memcg_params->dead = false; cancel_work_sync(&c->memcg_params->destroy); kmem_cache_destroy(c); @@ -3387,7 +3329,6 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) mutex_lock(&memcg->slab_caches_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - cachep->memcg_params->dead = true; schedule_work(&cachep->memcg_params->destroy); } mutex_unlock(&memcg->slab_caches_mutex); diff --git a/mm/slab.h b/mm/slab.h index d85d598..b59447a 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -129,11 +129,8 @@ static inline void memcg_bind_pages(struct kmem_cache *s, int order) static inline void memcg_release_pages(struct kmem_cache *s, int order) { - if (is_root_cache(s)) - return; - - if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages)) - mem_cgroup_destroy_cache(s); + if (!is_root_cache(s)) + atomic_sub(1 << order, &s->memcg_params->nr_pages); } static inline bool slab_equal_or_root(struct kmem_cache *s, -- cgit v0.10.2 From c67a8a685a6e9abbaf0235e084168f15a721ae39 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:39 -0700 Subject: memcg, slab: merge memcg_{bind,release}_pages to memcg_{un}charge_slab Currently we 
have two pairs of kmemcg-related functions that are called on slab alloc/free. The first is memcg_{bind,release}_pages that count the total number of pages allocated on a kmem cache. The second is memcg_{un}charge_slab that {un}charge slab pages to kmemcg resource counter. Let's just merge them to keep the code clean. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 087a453..d38d190 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -506,8 +506,8 @@ void memcg_update_array_size(int num_groups); struct kmem_cache * __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size); -void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size); +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b1c45c..86a2078 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2954,7 +2954,7 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) } #endif -int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) +static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) { struct res_counter *fail_res; int ret = 0; @@ -2992,7 +2992,7 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) return ret; } -void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) +static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) { res_counter_uncharge(&memcg->res, size); if (do_swap_account) @@ -3390,6 +3390,24 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, __memcg_create_cache_enqueue(memcg, cachep); 
memcg_resume_kmem_account(); } + +int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order) +{ + int res; + + res = memcg_charge_kmem(cachep->memcg_params->memcg, gfp, + PAGE_SIZE << order); + if (!res) + atomic_add(1 << order, &cachep->memcg_params->nr_pages); + return res; +} + +void __memcg_uncharge_slab(struct kmem_cache *cachep, int order) +{ + memcg_uncharge_kmem(cachep->memcg_params->memcg, PAGE_SIZE << order); + atomic_sub(1 << order, &cachep->memcg_params->nr_pages); +} + /* * Return the kmem_cache we're supposed to use for a slab allocation. * We try to use the current memcg's version of the cache. diff --git a/mm/slab.c b/mm/slab.c index 7067ea7..9ca3b87 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1712,7 +1712,6 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, __SetPageSlab(page); if (page->pfmemalloc) SetPageSlabPfmemalloc(page); - memcg_bind_pages(cachep, cachep->gfporder); if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) { kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid); @@ -1748,7 +1747,6 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page) page_mapcount_reset(page); page->mapping = NULL; - memcg_release_pages(cachep, cachep->gfporder); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += nr_freed; __free_pages(page, cachep->gfporder); diff --git a/mm/slab.h b/mm/slab.h index b59447a..961a3fb 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -121,18 +121,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return !s->memcg_params || s->memcg_params->is_root_cache; } -static inline void memcg_bind_pages(struct kmem_cache *s, int order) -{ - if (!is_root_cache(s)) - atomic_add(1 << order, &s->memcg_params->nr_pages); -} - -static inline void memcg_release_pages(struct kmem_cache *s, int order) -{ - if (!is_root_cache(s)) - atomic_sub(1 << order, &s->memcg_params->nr_pages); -} - static inline bool slab_equal_or_root(struct kmem_cache *s, struct 
kmem_cache *p) { @@ -198,8 +186,7 @@ static __always_inline int memcg_charge_slab(struct kmem_cache *s, return 0; if (is_root_cache(s)) return 0; - return memcg_charge_kmem(s->memcg_params->memcg, gfp, - PAGE_SIZE << order); + return __memcg_charge_slab(s, gfp, order); } static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) @@ -208,7 +195,7 @@ static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order) return; if (is_root_cache(s)) return; - memcg_uncharge_kmem(s->memcg_params->memcg, PAGE_SIZE << order); + __memcg_uncharge_slab(s, order); } #else static inline bool is_root_cache(struct kmem_cache *s) @@ -216,14 +203,6 @@ static inline bool is_root_cache(struct kmem_cache *s) return true; } -static inline void memcg_bind_pages(struct kmem_cache *s, int order) -{ -} - -static inline void memcg_release_pages(struct kmem_cache *s, int order) -{ -} - static inline bool slab_equal_or_root(struct kmem_cache *s, struct kmem_cache *p) { diff --git a/mm/slub.c b/mm/slub.c index 5d1b653..9e288d7 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1422,7 +1422,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node) order = compound_order(page); inc_slabs_node(s, page_to_nid(page), page->objects); - memcg_bind_pages(s, order); page->slab_cache = s; __SetPageSlab(page); if (page->pfmemalloc) @@ -1473,7 +1472,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - memcg_release_pages(s, order); page_mapcount_reset(page); if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; -- cgit v0.10.2 From bd67314586a3d5725e60f2f6587b4cb0f659bb67 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:07:40 -0700 Subject: memcg, slab: simplify synchronization scheme At present, we have the following mutexes protecting data related to per memcg kmem caches: - slab_mutex. 
This one is held during the whole kmem cache creation and destruction paths. We also take it when updating per root cache memcg_caches arrays (see memcg_update_all_caches). As a result, taking it guarantees there will be no changes to any kmem cache (including per memcg). Why do we need something else then? The point is it is private to slab implementation and has some internal dependencies with other mutexes (get_online_cpus). So we just don't want to rely upon it and prefer to introduce additional mutexes instead. - activate_kmem_mutex. Initially it was added to synchronize initializing kmem limit (memcg_activate_kmem). However, since we can grow per root cache memcg_caches arrays only on kmem limit initialization (see memcg_update_all_caches), we also employ it to protect against memcg_caches arrays relocation (e.g. see __kmem_cache_destroy_memcg_children). - We have a convention not to take slab_mutex in memcontrol.c, but we want to walk over per memcg memcg_slab_caches lists there (e.g. for destroying all memcg caches on offline). So we have per memcg slab_caches_mutex's protecting those lists. The mutexes are taken in the following order: activate_kmem_mutex -> slab_mutex -> memcg::slab_caches_mutex Such a synchronization scheme has a number of flaws, for instance: - We can't call kmem_cache_{destroy,shrink} while walking over a memcg::memcg_slab_caches list due to locking order. As a result, in mem_cgroup_destroy_all_caches we schedule the memcg_cache_params::destroy work shrinking and destroying the cache. - We don't have a mutex to synchronize per memcg caches destruction between memcg offline (mem_cgroup_destroy_all_caches) and root cache destruction (__kmem_cache_destroy_memcg_children). Currently we just don't bother about it. This patch simplifies it by substituting per memcg slab_caches_mutex's with the global memcg_slab_mutex.
It will be held whenever a new per memcg cache is created or destroyed, so it protects per root cache memcg_caches arrays and per memcg memcg_slab_caches lists. The locking order is as follows: activate_kmem_mutex -> memcg_slab_mutex -> slab_mutex This allows us to call kmem_cache_{create,shrink,destroy} under the memcg_slab_mutex. As a result, we don't need memcg_cache_params::destroy work any more - we can simply destroy caches while iterating over a per memcg slab caches list. Also using the global mutex simplifies synchronization between concurrent per memcg caches creation/destruction, e.g. mem_cgroup_destroy_all_caches vs __kmem_cache_destroy_memcg_children. The downside of this is that we substitute per-memcg slab_caches_mutex's with a hammer-like global mutex, but since we already take either the slab_mutex or the cgroup_mutex along with a memcg::slab_caches_mutex, it shouldn't hurt concurrency a lot. Signed-off-by: Vladimir Davydov Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Glauber Costa Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d38d190..1fa2324 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -497,8 +497,6 @@ char *memcg_create_cache_name(struct mem_cgroup *memcg, int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache); void memcg_free_cache_params(struct kmem_cache *s); -void memcg_register_cache(struct kmem_cache *s); -void memcg_unregister_cache(struct kmem_cache *s); int memcg_update_cache_size(struct kmem_cache *s, int num_groups); void memcg_update_array_size(int num_groups); @@ -640,14 +638,6 @@ static inline void memcg_free_cache_params(struct kmem_cache *s) { } -static inline void memcg_register_cache(struct kmem_cache *s) -{ -} - -static inline void memcg_unregister_cache(struct kmem_cache *s) -{ -} - static inline struct kmem_cache *
memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp) { diff --git a/include/linux/slab.h b/include/linux/slab.h index 905541d..ecbec9c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,7 +116,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -void kmem_cache_create_memcg(struct mem_cgroup *, struct kmem_cache *); +struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *, + struct kmem_cache *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); @@ -525,8 +526,6 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) * @list: list_head for the list of all caches in this memcg * @root_cache: pointer to the global, root cache, this cache was derived from * @nr_pages: number of pages that belongs to this cache. - * @destroy: worker to be called whenever we are ready, or believe we may be - * ready, to destroy this cache. */ struct memcg_cache_params { bool is_root_cache; @@ -540,7 +539,6 @@ struct memcg_cache_params { struct list_head list; struct kmem_cache *root_cache; atomic_t nr_pages; - struct work_struct destroy; }; }; }; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 86a2078..6b44888 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -357,10 +357,9 @@ struct mem_cgroup { struct cg_proto tcp_mem; #endif #if defined(CONFIG_MEMCG_KMEM) - /* analogous to slab_common's slab_caches list. 
per-memcg */ + /* analogous to slab_common's slab_caches list, but per-memcg; + * protected by memcg_slab_mutex */ struct list_head memcg_slab_caches; - /* Not a spinlock, we can take a lot of time walking the list */ - struct mutex slab_caches_mutex; /* Index in the kmem_cache->memcg_params->memcg_caches array */ int kmemcg_id; #endif @@ -2913,6 +2912,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, static DEFINE_MUTEX(set_limit_mutex); #ifdef CONFIG_MEMCG_KMEM +/* + * The memcg_slab_mutex is held whenever a per memcg kmem cache is created or + * destroyed. It protects memcg_caches arrays and memcg_slab_caches lists. + */ +static DEFINE_MUTEX(memcg_slab_mutex); + static DEFINE_MUTEX(activate_kmem_mutex); static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) @@ -2945,10 +2950,10 @@ static int mem_cgroup_slabinfo_read(struct seq_file *m, void *v) print_slabinfo_header(m); - mutex_lock(&memcg->slab_caches_mutex); + mutex_lock(&memcg_slab_mutex); list_for_each_entry(params, &memcg->memcg_slab_caches, list) cache_show(memcg_params_to_cache(params), m); - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); return 0; } @@ -3050,8 +3055,6 @@ void memcg_update_array_size(int num) memcg_limited_groups_array_size = memcg_caches_array_size(num); } -static void kmem_cache_destroy_work_func(struct work_struct *w); - int memcg_update_cache_size(struct kmem_cache *s, int num_groups) { struct memcg_cache_params *cur_params = s->memcg_params; @@ -3148,8 +3151,6 @@ int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); css_get(&memcg->css); } else s->memcg_params->is_root_cache = true; @@ -3166,24 +3167,34 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -void memcg_register_cache(struct kmem_cache *s) +static void 
memcg_kmem_create_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *root; - struct mem_cgroup *memcg; + struct kmem_cache *cachep; int id; - if (is_root_cache(s)) + lockdep_assert_held(&memcg_slab_mutex); + + id = memcg_cache_id(memcg); + + /* + * Since per-memcg caches are created asynchronously on first + * allocation (see memcg_kmem_get_cache()), several threads can try to + * create the same cache, but only one of them may succeed. + */ + if (cache_from_memcg_idx(root_cache, id)) return; + cachep = kmem_cache_create_memcg(memcg, root_cache); /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. + * If we could not create a memcg cache, do not complain, because + * that's not critical at all as we can always proceed with the root + * cache. */ - lockdep_assert_held(&slab_mutex); + if (!cachep) + return; - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; - id = memcg_cache_id(memcg); + list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); /* * Since readers won't lock (see cache_from_memcg_idx()), we need a @@ -3192,49 +3203,30 @@ void memcg_register_cache(struct kmem_cache *s) */ smp_wmb(); - /* - * Initialize the pointer to this cache in its parent's memcg_params - * before adding it to the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. 
- */ - VM_BUG_ON(root->memcg_params->memcg_caches[id]); - root->memcg_params->memcg_caches[id] = s; - - mutex_lock(&memcg->slab_caches_mutex); - list_add(&s->memcg_params->list, &memcg->memcg_slab_caches); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id]); + root_cache->memcg_params->memcg_caches[id] = cachep; } -void memcg_unregister_cache(struct kmem_cache *s) +static void memcg_kmem_destroy_cache(struct kmem_cache *cachep) { - struct kmem_cache *root; + struct kmem_cache *root_cache; struct mem_cgroup *memcg; int id; - if (is_root_cache(s)) - return; + lockdep_assert_held(&memcg_slab_mutex); - /* - * Holding the slab_mutex assures nobody will touch the memcg_caches - * array while we are modifying it. - */ - lockdep_assert_held(&slab_mutex); + BUG_ON(is_root_cache(cachep)); - root = s->memcg_params->root_cache; - memcg = s->memcg_params->memcg; + root_cache = cachep->memcg_params->root_cache; + memcg = cachep->memcg_params->memcg; id = memcg_cache_id(memcg); - mutex_lock(&memcg->slab_caches_mutex); - list_del(&s->memcg_params->list); - mutex_unlock(&memcg->slab_caches_mutex); + BUG_ON(root_cache->memcg_params->memcg_caches[id] != cachep); + root_cache->memcg_params->memcg_caches[id] = NULL; - /* - * Clear the pointer to this cache in its parent's memcg_params only - * after removing it from the memcg_slab_caches list, otherwise we can - * fail to convert memcg_params_to_cache() while traversing the list. 
- */ - VM_BUG_ON(root->memcg_params->memcg_caches[id] != s); - root->memcg_params->memcg_caches[id] = NULL; + list_del(&cachep->memcg_params->list); + + kmem_cache_destroy(cachep); } /* @@ -3268,70 +3260,42 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -static void kmem_cache_destroy_work_func(struct work_struct *w) -{ - struct kmem_cache *cachep; - struct memcg_cache_params *p; - - p = container_of(w, struct memcg_cache_params, destroy); - - cachep = memcg_params_to_cache(p); - - kmem_cache_shrink(cachep); - if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - kmem_cache_destroy(cachep); -} - int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) { struct kmem_cache *c; int i, failed = 0; - /* - * If the cache is being destroyed, we trust that there is no one else - * requesting objects from it. Even if there are, the sanity checks in - * kmem_cache_destroy should caught this ill-case. - * - * Still, we don't want anyone else freeing memcg_caches under our - * noses, which can happen if a new memcg comes to life. As usual, - * we'll take the activate_kmem_mutex to protect ourselves against - * this. - */ - mutex_lock(&activate_kmem_mutex); + mutex_lock(&memcg_slab_mutex); for_each_memcg_cache_index(i) { c = cache_from_memcg_idx(s, i); if (!c) continue; - /* - * We will now manually delete the caches, so to avoid races - * we need to cancel all pending destruction workers and - * proceed with destruction ourselves. 
- */ - cancel_work_sync(&c->memcg_params->destroy); - kmem_cache_destroy(c); + memcg_kmem_destroy_cache(c); if (cache_from_memcg_idx(s, i)) failed++; } - mutex_unlock(&activate_kmem_mutex); + mutex_unlock(&memcg_slab_mutex); return failed; } static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; - struct memcg_cache_params *params; + struct memcg_cache_params *params, *tmp; if (!memcg_kmem_is_active(memcg)) return; - mutex_lock(&memcg->slab_caches_mutex); - list_for_each_entry(params, &memcg->memcg_slab_caches, list) { + mutex_lock(&memcg_slab_mutex); + list_for_each_entry_safe(params, tmp, &memcg->memcg_slab_caches, list) { cachep = memcg_params_to_cache(params); - schedule_work(&cachep->memcg_params->destroy); + kmem_cache_shrink(cachep); + if (atomic_read(&cachep->memcg_params->nr_pages) == 0) + memcg_kmem_destroy_cache(cachep); } - mutex_unlock(&memcg->slab_caches_mutex); + mutex_unlock(&memcg_slab_mutex); } struct create_work { @@ -3346,7 +3310,10 @@ static void memcg_create_cache_work_func(struct work_struct *w) struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; - kmem_cache_create_memcg(memcg, cachep); + mutex_lock(&memcg_slab_mutex); + memcg_kmem_create_cache(memcg, cachep); + mutex_unlock(&memcg_slab_mutex); + css_put(&memcg->css); kfree(cw); } @@ -5022,13 +4989,14 @@ static int __memcg_activate_kmem(struct mem_cgroup *memcg, * Make sure we have enough space for this cgroup in each root cache's * memcg_params. 
*/ + mutex_lock(&memcg_slab_mutex); err = memcg_update_all_caches(memcg_id + 1); + mutex_unlock(&memcg_slab_mutex); if (err) goto out_rmid; memcg->kmemcg_id = memcg_id; INIT_LIST_HEAD(&memcg->memcg_slab_caches); - mutex_init(&memcg->slab_caches_mutex); /* * We couldn't have accounted to this cgroup, because it hasn't got the diff --git a/mm/slab_common.c b/mm/slab_common.c index 2dd920d..7e348cf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -160,7 +160,6 @@ do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align, s->refcount = 1; list_add(&s->list, &slab_caches); - memcg_register_cache(s); out: if (err) return ERR_PTR(err); @@ -270,9 +269,10 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ -void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_cache) +struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { - struct kmem_cache *s; + struct kmem_cache *s = NULL; char *cache_name; get_online_cpus(); @@ -280,14 +280,6 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c mutex_lock(&slab_mutex); - /* - * Since per-memcg caches are created asynchronously on first - * allocation (see memcg_kmem_get_cache()), several threads can try to - * create the same cache, but only one of them may succeed. 
- */ - if (cache_from_memcg_idx(root_cache, memcg_cache_id(memcg))) - goto out_unlock; - cache_name = memcg_create_cache_name(memcg, root_cache); if (!cache_name) goto out_unlock; @@ -296,14 +288,18 @@ void kmem_cache_create_memcg(struct mem_cgroup *memcg, struct kmem_cache *root_c root_cache->size, root_cache->align, root_cache->flags, root_cache->ctor, memcg, root_cache); - if (IS_ERR(s)) + if (IS_ERR(s)) { kfree(cache_name); + s = NULL; + } out_unlock: mutex_unlock(&slab_mutex); put_online_mems(); put_online_cpus(); + + return s; } static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) @@ -348,11 +344,8 @@ void kmem_cache_destroy(struct kmem_cache *s) goto out_unlock; list_del(&s->list); - memcg_unregister_cache(s); - if (__kmem_cache_shutdown(s) != 0) { list_add(&s->list, &slab_caches); - memcg_register_cache(s); printk(KERN_ERR "kmem_cache_destroy %s: " "Slab cache still has objects\n", s->name); dump_stack(); -- cgit v0.10.2 From 11de9927f9dd3cb0a0f18064fa4b6976fc37e79c Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:07:41 -0700 Subject: mm: numa: add migrated transhuge pages to LRU the same way as base pages Migration of misplaced transhuge pages uses page_add_new_anon_rmap() when putting the page back as it avoided an atomic operation and added the new page to the correct LRU. A side-effect is that the page gets marked activated as part of the migration meaning that transhuge and base pages are treated differently from an aging perspective than base page migration. This patch uses page_add_anon_rmap() and putback_lru_page() on completion of a transhuge migration similar to base page migration. It would require fewer atomic operations to use lru_cache_add without taking an additional reference to the page. The downside would be that it's still different to base page migration and unevictable pages may be added to the wrong LRU for cleaning up later. Testing of the usual workloads did not show any adverse impact to the change. 
Signed-off-by: Mel Gorman Cc: Rik van Riel Cc: Sasha Levin Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/migrate.c b/mm/migrate.c index bed4880..6247be7 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1852,7 +1852,7 @@ fail_putback: * guarantee the copy is visible before the pagetable update. */ flush_cache_range(vma, mmun_start, mmun_end); - page_add_new_anon_rmap(new_page, vma, mmun_start); + page_add_anon_rmap(new_page, vma, mmun_start); pmdp_clear_flush(vma, mmun_start, pmd); set_pmd_at(mm, mmun_start, pmd, entry); flush_tlb_range(vma, mmun_start, mmun_end); @@ -1877,6 +1877,10 @@ fail_putback: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); + /* Take an "isolate" reference and put new page on the LRU. */ + get_page(new_page); + putback_lru_page(new_page); + unlock_page(new_page); unlock_page(page); put_page(page); /* Drop the rmap reference */ -- cgit v0.10.2 From 399ba0b95670c70aaaa3f4f1623ea9e76c391681 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 4 Jun 2014 16:07:42 -0700 Subject: mm/vmscan.c: avoid throttling reclaim for loop-back nfsd threads When a loopback NFS mount is active and the backing device for the NFS mount becomes congested, that can impose throttling delays on the nfsd threads. These delays significantly reduce throughput and so the NFS mount remains congested. This results in a livelock and the reduced throughput persists. This livelock has been found in testing with the 'wait_iff_congested' call, and could possibly be caused by the 'congestion_wait' call. This livelock is similar to the deadlock which justified the introduction of PF_LESS_THROTTLE, and the same flag can be used to remove this livelock. To minimise the impact of the change, we still throttle nfsd when the filesystem it is writing to is congested, but not when some separate filesystem (e.g. the NFS filesystem) is congested. 
Signed-off-by: NeilBrown Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index 53e4534..5a8776e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1439,6 +1439,19 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list) } /* + * If a kernel thread (such as nfsd for loop-back mounts) services + * a backing device by writing to the page cache it sets PF_LESS_THROTTLE. + * In that case we should only throttle if the backing device it is + * writing to is congested. In other cases it is safe to throttle. + */ +static int current_may_throttle(void) +{ + return !(current->flags & PF_LESS_THROTTLE) || + current->backing_dev_info == NULL || + bdi_write_congested(current->backing_dev_info); +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1566,7 +1579,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * implies that pages are cycling through the LRU faster than * they are written so also forcibly stall. */ - if (nr_unqueued_dirty == nr_taken || nr_immediate) + if ((nr_unqueued_dirty == nr_taken || nr_immediate) && + current_may_throttle()) congestion_wait(BLK_RW_ASYNC, HZ/10); } @@ -1575,7 +1589,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * is congested. Allow kswapd to continue until it starts encountering * unqueued dirty pages or cycling through the LRU too quickly. 
*/ - if (!sc->hibernation_mode && !current_is_kswapd()) + if (!sc->hibernation_mode && !current_is_kswapd() && + current_may_throttle()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, -- cgit v0.10.2 From 1b938c0827478df268d2336469ec48d400a2eb3e Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:43 -0700 Subject: fs/buffer.c: remove block_write_full_page_endio() The last in-tree caller of block_write_full_page_endio() was removed in January 2013. It's time to remove the EXPORT_SYMBOL, which leaves block_write_full_page() as the only caller of block_write_full_page_endio(), so inline block_write_full_page_endio() into block_write_full_page(). Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Cc: Dave Chinner Cc: Dheeraj Reddy Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/buffer.c b/fs/buffer.c index 6a8110c..e33f8d5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2879,10 +2879,9 @@ EXPORT_SYMBOL(block_truncate_page); /* * The generic ->writepage function for buffer-backed address_spaces - * this form passes in the end_io handler used to finish the IO. */ -int block_write_full_page_endio(struct page *page, get_block_t *get_block, - struct writeback_control *wbc, bh_end_io_t *handler) +int block_write_full_page(struct page *page, get_block_t *get_block, + struct writeback_control *wbc) { struct inode * const inode = page->mapping->host; loff_t i_size = i_size_read(inode); @@ -2892,7 +2891,7 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, /* Is the page fully inside i_size? */ if (page->index < end_index) return __block_write_full_page(inode, page, get_block, wbc, - handler); + end_buffer_async_write); /* Is the page fully outside i_size? 
(truncate in progress) */ offset = i_size & (PAGE_CACHE_SIZE-1); @@ -2915,18 +2914,8 @@ int block_write_full_page_endio(struct page *page, get_block_t *get_block, * writes to that region are not written out to the file." */ zero_user_segment(page, offset, PAGE_CACHE_SIZE); - return __block_write_full_page(inode, page, get_block, wbc, handler); -} -EXPORT_SYMBOL(block_write_full_page_endio); - -/* - * The generic ->writepage function for buffer-backed address_spaces - */ -int block_write_full_page(struct page *page, get_block_t *get_block, - struct writeback_control *wbc) -{ - return block_write_full_page_endio(page, get_block, wbc, - end_buffer_async_write); + return __block_write_full_page(inode, page, get_block, wbc, + end_buffer_async_write); } EXPORT_SYMBOL(block_write_full_page); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index c18d95b..1a64e7a 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -429,7 +429,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, block_start = bh_offset(bh); if (block_start >= len) { /* - * Comments copied from block_write_full_page_endio: + * Comments copied from block_write_full_page: * * The page straddles i_size. It must be zeroed out on * each and every writepage invocation because it may diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 8970dcf..8eb6e57 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -828,7 +828,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* * fs-writeback will release the dirty pages without page lock * whose offset are over inode size, the release happens at - * block_write_full_page_endio(). + * block_write_full_page(). 
*/ i_size_write(inode, abs_to); inode->i_blocks = ocfs2_inode_sector_count(inode); diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 7cbf837..324329c 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -207,8 +207,6 @@ void block_invalidatepage(struct page *page, unsigned int offset, unsigned int length); int block_write_full_page(struct page *page, get_block_t *get_block, struct writeback_control *wbc); -int block_write_full_page_endio(struct page *page, get_block_t *get_block, - struct writeback_control *wbc, bh_end_io_t *handler); int block_read_full_page(struct page*, get_block_t*); int block_is_partially_uptodate(struct page *page, unsigned long from, unsigned long count); -- cgit v0.10.2 From 90768eee4565adb28ea28b4ac5081c676a8fe1f2 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:44 -0700 Subject: fs/mpage.c: factor clean_buffers() out of __mpage_writepage() __mpage_writepage() is over 200 lines long, has 20 local variables, four goto labels and could desperately use simplification. Splitting clean_buffers() into a helper function improves matters a little, removing 20+ lines from it. Signed-off-by: Matthew Wilcox Cc: Dave Chinner Cc: Dheeraj Reddy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/mpage.c b/fs/mpage.c index 4979ffa..4cc9c5d 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -439,6 +439,35 @@ struct mpage_data { unsigned use_writepage; }; +/* + * We have our BIO, so we can now mark the buffers clean. Make + * sure to only clean buffers which we know we'll be writing. 
+ */ +static void clean_buffers(struct page *page, unsigned first_unmapped) +{ + unsigned buffer_counter = 0; + struct buffer_head *bh, *head; + if (!page_has_buffers(page)) + return; + head = page_buffers(page); + bh = head; + + do { + if (buffer_counter++ == first_unmapped) + break; + clear_buffer_dirty(bh); + bh = bh->b_this_page; + } while (bh != head); + + /* + * we cannot drop the bh if the page is not uptodate or a concurrent + * readpage would fail to serialize with the bh and it would read from + * disk before we reach the platter. + */ + if (buffer_heads_over_limit && PageUptodate(page)) + try_to_free_buffers(page); +} + static int __mpage_writepage(struct page *page, struct writeback_control *wbc, void *data) { @@ -591,30 +620,7 @@ alloc_new: goto alloc_new; } - /* - * OK, we have our BIO, so we can now mark the buffers clean. Make - * sure to only clean buffers which we know we'll be writing. - */ - if (page_has_buffers(page)) { - struct buffer_head *head = page_buffers(page); - struct buffer_head *bh = head; - unsigned buffer_counter = 0; - - do { - if (buffer_counter++ == first_unmapped) - break; - clear_buffer_dirty(bh); - bh = bh->b_this_page; - } while (bh != head); - - /* - * we cannot drop the bh if the page is not uptodate - * or a concurrent readpage would fail to serialize with the bh - * and it would read from disk before we reach the platter. - */ - if (buffer_heads_over_limit && PageUptodate(page)) - try_to_free_buffers(page); - } + clean_buffers(page, first_unmapped); BUG_ON(PageWriteback(page)); set_page_writeback(page); -- cgit v0.10.2 From 57d998456ae8680ed446aa1993f45f4d8a9a5973 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:45 -0700 Subject: fs/mpage.c: factor page_endio() out of mpage_end_io() page_endio() takes care of updating all the appropriate page flags once I/O has finished to a page. 
Switch to using mapping_set_error() instead of setting AS_EIO directly; this will handle thin-provisioned devices correctly. Signed-off-by: Matthew Wilcox Cc: Dave Chinner Cc: Dheeraj Reddy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/mpage.c b/fs/mpage.c index 4cc9c5d..10da0da 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -48,23 +48,7 @@ static void mpage_end_io(struct bio *bio, int err) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - - if (bio_data_dir(bio) == READ) { - if (!err) { - SetPageUptodate(page); - } else { - ClearPageUptodate(page); - SetPageError(page); - } - unlock_page(page); - } else { /* bio_data_dir(bio) == WRITE */ - if (err) { - SetPageError(page); - if (page->mapping) - set_bit(AS_EIO, &page->mapping->flags); - } - end_page_writeback(page); - } + page_endio(page, bio_data_dir(bio), err); } bio_put(bio); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 45598f1..718214c 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -425,6 +425,8 @@ static inline void wait_on_page_writeback(struct page *page) extern void end_page_writeback(struct page *page); void wait_for_stable_page(struct page *page); +void page_endio(struct page *page, int rw, int err); + /* * Add an arbitrary waiter to a page's wait queue */ diff --git a/mm/filemap.c b/mm/filemap.c index 021056c..47d235b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -764,6 +764,31 @@ void end_page_writeback(struct page *page) } EXPORT_SYMBOL(end_page_writeback); +/* + * After completing I/O on a page, call this routine to update the page + * flags appropriately + */ +void page_endio(struct page *page, int rw, int err) +{ + if (rw == READ) { + if (!err) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { /* rw == WRITE */ + if (err) { + SetPageError(page); + if (page->mapping) + mapping_set_error(page->mapping, err); + } + 
end_page_writeback(page); + } +} +EXPORT_SYMBOL_GPL(page_endio); + /** * __lock_page - get a lock on the page, assuming we need to sleep to get it * @page: the page to lock -- cgit v0.10.2 From 47a191fd38ebddb1bd1510ec2bc1085c578c8868 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:46 -0700 Subject: fs/block_dev.c: add bdev_read_page() and bdev_write_page() A block device driver may choose to provide a rw_page operation. These will be called when the filesystem is attempting to do page sized I/O to page cache pages (ie not for direct I/O). This does preclude I/Os that are larger than page size, so this may only be a performance gain for some devices. Signed-off-by: Matthew Wilcox Tested-by: Dheeraj Reddy Cc: Dave Chinner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/block_dev.c b/fs/block_dev.c index 552a8d1..83fba15 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -363,6 +363,69 @@ int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) } EXPORT_SYMBOL(blkdev_fsync); +/** + * bdev_read_page() - Start reading a page from a block device + * @bdev: The device to read the page from + * @sector: The offset on the device to read the page to (need not be aligned) + * @page: The page to read + * + * On entry, the page should be locked. It will be unlocked when the page + * has been read. If the block driver implements rw_page synchronously, + * that will be true on exit from this function, but it need not be. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to read this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. 
+ */ +int bdev_read_page(struct block_device *bdev, sector_t sector, + struct page *page) +{ + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + return ops->rw_page(bdev, sector + get_start_sect(bdev), page, READ); +} +EXPORT_SYMBOL_GPL(bdev_read_page); + +/** + * bdev_write_page() - Start writing a page to a block device + * @bdev: The device to write the page to + * @sector: The offset on the device to write the page to (need not be aligned) + * @page: The page to write + * @wbc: The writeback_control for the write + * + * On entry, the page should be locked and not currently under writeback. + * On exit, if the write started successfully, the page will be unlocked and + * under writeback. If the write failed already (eg the driver failed to + * queue the page to the device), the page will still be locked. If the + * caller is a ->writepage implementation, it will need to unlock the page. + * + * Errors returned by this function are usually "soft", eg out of memory, or + * queue full; callers should try a different route to write this page rather + * than propagate an error back up the stack. + * + * Return: negative errno if an error occurs, 0 if submission was successful. + */ +int bdev_write_page(struct block_device *bdev, sector_t sector, + struct page *page, struct writeback_control *wbc) +{ + int result; + int rw = (wbc->sync_mode == WB_SYNC_ALL) ? 
WRITE_SYNC : WRITE; + const struct block_device_operations *ops = bdev->bd_disk->fops; + if (!ops->rw_page) + return -EOPNOTSUPP; + set_page_writeback(page); + result = ops->rw_page(bdev, sector + get_start_sect(bdev), page, rw); + if (result) + end_page_writeback(page); + else + unlock_page(page); + return result; +} +EXPORT_SYMBOL_GPL(bdev_write_page); + /* * pseudo-fs */ diff --git a/fs/mpage.c b/fs/mpage.c index 10da0da..5f9ed62 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -269,6 +269,11 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, alloc_new: if (bio == NULL) { + if (first_hole == blocks_per_page) { + if (!bdev_read_page(bdev, blocks[0] << (blkbits - 9), + page)) + goto out; + } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), min_t(int, nr_pages, bio_get_nr_vecs(bdev)), GFP_KERNEL); @@ -587,6 +592,13 @@ page_is_mapped: alloc_new: if (bio == NULL) { + if (first_unmapped == blocks_per_page) { + if (!bdev_write_page(bdev, blocks[0] << (blkbits - 9), + page, wbc)) { + clean_buffers(page, first_unmapped); + goto out; + } + } bio = mpage_alloc(bdev, blocks[0] << (blkbits - 9), bio_get_nr_vecs(bdev), GFP_NOFS|__GFP_HIGH); if (bio == NULL) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 45cf6e5..2f3886e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1588,6 +1588,7 @@ static inline bool blk_integrity_is_initialized(struct gendisk *g) struct block_device_operations { int (*open) (struct block_device *, fmode_t); void (*release) (struct gendisk *, fmode_t); + int (*rw_page)(struct block_device *, sector_t, struct page *, int rw); int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); int (*direct_access) (struct block_device *, sector_t, @@ -1606,6 +1607,9 @@ struct block_device_operations { extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, unsigned long); +extern int 
bdev_read_page(struct block_device *, sector_t, struct page *); +extern int bdev_write_page(struct block_device *, sector_t, struct page *, + struct writeback_control *); #else /* CONFIG_BLOCK */ struct block_device; -- cgit v0.10.2 From dd6bd0d9c7dbb3951005db4e424abbd37c4ff801 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:48 -0700 Subject: swap: use bdev_read_page() / bdev_write_page() By calling the device driver to write the page directly, we avoid allocating a BIO, which allows us to free memory without allocating memory. [akpm@linux-foundation.org: fix used-uninitialized bug] Signed-off-by: Matthew Wilcox Cc: Dave Chinner Cc: Dheeraj Reddy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_io.c b/mm/page_io.c index 7c59ef6..58b50d2 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -248,11 +248,16 @@ out: return ret; } +static sector_t swap_page_sector(struct page *page) +{ + return (sector_t)__page_file_index(page) << (PAGE_CACHE_SHIFT - 9); +} + int __swap_writepage(struct page *page, struct writeback_control *wbc, void (*end_write_func)(struct bio *, int)) { struct bio *bio; - int ret = 0, rw = WRITE; + int ret, rw = WRITE; struct swap_info_struct *sis = page_swap_info(page); if (sis->flags & SWP_FILE) { @@ -297,6 +302,13 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, return ret; } + ret = bdev_write_page(sis->bdev, swap_page_sector(page), page, wbc); + if (!ret) { + count_vm_event(PSWPOUT); + return 0; + } + + ret = 0; bio = get_swap_bio(GFP_NOIO, page, end_write_func); if (bio == NULL) { set_page_dirty(page); @@ -338,6 +350,13 @@ int swap_readpage(struct page *page) return ret; } + ret = bdev_read_page(sis->bdev, swap_page_sector(page), page); + if (!ret) { + count_vm_event(PSWPIN); + return 0; + } + + ret = 0; bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); -- cgit v0.10.2 From 
a72132c31d580969a38972aaf925915e861cd342 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:49 -0700 Subject: brd: add support for rw_page() Signed-off-by: Matthew Wilcox Cc: Dave Chinner Cc: Dheeraj Reddy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/brd.c b/drivers/block/brd.c index e73b85c..807d3d5 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -360,6 +360,15 @@ out: bio_endio(bio, err); } +static int brd_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + struct brd_device *brd = bdev->bd_disk->private_data; + int err = brd_do_bvec(brd, page, PAGE_CACHE_SIZE, 0, rw, sector); + page_endio(page, rw & WRITE, err); + return err; +} + #ifdef CONFIG_BLK_DEV_XIP static int brd_direct_access(struct block_device *bdev, sector_t sector, void **kaddr, unsigned long *pfn) @@ -419,6 +428,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode, static const struct block_device_operations brd_fops = { .owner = THIS_MODULE, + .rw_page = brd_rw_page, .ioctl = brd_ioctl, #ifdef CONFIG_BLK_DEV_XIP .direct_access = brd_direct_access, -- cgit v0.10.2 From 96f8d8e0965ba3d895f4cf63bbb16fd9fca8998b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:07:50 -0700 Subject: brd: return -ENOSPC rather than -ENOMEM on page allocation failure brd is effectively a thinly provisioned device. Thinly provisioned devices return -ENOSPC when they can't write a new block. -ENOMEM is an implementation detail that callers shouldn't know. 
Signed-off-by: Matthew Wilcox Acked-by: Dave Chinner Cc: Dheeraj Reddy Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 807d3d5..c7d138e 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -200,11 +200,11 @@ static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) copy = min_t(size_t, n, PAGE_SIZE - offset); if (!brd_insert_page(brd, sector)) - return -ENOMEM; + return -ENOSPC; if (copy < n) { sector += copy >> SECTOR_SHIFT; if (!brd_insert_page(brd, sector)) - return -ENOMEM; + return -ENOSPC; } return 0; } @@ -384,7 +384,7 @@ static int brd_direct_access(struct block_device *bdev, sector_t sector, return -ERANGE; page = brd_insert_page(brd, sector); if (!page) - return -ENOMEM; + return -ENOSPC; *kaddr = page_address(page); *pfn = page_to_pfn(page); -- cgit v0.10.2 From c8e861a531b0199dc6ef9e402e29c474dfa507ce Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:07:51 -0700 Subject: mm/memory_hotplug.c: use PFN_DOWN() Replace ((x) >> PAGE_SHIFT) with the pfn macro. 
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2906873..cbb7ca0 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1069,7 +1069,7 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) struct pglist_data *pgdat; unsigned long zones_size[MAX_NR_ZONES] = {0}; unsigned long zholes_size[MAX_NR_ZONES] = {0}; - unsigned long start_pfn = start >> PAGE_SHIFT; + unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); if (!pgdat) { @@ -1141,7 +1141,7 @@ out: static int check_hotplug_memory_range(u64 start, u64 size) { - u64 start_pfn = start >> PAGE_SHIFT; + u64 start_pfn = PFN_DOWN(start); u64 nr_pages = size >> PAGE_SHIFT; /* Memory range must be aligned with section */ -- cgit v0.10.2 From f7e2f7e896d8b74e92b687f7333721fd7be0f4b5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:07:51 -0700 Subject: mm/memblock.c: use PFN_DOWN Replace ((x) >> PAGE_SHIFT) with the pfn macro. 
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memblock.c b/mm/memblock.c index 1467364..0aa0d2b 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -1402,9 +1402,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn, if (mid == -1) return -1; - *start_pfn = type->regions[mid].base >> PAGE_SHIFT; - *end_pfn = (type->regions[mid].base + type->regions[mid].size) - >> PAGE_SHIFT; + *start_pfn = PFN_DOWN(type->regions[mid].base); + *end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size); return type->regions[mid].nid; } -- cgit v0.10.2 From f87fb599ae4d2a152a93f9821b94f3158146d097 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:52 -0700 Subject: memcg: mm_update_next_owner() should skip kthreads "Search through everything else" in mm_update_next_owner() can hit a kthread which adopted this "mm" via use_mm(), it should not be used as mm->owner. Add the PF_KTHREAD check. While at it, change this code to use for_each_process_thread() instead of deprecated do_each_thread/while_each_thread. Signed-off-by: Oleg Nesterov Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Peter Chiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/exit.c b/kernel/exit.c index da1b838..5ac3c19 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -395,14 +395,12 @@ retry: } /* - * Search through everything else. We should not get - * here often + * Search through everything else, we should not get here often. 
*/ - do_each_thread(g, c) { - if (c->mm == mm) + for_each_process_thread(g, c) { + if (!(c->flags & PF_KTHREAD) && c->mm == mm) goto assign_new_owner; - } while_each_thread(g, c); - + } read_unlock(&tasklist_lock); /* * We found no owner yet mm_users > 1: this implies that we are -- cgit v0.10.2 From 39af1765f1255b2bbadc3064e16270781abf24a1 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:54 -0700 Subject: memcg: optimize the "Search everything else" loop in mm_update_next_owner() for_each_process_thread() is sub-optimal. All threads share the same ->mm, we can switch to the next process once we found a thread with ->mm != NULL and ->mm != mm. Signed-off-by: Oleg Nesterov Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Peter Chiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/exit.c b/kernel/exit.c index 5ac3c19..750c2e5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -397,9 +397,15 @@ retry: /* * Search through everything else, we should not get here often. */ - for_each_process_thread(g, c) { - if (!(c->flags & PF_KTHREAD) && c->mm == mm) - goto assign_new_owner; + for_each_process(g) { + if (g->flags & PF_KTHREAD) + continue; + for_each_thread(g, c) { + if (c->mm == mm) + goto assign_new_owner; + if (c->mm) + break; + } } read_unlock(&tasklist_lock); /* -- cgit v0.10.2 From dc6f6c97f1d3d58fef81f0f9db0c7d068b2cf392 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:07:55 -0700 Subject: memcg: kill start_kernel()->mm_init_owner(&init_mm) Remove start_kernel()->mm_init_owner(&init_mm, &init_task). This doesn't really hurt, but it is unnecessary and misleading. init_task is the "swapper" thread == current, its ->mm is always NULL. And init_mm can only be used as ->active_mm, not as ->mm. mm_init_owner() has a single caller with this patch, perhaps it should die. mm_init() can initialize ->owner under #ifdef. 
Signed-off-by: Oleg Nesterov Reviewed-by: Michal Hocko Cc: Balbir Singh Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Cc: Peter Chiang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/main.c b/init/main.c index 48655ce..e08c0b2 100644 --- a/init/main.c +++ b/init/main.c @@ -507,7 +507,6 @@ asmlinkage __visible void __init start_kernel(void) page_address_init(); pr_notice("%s", linux_banner); setup_arch(&command_line); - mm_init_owner(&init_mm, &init_task); mm_init_cpumask(&init_mm); setup_command_line(command_line); setup_nr_cpu_ids(); -- cgit v0.10.2 From 7c8e0181e6e0b8079c4c2ce902bf52d7a2c6fa5d Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Wed, 4 Jun 2014 16:07:56 -0700 Subject: mm: replace __get_cpu_var uses with this_cpu_ptr Replace places where __get_cpu_var() is used for an address calculation with this_cpu_ptr(). Signed-off-by: Christoph Lameter Cc: Tejun Heo Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 9599aa7..55f7a9c 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -194,7 +194,7 @@ radix_tree_node_alloc(struct radix_tree_root *root) * succeed in getting a node here (and never reach * kmem_cache_alloc) */ - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr) { ret = rtp->nodes[rtp->nr - 1]; rtp->nodes[rtp->nr - 1] = NULL; @@ -250,14 +250,14 @@ static int __radix_tree_preload(gfp_t gfp_mask) int ret = -ENOMEM; preempt_disable(); - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); while (rtp->nr < ARRAY_SIZE(rtp->nodes)) { preempt_enable(); node = kmem_cache_alloc(radix_tree_node_cachep, gfp_mask); if (node == NULL) goto out; preempt_disable(); - rtp = &__get_cpu_var(radix_tree_preloads); + rtp = this_cpu_ptr(&radix_tree_preloads); if (rtp->nr < ARRAY_SIZE(rtp->nodes)) rtp->nodes[rtp->nr++] = node; else diff --git a/mm/memcontrol.c 
b/mm/memcontrol.c index 6b44888..1432693 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2436,7 +2436,7 @@ static void drain_stock(struct memcg_stock_pcp *stock) */ static void drain_local_stock(struct work_struct *dummy) { - struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); + struct memcg_stock_pcp *stock = this_cpu_ptr(&memcg_stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); } diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6917f79..d50f17f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1298,7 +1298,7 @@ static void memory_failure_work_func(struct work_struct *work) unsigned long proc_flags; int gotten; - mf_cpu = &__get_cpu_var(memory_failure_cpu); + mf_cpu = this_cpu_ptr(&memory_failure_cpu); for (;;) { spin_lock_irqsave(&mf_cpu->lock, proc_flags); gotten = kfifo_get(&mf_cpu->fifo, &entry); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index a4317da..b9b8e82 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1623,7 +1623,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * 1000+ tasks, all of them start dirtying pages at exactly the same * time, hence all honoured too large initial task->nr_dirtied_pause. */ - p = &__get_cpu_var(bdp_ratelimits); + p = this_cpu_ptr(&bdp_ratelimits); if (unlikely(current->nr_dirtied >= ratelimit)) *p = 0; else if (unlikely(*p >= ratelimit_pages)) { @@ -1635,7 +1635,7 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping) * short-lived tasks (eg. gcc invocations in a kernel build) escaping * the dirty throttling and livelock other long-run dirtiers. 
*/ - p = &__get_cpu_var(dirty_throttle_leaks); + p = this_cpu_ptr(&dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); diff --git a/mm/slub.c b/mm/slub.c index 9e288d7..fdf0fe4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2209,7 +2209,7 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, page = new_slab(s, flags, node); if (page) { - c = __this_cpu_ptr(s->cpu_slab); + c = raw_cpu_ptr(s->cpu_slab); if (c->page) flush_slab(s, c); @@ -2425,7 +2425,7 @@ redo: * and the retrieval of the tid. */ preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = this_cpu_ptr(s->cpu_slab); /* * The transaction ids are globally unique per cpu and per operation on @@ -2681,7 +2681,7 @@ redo: * during the cmpxchg then the free will succedd. */ preempt_disable(); - c = __this_cpu_ptr(s->cpu_slab); + c = this_cpu_ptr(s->cpu_slab); tid = c->tid; preempt_enable(); diff --git a/mm/swap.c b/mm/swap.c index c0ed4d6..913b99d 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -441,7 +441,7 @@ void rotate_reclaimable_page(struct page *page) page_cache_get(page); local_irq_save(flags); - pvec = &__get_cpu_var(lru_rotate_pvecs); + pvec = this_cpu_ptr(&lru_rotate_pvecs); if (!pagevec_add(pvec, page)) pagevec_move_tail(pvec); local_irq_restore(flags); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index bf233b2..ddaf70b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1496,7 +1496,7 @@ void vfree(const void *addr) if (!addr) return; if (unlikely(in_interrupt())) { - struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); + struct vfree_deferred *p = this_cpu_ptr(&vfree_deferred); if (llist_add((struct llist_node *)addr, &p->list)) schedule_work(&p->wq); } else diff --git a/mm/vmstat.c b/mm/vmstat.c index 82ce17c..376bd2d 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -489,7 +489,7 @@ static void refresh_cpu_vm_stats(void) continue; if (__this_cpu_read(p->pcp.count)) - 
drain_zone_pages(zone, __this_cpu_ptr(&p->pcp)); + drain_zone_pages(zone, this_cpu_ptr(&p->pcp)); #endif } fold_diff(global_diff); @@ -1230,7 +1230,7 @@ int sysctl_stat_interval __read_mostly = HZ; static void vmstat_update(struct work_struct *w) { refresh_cpu_vm_stats(); - schedule_delayed_work(&__get_cpu_var(vmstat_work), + schedule_delayed_work(this_cpu_ptr(&vmstat_work), round_jiffies_relative(sysctl_stat_interval)); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 36b4591..5ae5d85 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1082,7 +1082,7 @@ void zs_unmap_object(struct zs_pool *pool, unsigned long handle) class = &pool->size_class[class_idx]; off = obj_idx_to_offset(page, obj_idx, class->size); - area = &__get_cpu_var(zs_map_area); + area = this_cpu_ptr(&zs_map_area); if (off + class->size <= PAGE_SIZE) kunmap_atomic(area->vm_addr); else { -- cgit v0.10.2 From f7f28ca98b9a7a99fc55df2dddcf49857ab004f0 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 4 Jun 2014 16:07:57 -0700 Subject: mm: constify nmask argument to mbind() The nmask argument to mbind() is const according to the userspace header numaif.h, and since the kernel does indeed not modify it, it might as well be declared const in the kernel. 
Signed-off-by: Rasmus Villemoes Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index a4a0588..bfef0be 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -723,7 +723,7 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, int flags); asmlinkage long sys_mbind(unsigned long start, unsigned long len, unsigned long mode, - unsigned long __user *nmask, + const unsigned long __user *nmask, unsigned long maxnode, unsigned flags); asmlinkage long sys_get_mempolicy(int __user *policy, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 78e1472..727187f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1362,7 +1362,7 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, } SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, - unsigned long, mode, unsigned long __user *, nmask, + unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned, flags) { nodemask_t nodes; -- cgit v0.10.2 From 23c8902d403ef9a04cdc367d0b76a3ed6d83f5c5 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Wed, 4 Jun 2014 16:07:58 -0700 Subject: mm: constify nmask argument to set_mempolicy() The nmask argument to set_mempolicy() is const according to the user-space header numaif.h, and since the kernel does indeed not modify it, it might as well be declared const in the kernel. 
Signed-off-by: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index bfef0be..b0881a0 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -711,7 +711,7 @@ asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3, asmlinkage long sys_ioprio_set(int which, int who, int ioprio); asmlinkage long sys_ioprio_get(int which, int who); -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, +asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask, unsigned long maxnode); asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *from, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 727187f..b09586d 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1383,7 +1383,7 @@ SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, } /* Set the process memory policy */ -SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, +SYSCALL_DEFINE3(set_mempolicy, int, mode, const unsigned long __user *, nmask, unsigned long, maxnode) { int err; -- cgit v0.10.2 From c747ce7907ab11be53d65ef55c53821558720d8f Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:07:59 -0700 Subject: mm/swap.c: introduce put_[un]refcounted_compound_page helpers for splitting put_compound_page() Currently, put_compound_page() carefully handles tricky cases to avoid racing with compound page releasing or splitting, which makes it quite lengthy (about 200+ lines) and needs deep tab indentation, which makes it quite hard to follow and maintain. This patch and the next patch refactor this function. 
Based on the code skeleton of put_compound_page: put_compound_page: if !PageTail(page) put head page fastpath; return; /* else PageTail */ page_head = compound_head(page) if !__compound_tail_refcounted(page_head) put head page optimal path; <---(1) return; else put head page slowpath; <--- (2) return; This patch introduces two helpers, put_[un]refcounted_compound_page, handling the code path (1) and code path (2), respectively. They both are tagged __always_inline, thus eliminating function call overhead, making them operate the same way as before. They are almost copied verbatim (except one place, a "goto out_put_single" is expanded), with some comments rephrasing. Signed-off-by: Jianyu Zhan Cc: Kirill A. Shutemov Cc: Rik van Riel Cc: Jiang Liu Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Sasha Levin Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swap.c b/mm/swap.c index 913b99d..54f3ae4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -79,6 +79,148 @@ static void __put_compound_page(struct page *page) (*dtor)(page); } +/** + * Two special cases here: we could avoid taking compound_lock_irqsave + * and could skip the tail refcounting(in _mapcount). + * + * 1. Hugetlbfs page: + * + * PageHeadHuge will remain true until the compound page + * is released and enters the buddy allocator, and it could + * not be split by __split_huge_page_refcount(). + * + * So if we see PageHeadHuge set, and we have the tail page pin, + * then we could safely put head page. + * + * 2. Slab THP page: + * + * PG_slab is cleared before the slab frees the head page, and + * tail pin cannot be the last reference left on the head page, + * because the slab code is free to reuse the compound page + * after a kfree/kmem_cache_free without having to check if + * there's any tail pin left. 
In turn all tail pinsmust be always + * released while the head is still pinned by the slab code + * and so we know PG_slab will be still set too. + * + * So if we see PageSlab set, and we have the tail page pin, + * then we could safely put head page. + */ +static __always_inline +void put_unrefcounted_compound_page(struct page *page_head, struct page *page) +{ + /* + * If @page is a THP tail, we must read the tail page + * flags after the head page flags. The + * __split_huge_page_refcount side enforces write memory barriers + * between clearing PageTail and before the head page + * can be freed and reallocated. + */ + smp_rmb(); + if (likely(PageTail(page))) { + /* + * __split_huge_page_refcount cannot race + * here, see the comment above this function. + */ + VM_BUG_ON_PAGE(!PageHead(page_head), page_head); + VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); + if (put_page_testzero(page_head)) { + /* + * If this is the tail of a slab THP page, + * the tail pin must not be the last reference + * held on the page, because the PG_slab cannot + * be cleared before all tail pins (which skips + * the _mapcount tail refcounting) have been + * released. + * + * If this is the tail of a hugetlbfs page, + * the tail pin may be the last reference on + * the page instead, because PageHeadHuge will + * not go away until the compound page enters + * the buddy allocator. + */ + VM_BUG_ON_PAGE(PageSlab(page_head), page_head); + __put_compound_page(page_head); + } + } else + /* + * __split_huge_page_refcount run before us, + * @page was a THP tail. The split @page_head + * has been freed and reallocated as slab or + * hugetlbfs page of smaller order (only + * possible if reallocated as slab on x86). 
+ */ + if (put_page_testzero(page)) + __put_single_page(page); +} + +static __always_inline +void put_refcounted_compound_page(struct page *page_head, struct page *page) +{ + if (likely(page != page_head && get_page_unless_zero(page_head))) { + unsigned long flags; + + /* + * @page_head wasn't a dangling pointer but it may not + * be a head page anymore by the time we obtain the + * lock. That is ok as long as it can't be freed from + * under us. + */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) { + /* + * The @page_head may have been freed + * and reallocated as a compound page + * of smaller order and then freed + * again. All we know is that it + * cannot have become: a THP page, a + * compound page of higher order, a + * tail page. That is because we + * still hold the refcount of the + * split THP tail and page_head was + * the THP head before the split. + */ + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } +out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON_PAGE(page_head != page->first_page, page); + /* + * We can release the refcount taken by + * get_page_unless_zero() now that + * __split_huge_page_refcount() is blocked on the + * compound_lock. 
+ */ + if (put_page_testzero(page_head)) + VM_BUG_ON_PAGE(1, page_head); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); + atomic_dec(&page->_mapcount); + VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); + VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); + compound_unlock_irqrestore(page_head, flags); + + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } + } else { + /* @page_head is a dangling pointer */ + VM_BUG_ON_PAGE(PageTail(page), page); + goto out_put_single; + } +} + static void put_compound_page(struct page *page) { struct page *page_head; -- cgit v0.10.2 From 4bd3e8f7b94785a6f65665fee21ff3dbc2bf4ef8 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:08:01 -0700 Subject: mm/swap.c: split put_compound_page() Currently, put_compound_page() carefully handles tricky cases to avoid racing with compound page releasing or splitting, which makes it quite lengthy (about 200+ lines) and needs deep tab indentation, which makes it quite hard to follow and maintain. Now based on two helpers introduced in the previous patch ("mm/swap.c: introduce put_[un]refcounted_compound_page helpers for splitting put_compound_page"), this patch replaces those two lengthy code paths with these two helpers, respectively. Also, it has some comment rephrasing. After this patch, the put_compound_page() is very compact, thus easy to read and maintain. After splitting, the object file is of same size as the original one. Actually, I've diff'ed put_compound_page()'s original disassembly and the patched disassembly, and they are 100% the same! This fact shows that this splitting has no functional change, but it brings readability. This patch and the previous one grow the code by 32 lines, mostly due to comments. Signed-off-by: Jianyu Zhan Cc: Kirill A. 
Shutemov Cc: Rik van Riel Cc: Jiang Liu Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Sasha Levin Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swap.c b/mm/swap.c index 54f3ae4..d089c5a 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -225,6 +225,11 @@ static void put_compound_page(struct page *page) { struct page *page_head; + /* + * We see the PageCompound set and PageTail not set, so @page maybe: + * 1. hugetlbfs head page, or + * 2. THP head page. + */ if (likely(!PageTail(page))) { if (put_page_testzero(page)) { /* @@ -239,135 +244,20 @@ static void put_compound_page(struct page *page) return; } - /* __split_huge_page_refcount can run under us */ - page_head = compound_head(page); - /* - * THP can not break up slab pages so avoid taking - * compound_lock() and skip the tail page refcounting (in - * _mapcount) too. Slab performs non-atomic bit ops on - * page->flags for better performance. In particular - * slab_unlock() in slub used to be a hot path. It is still - * hot on arches that do not support - * this_cpu_cmpxchg_double(). + * We see the PageCompound set and PageTail set, so @page maybe: + * 1. a tail hugetlbfs page, or + * 2. a tail THP page, or + * 3. a split THP page. * - * If "page" is part of a slab or hugetlbfs page it cannot be - * splitted and the head page cannot change from under us. And - * if "page" is part of a THP page under splitting, if the - * head page pointed by the THP tail isn't a THP head anymore, - * we'll find PageTail clear after smp_rmb() and we'll treat - * it as a single page. + * Case 3 is possible, as we may race with + * __split_huge_page_refcount tearing down a THP page. */ - if (!__compound_tail_refcounted(page_head)) { - /* - * If "page" is a THP tail, we must read the tail page - * flags after the head page flags. 
The - * split_huge_page side enforces write memory barriers - * between clearing PageTail and before the head page - * can be freed and reallocated. - */ - smp_rmb(); - if (likely(PageTail(page))) { - /* - * __split_huge_page_refcount cannot race - * here. - */ - VM_BUG_ON_PAGE(!PageHead(page_head), page_head); - VM_BUG_ON_PAGE(page_mapcount(page) != 0, page); - if (put_page_testzero(page_head)) { - /* - * If this is the tail of a slab - * compound page, the tail pin must - * not be the last reference held on - * the page, because the PG_slab - * cannot be cleared before all tail - * pins (which skips the _mapcount - * tail refcounting) have been - * released. For hugetlbfs the tail - * pin may be the last reference on - * the page instead, because - * PageHeadHuge will not go away until - * the compound page enters the buddy - * allocator. - */ - VM_BUG_ON_PAGE(PageSlab(page_head), page_head); - __put_compound_page(page_head); - } - return; - } else - /* - * __split_huge_page_refcount run before us, - * "page" was a THP tail. The split page_head - * has been freed and reallocated as slab or - * hugetlbfs page of smaller order (only - * possible if reallocated as slab on x86). - */ - goto out_put_single; - } - - if (likely(page != page_head && get_page_unless_zero(page_head))) { - unsigned long flags; - - /* - * page_head wasn't a dangling pointer but it may not - * be a head page anymore by the time we obtain the - * lock. That is ok as long as it can't be freed from - * under us. - */ - flags = compound_lock_irqsave(page_head); - if (unlikely(!PageTail(page))) { - /* __split_huge_page_refcount run before us */ - compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) { - /* - * The head page may have been freed - * and reallocated as a compound page - * of smaller order and then freed - * again. All we know is that it - * cannot have become: a THP page, a - * compound page of higher order, a - * tail page. 
That is because we - * still hold the refcount of the - * split THP tail and page_head was - * the THP head before the split. - */ - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } -out_put_single: - if (put_page_testzero(page)) - __put_single_page(page); - return; - } - VM_BUG_ON_PAGE(page_head != page->first_page, page); - /* - * We can release the refcount taken by - * get_page_unless_zero() now that - * __split_huge_page_refcount() is blocked on the - * compound_lock. - */ - if (put_page_testzero(page_head)) - VM_BUG_ON_PAGE(1, page_head); - /* __split_huge_page_refcount will wait now */ - VM_BUG_ON_PAGE(page_mapcount(page) <= 0, page); - atomic_dec(&page->_mapcount); - VM_BUG_ON_PAGE(atomic_read(&page_head->_count) <= 0, page_head); - VM_BUG_ON_PAGE(atomic_read(&page->_count) != 0, page); - compound_unlock_irqrestore(page_head, flags); - - if (put_page_testzero(page_head)) { - if (PageHead(page_head)) - __put_compound_page(page_head); - else - __put_single_page(page_head); - } - } else { - /* page_head is a dangling pointer */ - VM_BUG_ON_PAGE(PageTail(page), page); - goto out_put_single; - } + page_head = compound_head(page); + if (!__compound_tail_refcounted(page_head)) + put_unrefcounted_compound_page(page_head, page); + else + put_refcounted_compound_page(page_head, page); } void put_page(struct page *page) -- cgit v0.10.2 From d2ee40eae98d8a41ff27dcdd13b1b656c4c1ad00 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:08:02 -0700 Subject: mm: introduce compound_head_by_tail() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, in put_compound_page(), we have ====== if (likely(!PageTail(page))) { <------ (1) if (put_page_testzero(page)) { /* * By the time all refcounts have been released * split_huge_page cannot run anymore from under us. 
*/ if (PageHead(page)) __put_compound_page(page); else __put_single_page(page); } return; } /* __split_huge_page_refcount can run under us */ page_head = compound_head(page); <------------ (2) ====== if at (1) , we fail the check, this means page is *likely* a tail page. Then at (2), as compound_head(page) is inlined, it is : ====== static inline struct page *compound_head(struct page *page) { if (unlikely(PageTail(page))) { <----------- (3) struct page *head = page->first_page; smp_rmb(); if (likely(PageTail(page))) return head; } return page; } ====== here, the (3) unlikely in the case is a negative hint, because it is *likely* a tail page. So the check (3) in this case is not good, so I introduce a helper for this case. So this patch introduces compound_head_by_tail() which deals with a possible tail page (though it could be split by a racy thread), and make compound_head() a wrapper on it. This patch has no functional change, and it reduces the object size slightly: text data bss dec hex filename 11003 1328 16 12347 303b mm/swap.o.orig 10971 1328 16 12315 301b mm/swap.o.patched I've run "perf top -e branch-miss" to observe branch-miss in this case. As Michael points out, it's a slow path, so only very few times this case happens. But I grep'ed the code base, and found there still are some other call sites that could benefit from this helper. And given that it only bloats the source by 5 lines, but with a reduced object size, I still believe this helper deserves to exist. Signed-off-by: Jianyu Zhan Cc: Kirill A. 
Shutemov Cc: Rik van Riel Cc: Jiang Liu Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Cc: Andrea Arcangeli Cc: Sasha Levin Cc: Wanpeng Li Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mm.h b/include/linux/mm.h index d677706..3686006 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -407,20 +407,25 @@ static inline void compound_unlock_irqrestore(struct page *page, #endif } +static inline struct page *compound_head_by_tail(struct page *tail) +{ + struct page *head = tail->first_page; + + /* + * page->first_page may be a dangling pointer to an old + * compound page, so recheck that it is still a tail + * page before returning. + */ + smp_rmb(); + if (likely(PageTail(tail))) + return head; + return tail; +} + static inline struct page *compound_head(struct page *page) { - if (unlikely(PageTail(page))) { - struct page *head = page->first_page; - - /* - * page->first_page may be a dangling pointer to an old - * compound page, so recheck that it is still a tail - * page before returning. - */ - smp_rmb(); - if (likely(PageTail(page))) - return head; - } + if (unlikely(PageTail(page))) + return compound_head_by_tail(page); return page; } diff --git a/mm/swap.c b/mm/swap.c index d089c5a..c8d6df5 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -253,7 +253,7 @@ static void put_compound_page(struct page *page) * Case 3 is possible, as we may race with * __split_huge_page_refcount tearing down a THP page. */ - page_head = compound_head(page); + page_head = compound_head_by_tail(page); if (!__compound_tail_refcounted(page_head)) put_unrefcounted_compound_page(page_head, page); else -- cgit v0.10.2 From 1754e44e8291c92b9d981a6eca59f28dd25f03ab Mon Sep 17 00:00:00 2001 From: Wang Sheng-Hui Date: Wed, 4 Jun 2014 16:08:04 -0700 Subject: include/linux/bootmem.h: cleanup the comment for BOOTMEM_ flags Use BOOTMEM_DEFAULT instead of 0 in the comment. 
Signed-off-by: Wang Sheng-Hui Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index db51fe4..4e2bd4c 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -58,9 +58,9 @@ extern void free_bootmem_late(unsigned long physaddr, unsigned long size); * Flags for reserve_bootmem (also if CONFIG_HAVE_ARCH_BOOTMEM_NODE, * the architecture-specific code should honor this). * - * If flags is 0, then the return value is always 0 (success). If - * flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the - * memory already was reserved. + * If flags is BOOTMEM_DEFAULT, then the return value is always 0 (success). + * If flags contains BOOTMEM_EXCLUSIVE, then -EBUSY is returned if the memory + * already was reserved. */ #define BOOTMEM_DEFAULT 0 #define BOOTMEM_EXCLUSIVE (1<<0) -- cgit v0.10.2 From cc6b664aa26de93d9a3f99d4021a8d88b434ed06 Mon Sep 17 00:00:00 2001 From: Daeseok Youn Date: Wed, 4 Jun 2014 16:08:05 -0700 Subject: mm/dmapool.c: remove redundant NULL check for dev in dma_pool_create() "dev" cannot be NULL because it is already checked before calling dma_pool_create(). If dev ever was NULL, the code would oops in dev_to_node() after enabling CONFIG_NUMA. It is possible that some driver is using dev==NULL and has never been run on a NUMA machine. Such a driver is probably outdated, possibly buggy and will need some attention if it starts triggering NULL derefs. 
Signed-off-by: Daeseok Youn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/dmapool.c b/mm/dmapool.c index 8058fcd..a3a1bfe 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -170,24 +170,16 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, retval->boundary = boundary; retval->allocation = allocation; - if (dev) { - int ret; + INIT_LIST_HEAD(&retval->pools); - mutex_lock(&pools_lock); - if (list_empty(&dev->dma_pools)) - ret = device_create_file(dev, &dev_attr_pools); - else - ret = 0; - /* note: not currently insisting "name" be unique */ - if (!ret) - list_add(&retval->pools, &dev->dma_pools); - else { - kfree(retval); - retval = NULL; - } - mutex_unlock(&pools_lock); + mutex_lock(&pools_lock); + if (list_empty(&dev->dma_pools) && + device_create_file(dev, &dev_attr_pools)) { + kfree(retval); + return NULL; } else - INIT_LIST_HEAD(&retval->pools); + list_add(&retval->pools, &dev->dma_pools); + mutex_unlock(&pools_lock); return retval; } -- cgit v0.10.2 From 7fe7047597cf5ebb300802494db4f407327ec94f Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:08:06 -0700 Subject: mm: shrinker trace points: fix negatives I was looking at a trace of the slab shrinkers (attachment in this comment): https://bugs.freedesktop.org/show_bug.cgi?id=72742#c67 and noticed that "total_scan" can go negative in some cases. We used to dump out the "total_scan" variable directly, but some of the shrinker modifications along the way changed that. This patch just dumps it out directly, again. It doesn't make any sense to derive it from new_nr and nr any more since there are now other shrinkers that can be running in parallel and mucking with those values. 
Here's an example of the negative numbers in the output: > kswapd0-840 [000] 160.869398: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 10 new scan count 39 total_scan 29 last shrinker return val 256 > kswapd0-840 [000] 160.869618: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 39 new scan count 102 total_scan 63 last shrinker return val 256 > kswapd0-840 [000] 160.870031: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 102 new scan count 47 total_scan -55 last shrinker return val 768 > kswapd0-840 [000] 160.870464: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 47 new scan count 45 total_scan -2 last shrinker return val 768 > kswapd0-840 [000] 163.384144: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 45 new scan count 56 total_scan 11 last shrinker return val 0 > kswapd0-840 [000] 163.384297: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 56 new scan count 15 total_scan -41 last shrinker return val 256 > kswapd0-840 [000] 163.384414: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 15 new scan count 117 total_scan 102 last shrinker return val 0 > kswapd0-840 [000] 163.384657: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 117 new scan count 36 total_scan -81 last shrinker return val 512 > kswapd0-840 [000] 163.384880: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 36 new scan count 111 total_scan 75 last shrinker return val 256 > kswapd0-840 [000] 163.385256: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 111 new scan count 34 total_scan -77 last shrinker return val 768 > kswapd0-840 [000] 163.385598: mm_shrink_slab_end: i915_gem_inactive_scan+0x0 0xffff8800037cbc68: unused scan count 34 new 
scan count 122 total_scan 88 last shrinker return val 512 Signed-off-by: Dave Hansen Acked-by: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 132a985..1dd5e77 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -226,9 +226,9 @@ TRACE_EVENT(mm_shrink_slab_start, TRACE_EVENT(mm_shrink_slab_end, TP_PROTO(struct shrinker *shr, int shrinker_retval, - long unused_scan_cnt, long new_scan_cnt), + long unused_scan_cnt, long new_scan_cnt, long total_scan), - TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt), + TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan), TP_STRUCT__entry( __field(struct shrinker *, shr) @@ -245,7 +245,7 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; __entry->retval = shrinker_retval; - __entry->total_scan = new_scan_cnt - unused_scan_cnt; + __entry->total_scan = total_scan; ), TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", diff --git a/mm/vmscan.c b/mm/vmscan.c index 5a8776e..15e9315 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr); + trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr, total_scan); return freed; } -- cgit v0.10.2 From df9024a8c5a3e031c5df26386f74ffed1b8fc095 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 4 Jun 2014 16:08:07 -0700 Subject: mm: shrinker: add nid to tracepoint output Now that we are doing NUMA-aware shrinking, and can have shrinkers running in parallel, or working on individual nodes, it seems like we should also be sticking the node in the output. 
Signed-off-by: Dave Hansen Acked-by: Dave Chinner Cc: Konstantin Khlebnikov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index 1dd5e77..69590b6 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -191,6 +191,7 @@ TRACE_EVENT(mm_shrink_slab_start, TP_STRUCT__entry( __field(struct shrinker *, shr) __field(void *, shrink) + __field(int, nid) __field(long, nr_objects_to_shrink) __field(gfp_t, gfp_flags) __field(unsigned long, pgs_scanned) @@ -203,6 +204,7 @@ TRACE_EVENT(mm_shrink_slab_start, TP_fast_assign( __entry->shr = shr; __entry->shrink = shr->scan_objects; + __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = sc->gfp_mask; __entry->pgs_scanned = pgs_scanned; @@ -212,9 +214,10 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->total_scan = total_scan; ), - TP_printk("%pF %p: objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", __entry->shrink, __entry->shr, + __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), __entry->pgs_scanned, @@ -225,13 +228,15 @@ TRACE_EVENT(mm_shrink_slab_start, ); TRACE_EVENT(mm_shrink_slab_end, - TP_PROTO(struct shrinker *shr, int shrinker_retval, + TP_PROTO(struct shrinker *shr, int nid, int shrinker_retval, long unused_scan_cnt, long new_scan_cnt, long total_scan), - TP_ARGS(shr, shrinker_retval, unused_scan_cnt, new_scan_cnt, total_scan), + TP_ARGS(shr, nid, shrinker_retval, unused_scan_cnt, new_scan_cnt, + total_scan), TP_STRUCT__entry( __field(struct shrinker *, shr) + __field(int, nid) __field(void *, shrink) __field(long, unused_scan) __field(long, new_scan) @@ -241,6 +246,7 @@ TRACE_EVENT(mm_shrink_slab_end, TP_fast_assign( __entry->shr = shr; + 
__entry->nid = nid; __entry->shrink = shr->scan_objects; __entry->unused_scan = unused_scan_cnt; __entry->new_scan = new_scan_cnt; @@ -248,9 +254,10 @@ TRACE_EVENT(mm_shrink_slab_end, __entry->total_scan = total_scan; ), - TP_printk("%pF %p: unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", + TP_printk("%pF %p: nid: %d unused scan count %ld new scan count %ld total_scan %ld last shrinker return val %d", __entry->shrink, __entry->shr, + __entry->nid, __entry->unused_scan, __entry->new_scan, __entry->total_scan, diff --git a/mm/vmscan.c b/mm/vmscan.c index 15e9315..9253e18 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -324,7 +324,7 @@ shrink_slab_node(struct shrink_control *shrinkctl, struct shrinker *shrinker, else new_nr = atomic_long_read(&shrinker->nr_deferred[nid]); - trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr, total_scan); + trace_mm_shrink_slab_end(shrinker, nid, freed, nr, new_nr, total_scan); return freed; } -- cgit v0.10.2 From ada4ba591472f511ad56dd0075c457295c3ca317 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:08:08 -0700 Subject: mm/memcontrol.c: remove NULL assignment on static Static variables are automatically zero-initialized (0 for integers, NULL for pointers), so the explicit initializers are redundant. Signed-off-by: Fabian Frederick Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1432693..03d7662 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -80,7 +80,7 @@ int do_swap_account __read_mostly; #ifdef CONFIG_MEMCG_SWAP_ENABLED static int really_do_swap_account __initdata = 1; #else -static int really_do_swap_account __initdata = 0; +static int really_do_swap_account __initdata; #endif #else @@ -3110,7 +3110,7 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) char *memcg_create_cache_name(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char *buf = NULL; + static char *buf; /* * We need a mutex here to protect the shared buffer.
Since this is -- cgit v0.10.2 From f4527c90868d8fa175c68ccf216cf9b67a7d8a1a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:08:09 -0700 Subject: mm/vmalloc.c: replace seq_printf by seq_puts Replace seq_printf where possible Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index ddaf70b..2ed75fb 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2619,19 +2619,19 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, " phys=%llx", (unsigned long long)v->phys_addr); if (v->flags & VM_IOREMAP) - seq_printf(m, " ioremap"); + seq_puts(m, " ioremap"); if (v->flags & VM_ALLOC) - seq_printf(m, " vmalloc"); + seq_puts(m, " vmalloc"); if (v->flags & VM_MAP) - seq_printf(m, " vmap"); + seq_puts(m, " vmap"); if (v->flags & VM_USERMAP) - seq_printf(m, " user"); + seq_puts(m, " user"); if (v->flags & VM_VPAGES) - seq_printf(m, " vpages"); + seq_puts(m, " vpages"); show_numa_info(m, v); seq_putc(m, '\n'); -- cgit v0.10.2 From 4bbd4c776a63a063546552de42f6a535395f6d9e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:10 -0700 Subject: mm: move get_user_pages()-related code to separate file mm/memory.c is overloaded: over 4k lines. get_user_pages() code is pretty much self-contained, so let's move it to a separate file. No other changes made. Signed-off-by: Kirill A.
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/Makefile b/mm/Makefile index 0173940..4064f3e 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -3,7 +3,7 @@ # mmu-y := nommu.o -mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ +mmu-$(CONFIG_MMU) := fremap.o gup.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ vmalloc.o pagewalk.o pgtable-generic.o diff --git a/mm/gup.c b/mm/gup.c new file mode 100644 index 0000000..ea88b65 --- /dev/null +++ b/mm/gup.c @@ -0,0 +1,649 @@ +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). 
+ */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); + goto out; + } + + page = NULL; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + goto no_page_table; + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + goto no_page_table; + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + if (flags & FOLL_GET) + goto out; + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + goto out; + } + if (unlikely(pud_bad(*pud))) + goto no_page_table; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto no_page_table; + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. 
+ */ + if (PageHead(page)) + get_page(page); + else { + page = NULL; + goto out; + } + } + goto out; + } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + goto no_page_table; + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(vma, address, pmd); + goto split_fallthrough; + } + ptl = pmd_lock(mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(vma, address, + pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + goto out; + } + } else + spin_unlock(ptl); + /* fall through */ + } +split_fallthrough: + if (unlikely(pmd_bad(*pmd))) + goto no_page_table; + + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + + pte = *ptep; + if (!pte_present(pte)) { + swp_entry_t entry; + /* + * KSM's break_ksm() relies upon recognizing a ksm page + * even while it is being migrated, so for that case we + * need migration_entry_wait(). + */ + if (likely(!(flags & FOLL_MIGRATION))) + goto no_page; + if (pte_none(pte) || pte_file(pte)) + goto no_page; + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto no_page; + pte_unmap_unlock(ptep, ptl); + migration_entry_wait(mm, pmd, address); + goto split_fallthrough; + } + if ((flags & FOLL_NUMA) && pte_numa(pte)) + goto no_page; + if ((flags & FOLL_WRITE) && !pte_write(pte)) + goto unlock; + + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) { + if ((flags & FOLL_DUMP) || + !is_zero_pfn(pte_pfn(pte))) + goto bad_page; + page = pte_page(pte); + } + + if (flags & FOLL_GET) + get_page_foll(page); + if (flags & FOLL_TOUCH) { + if ((flags & FOLL_WRITE) && + !pte_dirty(pte) && !PageDirty(page)) + set_page_dirty(page); + /* + * pte_mkyoung() would be more correct here, but atomic care + * is needed to avoid losing the dirty bit: it is easier to use + * mark_page_accessed(). 
+ */ + mark_page_accessed(page); + } + if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here, and migration is + * blocked by the pte's page reference, and we + * know the page is still mapped, we don't even + * need to check for file-cache page truncation. + */ + mlock_vma_page(page); + unlock_page(page); + } + } +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return page; + +bad_page: + pte_unmap_unlock(ptep, ptl); + return ERR_PTR(-EFAULT); + +no_page: + pte_unmap_unlock(ptep, ptl); + if (!pte_none(pte)) + return page; + +no_page_table: + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. 
+ */ + if ((flags & FOLL_DUMP) && + (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); + return page; +} + +static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) +{ + return stack_guard_page_start(vma, addr) || + stack_guard_page_end(vma, addr+PAGE_SIZE); +} + +/** + * __get_user_pages() - pin user pages in memory + * @tsk: task_struct of target task + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * @nonblocking: whether waiting for disk IO or mmap_sem contention + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. + * + * __get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * __get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. 
And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If + * the page is written to, set_page_dirty (or set_page_dirty_lock, as + * appropriate) must be called after the page is finished with, and + * before put_page is called. + * + * If @nonblocking != NULL, __get_user_pages will not wait for disk IO + * or mmap_sem contention, and if waiting is needed to pin all pages, + * *@nonblocking will be set to 0. + * + * In most cases, get_user_pages or get_user_pages_fast should be used + * instead of __get_user_pages. __get_user_pages should be used only if + * you need some special @gup_flags. + */ +long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + struct vm_area_struct **vmas, int *nonblocking) +{ + long i; + unsigned long vm_flags; + unsigned int page_mask; + + if (!nr_pages) + return 0; + + VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); + + /* + * If FOLL_FORCE is set then do not force a full fault as the hinting + * fault information is unrelated to the reference behaviour of a task + * using the address space + */ + if (!(gup_flags & FOLL_FORCE)) + gup_flags |= FOLL_NUMA; + + i = 0; + + do { + struct vm_area_struct *vma; + + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(mm, start)) { + unsigned long pg = start & PAGE_MASK; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + + /* user gate pages are read-only */ + if (gup_flags & FOLL_WRITE) + goto efault; + if (pg > TASK_SIZE) + pgd = pgd_offset_k(pg); + else + pgd = pgd_offset_gate(mm, pg); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, pg); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, pg); + if 
(pmd_none(*pmd)) + goto efault; + VM_BUG_ON(pmd_trans_huge(*pmd)); + pte = pte_offset_map(pmd, pg); + if (pte_none(*pte)) { + pte_unmap(pte); + goto efault; + } + vma = get_gate_vma(mm); + if (pages) { + struct page *page; + + page = vm_normal_page(vma, start, *pte); + if (!page) { + if (!(gup_flags & FOLL_DUMP) && + is_zero_pfn(pte_pfn(*pte))) + page = pte_page(*pte); + else { + pte_unmap(pte); + goto efault; + } + } + pages[i] = page; + get_page(page); + } + pte_unmap(pte); + page_mask = 0; + goto next_page; + } + + if (!vma) + goto efault; + vm_flags = vma->vm_flags; + if (vm_flags & (VM_IO | VM_PFNMAP)) + goto efault; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * We used to let the write,force case do COW + * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so + * ptrace could set a breakpoint in a read-only + * mapping of an executable, without corrupting + * the file (yet only when that file had been + * opened for writing!). Anon pages in shared + * mappings are surprising: now just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + goto efault; + } + } + } else { + if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + goto efault; + /* + * Is there actually any vma we can reach here + * which does not have VM_MAYREAD set? + */ + if (!(vm_flags & VM_MAYREAD)) + goto efault; + } + } + + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &nr_pages, i, gup_flags); + continue; + } + + do { + struct page *page; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* + * If we have a pending SIGKILL, don't keep faulting + * pages and potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) + return i ? 
i : -ERESTARTSYS; + + cond_resched(); + while (!(page = follow_page_mask(vma, start, + foll_flags, &page_mask))) { + int ret; + unsigned int fault_flags = 0; + + /* For mlock, just skip the stack guard page. */ + if (foll_flags & FOLL_MLOCK) { + if (stack_guard_page(vma, start)) + goto next_page; + } + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (foll_flags & FOLL_NOWAIT) + fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); + + ret = handle_mm_fault(mm, vma, start, + fault_flags); + + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return i ? i : -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | + VM_FAULT_HWPOISON_LARGE)) { + if (i) + return i; + else if (gup_flags & FOLL_HWPOISON) + return -EHWPOISON; + else + return -EFAULT; + } + if (ret & VM_FAULT_SIGBUS) + goto efault; + BUG(); + } + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + return i; + } + + /* + * The VM_FAULT_WRITE bit tells us that + * do_wp_page has broken COW when necessary, + * even if maybe_mkwrite decided not to set + * pte_write. We can thus safely do subsequent + * page lookups as if they were reads. But only + * do so when looping for pte_write is futile: + * in some cases userspace may also be wanting + * to write to the gotten user page, which a + * read fault here might prevent (a readonly + * page might get reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && + !(vma->vm_flags & VM_WRITE)) + foll_flags &= ~FOLL_WRITE; + + cond_resched(); + } + if (IS_ERR(page)) + return i ? 
i : PTR_ERR(page); + if (pages) { + pages[i] = page; + + flush_anon_page(vma, page, start); + flush_dcache_page(page); + page_mask = 0; + } +next_page: + if (vmas) { + vmas[i] = vma; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; + } while (nr_pages && start < vma->vm_end); + } while (nr_pages); + return i; +efault: + return i ? : -EFAULT; +} +EXPORT_SYMBOL(__get_user_pages); + +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. 
+ */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + vm_flags_t vm_flags; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; + if (!(vm_flags & vma->vm_flags)) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* + * get_user_pages() - pin user pages in memory + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @write: whether pages will be written to by the caller + * @force: whether to force access even when user mapping is currently + * protected (but never forces write access to shared mapping). + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. Or NULL, if caller + * only intends to ensure the pages are faulted in. + * @vmas: array of pointers to vmas corresponding to each page. + * Or NULL if the caller does not require them. + * + * Returns number of pages pinned. This may be fewer than the number + * requested. If nr_pages is 0 or negative, returns 0. If no pages + * were pinned, returns -errno. Each page returned must be released + * with a put_page() call when it is finished with. vmas will only + * remain valid while mmap_sem is held. + * + * Must be called with mmap_sem held for read or write. 
+ * + * get_user_pages walks a process's page tables and takes a reference to + * each struct page that each user address corresponds to at a given + * instant. That is, it takes the page that would be accessed if a user + * thread accesses the given user virtual address at that instant. + * + * This does not guarantee that the page exists in the user mappings when + * get_user_pages returns, and there may even be a completely different + * page there in some cases (eg. if mmapped pagecache has been invalidated + * and subsequently re faulted). However it does guarantee that the page + * won't be freed completely. And mostly callers simply care that the page + * contains data that was valid *at some point in time*. Typically, an IO + * or similar operation cannot guarantee anything stronger anyway because + * locks can't be held over the syscall boundary. + * + * If write=0, the page must not be written to. If the page is written to, + * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called + * after the page is finished with, and before put_page is called. + * + * get_user_pages is typically used for fewer-copy IO operations, to get a + * handle on the memory by some means other than accesses via the user virtual + * addresses. The pages may be submitted for DMA to devices or accessed via + * their kernel linear mapping (via the kmap APIs). Care should be taken to + * use the correct cache flushing APIs. + * + * See also get_user_pages_fast, for performance critical applications. 
+ */ +long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, unsigned long nr_pages, int write, + int force, struct page **pages, struct vm_area_struct **vmas) +{ + int flags = FOLL_TOUCH; + + if (pages) + flags |= FOLL_GET; + if (write) + flags |= FOLL_WRITE; + if (force) + flags |= FOLL_FORCE; + + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); +} +EXPORT_SYMBOL(get_user_pages); + +/** + * get_dump_page() - pin user page in memory while writing it to core dump + * @addr: user address + * + * Returns struct page pointer of user page pinned for dump, + * to be freed afterwards by page_cache_release() or put_page(). + * + * Returns NULL on any kind of failure - a hole must then be inserted into + * the corefile, to preserve alignment with its headers; and also returns + * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - + * allowing a hole to be left in the corefile to save diskspace. + * + * Called without mmap_sem, but after all other threads have been killed. 
+ */ +#ifdef CONFIG_ELF_CORE +struct page *get_dump_page(unsigned long addr) +{ + struct vm_area_struct *vma; + struct page *page; + + if (__get_user_pages(current, current->mm, addr, 1, + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) + return NULL; + flush_cache_page(vma, addr, page_to_pfn(page)); + return page; +} +#endif /* CONFIG_ELF_CORE */ diff --git a/mm/internal.h b/mm/internal.h index 07b6736..6ee580d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -169,6 +169,11 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +static inline bool is_cow_mapping(vm_flags_t flags) +{ + return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; +} + /* mm/util.c */ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev, struct rb_node *rb_parent); diff --git a/mm/memory.c b/mm/memory.c index 0897830..7049d39 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -698,11 +698,6 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); } -static inline bool is_cow_mapping(vm_flags_t flags) -{ - return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE; -} - /* * vm_normal_page -- This function gets the "struct page" associated with a pte. * @@ -1458,642 +1453,6 @@ int zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, } EXPORT_SYMBOL_GPL(zap_vma_ptes); -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). 
- */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; - - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - goto no_page_table; - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - goto out; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pud_bad(*pud))) - goto no_page_table; - - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto no_page_table; - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. The caller should handle a NULL - * return when trying to follow tail pages. 
- */ - if (PageHead(page)) - get_page(page); - else { - page = NULL; - goto out; - } - } - goto out; - } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - goto split_fallthrough; - } - ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - goto out; - } - } else - spin_unlock(ptl); - /* fall through */ - } -split_fallthrough: - if (unlikely(pmd_bad(*pmd))) - goto no_page_table; - - ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - - pte = *ptep; - if (!pte_present(pte)) { - swp_entry_t entry; - /* - * KSM's break_ksm() relies upon recognizing a ksm page - * even while it is being migrated, so for that case we - * need migration_entry_wait(). - */ - if (likely(!(flags & FOLL_MIGRATION))) - goto no_page; - if (pte_none(pte) || pte_file(pte)) - goto no_page; - entry = pte_to_swp_entry(pte); - if (!is_migration_entry(entry)) - goto no_page; - pte_unmap_unlock(ptep, ptl); - migration_entry_wait(mm, pmd, address); - goto split_fallthrough; - } - if ((flags & FOLL_NUMA) && pte_numa(pte)) - goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; - - page = vm_normal_page(vma, address, pte); - if (unlikely(!page)) { - if ((flags & FOLL_DUMP) || - !is_zero_pfn(pte_pfn(pte))) - goto bad_page; - page = pte_page(pte); - } - - if (flags & FOLL_GET) - get_page_foll(page); - if (flags & FOLL_TOUCH) { - if ((flags & FOLL_WRITE) && - !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - /* - * pte_mkyoung() would be more correct here, but atomic care - * is needed to avoid losing the dirty bit: it is easier to use - * mark_page_accessed(). 
- */ - mark_page_accessed(page); - } - if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { - /* - * The preliminary mapping check is mainly to avoid the - * pointless overhead of lock_page on the ZERO_PAGE - * which might bounce very badly if there is contention. - * - * If the page is already locked, we don't need to - * handle it now - vmscan will handle it later if and - * when it attempts to reclaim the page. - */ - if (page->mapping && trylock_page(page)) { - lru_add_drain(); /* push cached pages to LRU */ - /* - * Because we lock page here, and migration is - * blocked by the pte's page reference, and we - * know the page is still mapped, we don't even - * need to check for file-cache page truncation. - */ - mlock_vma_page(page); - unlock_page(page); - } - } -unlock: - pte_unmap_unlock(ptep, ptl); -out: - return page; - -bad_page: - pte_unmap_unlock(ptep, ptl); - return ERR_PTR(-EFAULT); - -no_page: - pte_unmap_unlock(ptep, ptl); - if (!pte_none(pte)) - return page; - -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate unnecessary pages or - * page tables. Return error instead of NULL to skip handle_mm_fault, - * then get_dump_page() will return NULL to leave a hole in the dump. - * But we can only make this optimization where a hole would surely - * be zero-filled if handle_mm_fault() actually did handle it. 
- */ - if ((flags & FOLL_DUMP) && - (!vma->vm_ops || !vma->vm_ops->fault)) - return ERR_PTR(-EFAULT); - return page; -} - -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return stack_guard_page_start(vma, addr) || - stack_guard_page_end(vma, addr+PAGE_SIZE); -} - -/** - * __get_user_pages() - pin user pages in memory - * @tsk: task_struct of target task - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @gup_flags: flags modifying pin behaviour - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * @nonblocking: whether waiting for disk IO or mmap_sem contention - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. - * - * __get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * __get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. 
And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If - * the page is written to, set_page_dirty (or set_page_dirty_lock, as - * appropriate) must be called after the page is finished with, and - * before put_page is called. - * - * If @nonblocking != NULL, __get_user_pages will not wait for disk IO - * or mmap_sem contention, and if waiting is needed to pin all pages, - * *@nonblocking will be set to 0. - * - * In most cases, get_user_pages or get_user_pages_fast should be used - * instead of __get_user_pages. __get_user_pages should be used only if - * you need some special @gup_flags. - */ -long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas, int *nonblocking) -{ - long i; - unsigned long vm_flags; - unsigned int page_mask; - - if (!nr_pages) - return 0; - - VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET)); - - /* - * If FOLL_FORCE is set then do not force a full fault as the hinting - * fault information is unrelated to the reference behaviour of a task - * using the address space - */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - - i = 0; - - do { - struct vm_area_struct *vma; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(mm, start)) { - unsigned long pg = start & PAGE_MASK; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* user gate pages are read-only */ - if (gup_flags & FOLL_WRITE) - goto efault; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - if 
(pmd_none(*pmd)) - goto efault; - VM_BUG_ON(pmd_trans_huge(*pmd)); - pte = pte_offset_map(pmd, pg); - if (pte_none(*pte)) { - pte_unmap(pte); - goto efault; - } - vma = get_gate_vma(mm); - if (pages) { - struct page *page; - - page = vm_normal_page(vma, start, *pte); - if (!page) { - if (!(gup_flags & FOLL_DUMP) && - is_zero_pfn(pte_pfn(*pte))) - page = pte_page(*pte); - else { - pte_unmap(pte); - goto efault; - } - } - pages[i] = page; - get_page(page); - } - pte_unmap(pte); - page_mask = 0; - goto next_page; - } - - if (!vma) - goto efault; - vm_flags = vma->vm_flags; - if (vm_flags & (VM_IO | VM_PFNMAP)) - goto efault; - - if (gup_flags & FOLL_WRITE) { - if (!(vm_flags & VM_WRITE)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * We used to let the write,force case do COW - * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so - * ptrace could set a breakpoint in a read-only - * mapping of an executable, without corrupting - * the file (yet only when that file had been - * opened for writing!). Anon pages in shared - * mappings are surprising: now just reject it. - */ - if (!is_cow_mapping(vm_flags)) { - WARN_ON_ONCE(vm_flags & VM_MAYWRITE); - goto efault; - } - } - } else { - if (!(vm_flags & VM_READ)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * Is there actually any vma we can reach here - * which does not have VM_MAYREAD set? - */ - if (!(vm_flags & VM_MAYREAD)) - goto efault; - } - } - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, gup_flags); - continue; - } - - do { - struct page *page; - unsigned int foll_flags = gup_flags; - unsigned int page_increm; - - /* - * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory. - */ - if (unlikely(fatal_signal_pending(current))) - return i ? 
i : -ERESTARTSYS; - - cond_resched(); - while (!(page = follow_page_mask(vma, start, - foll_flags, &page_mask))) { - int ret; - unsigned int fault_flags = 0; - - /* For mlock, just skip the stack guard page. */ - if (foll_flags & FOLL_MLOCK) { - if (stack_guard_page(vma, start)) - goto next_page; - } - if (foll_flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; - if (foll_flags & FOLL_NOWAIT) - fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); - - ret = handle_mm_fault(mm, vma, start, - fault_flags); - - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return i ? i : -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | - VM_FAULT_HWPOISON_LARGE)) { - if (i) - return i; - else if (gup_flags & FOLL_HWPOISON) - return -EHWPOISON; - else - return -EFAULT; - } - if (ret & VM_FAULT_SIGBUS) - goto efault; - BUG(); - } - - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - - if (ret & VM_FAULT_RETRY) { - if (nonblocking) - *nonblocking = 0; - return i; - } - - /* - * The VM_FAULT_WRITE bit tells us that - * do_wp_page has broken COW when necessary, - * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && - !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; - - cond_resched(); - } - if (IS_ERR(page)) - return i ? 
i : PTR_ERR(page); - if (pages) { - pages[i] = page; - - flush_anon_page(vma, page, start); - flush_dcache_page(page); - page_mask = 0; - } -next_page: - if (vmas) { - vmas[i] = vma; - page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); - if (page_increm > nr_pages) - page_increm = nr_pages; - i += page_increm; - start += page_increm * PAGE_SIZE; - nr_pages -= page_increm; - } while (nr_pages && start < vma->vm_end); - } while (nr_pages); - return i; -efault: - return i ? : -EFAULT; -} -EXPORT_SYMBOL(__get_user_pages); - -/* - * fixup_user_fault() - manually resolve a user page fault - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @address: user address - * @fault_flags:flags to pass down to handle_mm_fault() - * - * This is meant to be called in the specific scenario where for locking reasons - * we try to access user memory in atomic context (within a pagefault_disable() - * section), this returns -EFAULT, and we want to resolve the user fault before - * trying again. - * - * Typically this is meant to be used by the futex code. - * - * The main difference with get_user_pages() is that this function will - * unconditionally call handle_mm_fault() which will in turn perform all the - * necessary SW fixup of the dirty and young bits in the PTE, while - * handle_mm_fault() only guarantees to update these in the struct page. - * - * This is important for some architectures where those bits also gate the - * access permission to the page because they are maintained in software. On - * such architectures, gup() will not be enough to make a subsequent access - * succeed. - * - * This should be called with the mm_sem held for read. 
- */ -int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, - unsigned long address, unsigned int fault_flags) -{ - struct vm_area_struct *vma; - vm_flags_t vm_flags; - int ret; - - vma = find_extend_vma(mm, address); - if (!vma || address < vma->vm_start) - return -EFAULT; - - vm_flags = (fault_flags & FAULT_FLAG_WRITE) ? VM_WRITE : VM_READ; - if (!(vm_flags & vma->vm_flags)) - return -EFAULT; - - ret = handle_mm_fault(mm, vma, address, fault_flags); - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) - return -EHWPOISON; - if (ret & VM_FAULT_SIGBUS) - return -EFAULT; - BUG(); - } - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - return 0; -} - -/* - * get_user_pages() - pin user pages in memory - * @tsk: the task_struct to use for page fault accounting, or - * NULL if faults are not to be recorded. - * @mm: mm_struct of target mm - * @start: starting user address - * @nr_pages: number of pages from start to pin - * @write: whether pages will be written to by the caller - * @force: whether to force access even when user mapping is currently - * protected (but never forces write access to shared mapping). - * @pages: array that receives pointers to the pages pinned. - * Should be at least nr_pages long. Or NULL, if caller - * only intends to ensure the pages are faulted in. - * @vmas: array of pointers to vmas corresponding to each page. - * Or NULL if the caller does not require them. - * - * Returns number of pages pinned. This may be fewer than the number - * requested. If nr_pages is 0 or negative, returns 0. If no pages - * were pinned, returns -errno. Each page returned must be released - * with a put_page() call when it is finished with. vmas will only - * remain valid while mmap_sem is held. - * - * Must be called with mmap_sem held for read or write. 
- * - * get_user_pages walks a process's page tables and takes a reference to - * each struct page that each user address corresponds to at a given - * instant. That is, it takes the page that would be accessed if a user - * thread accesses the given user virtual address at that instant. - * - * This does not guarantee that the page exists in the user mappings when - * get_user_pages returns, and there may even be a completely different - * page there in some cases (eg. if mmapped pagecache has been invalidated - * and subsequently re faulted). However it does guarantee that the page - * won't be freed completely. And mostly callers simply care that the page - * contains data that was valid *at some point in time*. Typically, an IO - * or similar operation cannot guarantee anything stronger anyway because - * locks can't be held over the syscall boundary. - * - * If write=0, the page must not be written to. If the page is written to, - * set_page_dirty (or set_page_dirty_lock, as appropriate) must be called - * after the page is finished with, and before put_page is called. - * - * get_user_pages is typically used for fewer-copy IO operations, to get a - * handle on the memory by some means other than accesses via the user virtual - * addresses. The pages may be submitted for DMA to devices or accessed via - * their kernel linear mapping (via the kmap APIs). Care should be taken to - * use the correct cache flushing APIs. - * - * See also get_user_pages_fast, for performance critical applications. 
- */ -long get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, int write, - int force, struct page **pages, struct vm_area_struct **vmas) -{ - int flags = FOLL_TOUCH; - - if (pages) - flags |= FOLL_GET; - if (write) - flags |= FOLL_WRITE; - if (force) - flags |= FOLL_FORCE; - - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, - NULL); -} -EXPORT_SYMBOL(get_user_pages); - -/** - * get_dump_page() - pin user page in memory while writing it to core dump - * @addr: user address - * - * Returns struct page pointer of user page pinned for dump, - * to be freed afterwards by page_cache_release() or put_page(). - * - * Returns NULL on any kind of failure - a hole must then be inserted into - * the corefile, to preserve alignment with its headers; and also returns - * NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found - - * allowing a hole to be left in the corefile to save diskspace. - * - * Called without mmap_sem, but after all other threads have been killed. - */ -#ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) -{ - struct vm_area_struct *vma; - struct page *page; - - if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, - NULL) < 1) - return NULL; - flush_cache_page(vma, addr, page_to_pfn(page)); - return page; -} -#endif /* CONFIG_ELF_CORE */ - pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl) { -- cgit v0.10.2 From f2b495ca82e188fd2818479a551f126edf023756 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:11 -0700 Subject: mm: extract in_gate_area() case from __get_user_pages() The case is special and disturb from reading main __get_user_pages() code path. Let's move it to separate function. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/gup.c b/mm/gup.c index ea88b65..0bf127b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -213,6 +213,50 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add stack_guard_page_end(vma, addr+PAGE_SIZE); } +static int get_gate_page(struct mm_struct *mm, unsigned long address, + unsigned int gup_flags, struct vm_area_struct **vma, + struct page **page) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte; + int ret = -EFAULT; + + /* user gate pages are read-only */ + if (gup_flags & FOLL_WRITE) + return -EFAULT; + if (address > TASK_SIZE) + pgd = pgd_offset_k(address); + else + pgd = pgd_offset_gate(mm, address); + BUG_ON(pgd_none(*pgd)); + pud = pud_offset(pgd, address); + BUG_ON(pud_none(*pud)); + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); + pte = pte_offset_map(pmd, address); + if (pte_none(*pte)) + goto unmap; + *vma = get_gate_vma(mm); + if (!page) + goto out; + *page = vm_normal_page(*vma, address, *pte); + if (!*page) { + if ((gup_flags & FOLL_DUMP) || !is_zero_pfn(pte_pfn(*pte))) + goto unmap; + *page = pte_page(*pte); + } + get_page(*page); +out: + ret = 0; +unmap: + pte_unmap(pte); + return ret; +} + /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -291,49 +335,11 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, vma = find_extend_vma(mm, start); if (!vma && in_gate_area(mm, start)) { - unsigned long pg = start & PAGE_MASK; - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *pte; - - /* user gate pages are read-only */ - if (gup_flags & FOLL_WRITE) - goto efault; - if (pg > TASK_SIZE) - pgd = pgd_offset_k(pg); - else - pgd = pgd_offset_gate(mm, pg); - BUG_ON(pgd_none(*pgd)); - pud = pud_offset(pgd, pg); - BUG_ON(pud_none(*pud)); - pmd = pmd_offset(pud, pg); - if (pmd_none(*pmd)) + int ret; + ret = get_gate_page(mm, start & 
PAGE_MASK, gup_flags, + &vma, pages ? &pages[i] : NULL); + if (ret) goto efault; - VM_BUG_ON(pmd_trans_huge(*pmd)); - pte = pte_offset_map(pmd, pg); - if (pte_none(*pte)) { - pte_unmap(pte); - goto efault; - } - vma = get_gate_vma(mm); - if (pages) { - struct page *page; - - page = vm_normal_page(vma, start, *pte); - if (!page) { - if (!(gup_flags & FOLL_DUMP) && - is_zero_pfn(pte_pfn(*pte))) - page = pte_page(*pte); - else { - pte_unmap(pte); - goto efault; - } - } - pages[i] = page; - get_page(page); - } - pte_unmap(pte); page_mask = 0; goto next_page; } -- cgit v0.10.2 From 69e68b4f03135da4a09d1215a3942d7dabd1075b Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:11 -0700 Subject: mm: cleanup follow_page_mask() Cleanups: - move pte-related code to separate function. It's about half of the function; - get rid of some goto-logic; - use 'return NULL' instead of 'return page' where page can only be NULL; Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/gup.c b/mm/gup.c index 0bf127b..4063678 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -12,105 +12,35 @@ #include "internal.h" -/** - * follow_page_mask - look up a page descriptor from a user-virtual address - * @vma: vm_area_struct mapping @address - * @address: virtual address to look up - * @flags: flags modifying lookup behaviour - * @page_mask: on output, *page_mask is set according to the size of the page - * - * @flags can have FOLL_ flags set, defined in - * - * Returns the mapped (struct page *), %NULL if no mapping exists, or - * an error pointer if there is a mapping to something not represented - * by a page descriptor (see also vm_normal_page()). 
- */ -struct page *follow_page_mask(struct vm_area_struct *vma, - unsigned long address, unsigned int flags, - unsigned int *page_mask) +static struct page *no_page_table(struct vm_area_struct *vma, + unsigned int flags) { - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; - spinlock_t *ptl; - struct page *page; - struct mm_struct *mm = vma->vm_mm; - - *page_mask = 0; - - page = follow_huge_addr(mm, address, flags & FOLL_WRITE); - if (!IS_ERR(page)) { - BUG_ON(flags & FOLL_GET); - goto out; - } - - page = NULL; - pgd = pgd_offset(mm, address); - if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) - goto no_page_table; + /* + * When core dumping an enormous anonymous area that nobody + * has touched so far, we don't want to allocate unnecessary pages or + * page tables. Return error instead of NULL to skip handle_mm_fault, + * then get_dump_page() will return NULL to leave a hole in the dump. + * But we can only make this optimization where a hole would surely + * be zero-filled if handle_mm_fault() actually did handle it. + */ + if ((flags & FOLL_DUMP) && (!vma->vm_ops || !vma->vm_ops->fault)) + return ERR_PTR(-EFAULT); + return NULL; +} - pud = pud_offset(pgd, address); - if (pud_none(*pud)) - goto no_page_table; - if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { - if (flags & FOLL_GET) - goto out; - page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); - goto out; - } - if (unlikely(pud_bad(*pud))) - goto no_page_table; +static struct page *follow_page_pte(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, unsigned int flags) +{ + struct mm_struct *mm = vma->vm_mm; + struct page *page; + spinlock_t *ptl; + pte_t *ptep, pte; - pmd = pmd_offset(pud, address); - if (pmd_none(*pmd)) - goto no_page_table; - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { - page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); - if (flags & FOLL_GET) { - /* - * Refcount on tail pages are not well-defined and - * shouldn't be taken. 
The caller should handle a NULL - * return when trying to follow tail pages. - */ - if (PageHead(page)) - get_page(page); - else { - page = NULL; - goto out; - } - } - goto out; - } - if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) - goto no_page_table; - if (pmd_trans_huge(*pmd)) { - if (flags & FOLL_SPLIT) { - split_huge_page_pmd(vma, address, pmd); - goto split_fallthrough; - } - ptl = pmd_lock(mm, pmd); - if (likely(pmd_trans_huge(*pmd))) { - if (unlikely(pmd_trans_splitting(*pmd))) { - spin_unlock(ptl); - wait_split_huge_page(vma->anon_vma, pmd); - } else { - page = follow_trans_huge_pmd(vma, address, - pmd, flags); - spin_unlock(ptl); - *page_mask = HPAGE_PMD_NR - 1; - goto out; - } - } else - spin_unlock(ptl); - /* fall through */ - } -split_fallthrough: +retry: if (unlikely(pmd_bad(*pmd))) - goto no_page_table; + return no_page_table(vma, flags); ptep = pte_offset_map_lock(mm, pmd, address, &ptl); - pte = *ptep; if (!pte_present(pte)) { swp_entry_t entry; @@ -128,12 +58,14 @@ split_fallthrough: goto no_page; pte_unmap_unlock(ptep, ptl); migration_entry_wait(mm, pmd, address); - goto split_fallthrough; + goto retry; } if ((flags & FOLL_NUMA) && pte_numa(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !pte_write(pte)) - goto unlock; + if ((flags & FOLL_WRITE) && !pte_write(pte)) { + pte_unmap_unlock(ptep, ptl); + return NULL; + } page = vm_normal_page(vma, address, pte); if (unlikely(!page)) { @@ -178,11 +110,8 @@ split_fallthrough: unlock_page(page); } } -unlock: pte_unmap_unlock(ptep, ptl); -out: return page; - bad_page: pte_unmap_unlock(ptep, ptl); return ERR_PTR(-EFAULT); @@ -190,21 +119,99 @@ bad_page: no_page: pte_unmap_unlock(ptep, ptl); if (!pte_none(pte)) + return NULL; + return no_page_table(vma, flags); +} + +/** + * follow_page_mask - look up a page descriptor from a user-virtual address + * @vma: vm_area_struct mapping @address + * @address: virtual address to look up + * @flags: flags modifying lookup behaviour + * @page_mask: on output, *page_mask 
is set according to the size of the page + * + * @flags can have FOLL_ flags set, defined in + * + * Returns the mapped (struct page *), %NULL if no mapping exists, or + * an error pointer if there is a mapping to something not represented + * by a page descriptor (see also vm_normal_page()). + */ +struct page *follow_page_mask(struct vm_area_struct *vma, + unsigned long address, unsigned int flags, + unsigned int *page_mask) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + spinlock_t *ptl; + struct page *page; + struct mm_struct *mm = vma->vm_mm; + + *page_mask = 0; + + page = follow_huge_addr(mm, address, flags & FOLL_WRITE); + if (!IS_ERR(page)) { + BUG_ON(flags & FOLL_GET); return page; + } -no_page_table: - /* - * When core dumping an enormous anonymous area that nobody - * has touched so far, we don't want to allocate unnecessary pages or - * page tables. Return error instead of NULL to skip handle_mm_fault, - * then get_dump_page() will return NULL to leave a hole in the dump. - * But we can only make this optimization where a hole would surely - * be zero-filled if handle_mm_fault() actually did handle it. 
- */ - if ((flags & FOLL_DUMP) && - (!vma->vm_ops || !vma->vm_ops->fault)) - return ERR_PTR(-EFAULT); - return page; + pgd = pgd_offset(mm, address); + if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd))) + return no_page_table(vma, flags); + + pud = pud_offset(pgd, address); + if (pud_none(*pud)) + return no_page_table(vma, flags); + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { + if (flags & FOLL_GET) + return NULL; + page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); + return page; + } + if (unlikely(pud_bad(*pud))) + return no_page_table(vma, flags); + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + return no_page_table(vma, flags); + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); + if (flags & FOLL_GET) { + /* + * Refcount on tail pages are not well-defined and + * shouldn't be taken. The caller should handle a NULL + * return when trying to follow tail pages. + */ + if (PageHead(page)) + get_page(page); + else + page = NULL; + } + return page; + } + if ((flags & FOLL_NUMA) && pmd_numa(*pmd)) + return no_page_table(vma, flags); + if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(vma, address, pmd); + return follow_page_pte(vma, address, pmd, flags); + } + ptl = pmd_lock(mm, pmd); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(ptl); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(vma, address, + pmd, flags); + spin_unlock(ptl); + *page_mask = HPAGE_PMD_NR - 1; + return page; + } + } else + spin_unlock(ptl); + } + return follow_page_pte(vma, address, pmd, flags); } static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -- cgit v0.10.2 From 1674448345cdb56e724483a2a26622771f4e3a10 Mon Sep 17 00:00:00 2001 From: "Kirill A. 
Shutemov" Date: Wed, 4 Jun 2014 16:08:12 -0700 Subject: mm: extract code to fault in a page from __get_user_pages() Nesting level in __get_user_pages() is just insane. Let's try to fix it a bit. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/gup.c b/mm/gup.c index 4063678..28e3700 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -214,12 +214,6 @@ struct page *follow_page_mask(struct vm_area_struct *vma, return follow_page_pte(vma, address, pmd, flags); } -static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) -{ - return stack_guard_page_start(vma, addr) || - stack_guard_page_end(vma, addr+PAGE_SIZE); -} - static int get_gate_page(struct mm_struct *mm, unsigned long address, unsigned int gup_flags, struct vm_area_struct **vma, struct page **page) @@ -264,6 +258,63 @@ unmap: return ret; } +static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, + unsigned long address, unsigned int *flags, int *nonblocking) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned int fault_flags = 0; + int ret; + + /* For mlock, just skip the stack guard page. */ + if ((*flags & FOLL_MLOCK) && + (stack_guard_page_start(vma, address) || + stack_guard_page_end(vma, address + PAGE_SIZE))) + return -ENOENT; + if (*flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; + if (*flags & FOLL_NOWAIT) + fault_flags |= FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return *flags & FOLL_HWPOISON ? 
-EHWPOISON : -EFAULT; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + + if (ret & VM_FAULT_RETRY) { + if (nonblocking) + *nonblocking = 0; + return -EBUSY; + } + + /* + * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when + * necessary, even if maybe_mkwrite decided not to set pte_write. We + * can thus safely do subsequent page lookups as if they were reads. + * But only do so when looping for pte_write is futile: in some cases + * userspace may also be wanting to write to the gotten user page, + * which a read fault here might prevent (a readonly page might get + * reCOWed by userspace write). + */ + if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) + *flags &= ~FOLL_WRITE; + return 0; +} + /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -410,69 +461,22 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, while (!(page = follow_page_mask(vma, start, foll_flags, &page_mask))) { int ret; - unsigned int fault_flags = 0; - - /* For mlock, just skip the stack guard page. */ - if (foll_flags & FOLL_MLOCK) { - if (stack_guard_page(vma, start)) - goto next_page; - } - if (foll_flags & FOLL_WRITE) - fault_flags |= FAULT_FLAG_WRITE; - if (nonblocking) - fault_flags |= FAULT_FLAG_ALLOW_RETRY; - if (foll_flags & FOLL_NOWAIT) - fault_flags |= (FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_RETRY_NOWAIT); - - ret = handle_mm_fault(mm, vma, start, - fault_flags); - - if (ret & VM_FAULT_ERROR) { - if (ret & VM_FAULT_OOM) - return i ? 
i : -ENOMEM; - if (ret & (VM_FAULT_HWPOISON | - VM_FAULT_HWPOISON_LARGE)) { - if (i) - return i; - else if (gup_flags & FOLL_HWPOISON) - return -EHWPOISON; - else - return -EFAULT; - } - if (ret & VM_FAULT_SIGBUS) - goto efault; - BUG(); - } - - if (tsk) { - if (ret & VM_FAULT_MAJOR) - tsk->maj_flt++; - else - tsk->min_flt++; - } - - if (ret & VM_FAULT_RETRY) { - if (nonblocking) - *nonblocking = 0; + ret = faultin_page(tsk, vma, start, &foll_flags, + nonblocking); + switch (ret) { + case 0: + break; + case -EFAULT: + case -ENOMEM: + case -EHWPOISON: + return i ? i : ret; + case -EBUSY: return i; + case -ENOENT: + goto next_page; + default: + BUG(); } - - /* - * The VM_FAULT_WRITE bit tells us that - * do_wp_page has broken COW when necessary, - * even if maybe_mkwrite decided not to set - * pte_write. We can thus safely do subsequent - * page lookups as if they were reads. But only - * do so when looping for pte_write is futile: - * in some cases userspace may also be wanting - * to write to the gotten user page, which a - * read fault here might prevent (a readonly - * page might get reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && - !(vma->vm_flags & VM_WRITE)) - foll_flags &= ~FOLL_WRITE; - cond_resched(); } if (IS_ERR(page)) -- cgit v0.10.2 From fa5bb2093a1d2ba552309a81139e0abebf5325d8 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:13 -0700 Subject: mm: cleanup __get_user_pages() Get rid of two nested loops over nr_pages, extract vma flags checking to separate function and other random cleanups. Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/gup.c b/mm/gup.c index 28e3700..cc5a9e7 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -315,6 +315,44 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma, return 0; } +static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags) +{ + vm_flags_t vm_flags = vma->vm_flags; + + if (vm_flags & (VM_IO | VM_PFNMAP)) + return -EFAULT; + + if (gup_flags & FOLL_WRITE) { + if (!(vm_flags & VM_WRITE)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * We used to let the write,force case do COW in a + * VM_MAYWRITE VM_SHARED !VM_WRITE vma, so ptrace could + * set a breakpoint in a read-only mapping of an + * executable, without corrupting the file (yet only + * when that file had been opened for writing!). + * Anon pages in shared mappings are surprising: now + * just reject it. + */ + if (!is_cow_mapping(vm_flags)) { + WARN_ON_ONCE(vm_flags & VM_MAYWRITE); + return -EFAULT; + } + } + } else if (!(vm_flags & VM_READ)) { + if (!(gup_flags & FOLL_FORCE)) + return -EFAULT; + /* + * Is there actually any vma we can reach here which does not + * have VM_MAYREAD set? 
+ */ + if (!(vm_flags & VM_MAYREAD)) + return -EFAULT; + } + return 0; +} + /** * __get_user_pages() - pin user pages in memory * @tsk: task_struct of target task @@ -369,9 +407,9 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas, int *nonblocking) { - long i; - unsigned long vm_flags; + long i = 0; unsigned int page_mask; + struct vm_area_struct *vma = NULL; if (!nr_pages) return 0; @@ -386,124 +424,82 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (!(gup_flags & FOLL_FORCE)) gup_flags |= FOLL_NUMA; - i = 0; - do { - struct vm_area_struct *vma; - - vma = find_extend_vma(mm, start); - if (!vma && in_gate_area(mm, start)) { - int ret; - ret = get_gate_page(mm, start & PAGE_MASK, gup_flags, - &vma, pages ? &pages[i] : NULL); - if (ret) - goto efault; - page_mask = 0; - goto next_page; - } + struct page *page; + unsigned int foll_flags = gup_flags; + unsigned int page_increm; + + /* first iteration or cross vma bound */ + if (!vma || start >= vma->vm_end) { + vma = find_extend_vma(mm, start); + if (!vma && in_gate_area(mm, start)) { + int ret; + ret = get_gate_page(mm, start & PAGE_MASK, + gup_flags, &vma, + pages ? &pages[i] : NULL); + if (ret) + return i ? : ret; + page_mask = 0; + goto next_page; + } - if (!vma) - goto efault; - vm_flags = vma->vm_flags; - if (vm_flags & (VM_IO | VM_PFNMAP)) - goto efault; - - if (gup_flags & FOLL_WRITE) { - if (!(vm_flags & VM_WRITE)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * We used to let the write,force case do COW - * in a VM_MAYWRITE VM_SHARED !VM_WRITE vma, so - * ptrace could set a breakpoint in a read-only - * mapping of an executable, without corrupting - * the file (yet only when that file had been - * opened for writing!). Anon pages in shared - * mappings are surprising: now just reject it. 
- */ - if (!is_cow_mapping(vm_flags)) { - WARN_ON_ONCE(vm_flags & VM_MAYWRITE); - goto efault; - } + if (!vma || check_vma_flags(vma, gup_flags)) + return i ? : -EFAULT; + if (is_vm_hugetlb_page(vma)) { + i = follow_hugetlb_page(mm, vma, pages, vmas, + &start, &nr_pages, i, + gup_flags); + continue; } - } else { - if (!(vm_flags & VM_READ)) { - if (!(gup_flags & FOLL_FORCE)) - goto efault; - /* - * Is there actually any vma we can reach here - * which does not have VM_MAYREAD set? - */ - if (!(vm_flags & VM_MAYREAD)) - goto efault; + } +retry: + /* + * If we have a pending SIGKILL, don't keep faulting pages and + * potentially allocating memory. + */ + if (unlikely(fatal_signal_pending(current))) + return i ? i : -ERESTARTSYS; + cond_resched(); + page = follow_page_mask(vma, start, foll_flags, &page_mask); + if (!page) { + int ret; + ret = faultin_page(tsk, vma, start, &foll_flags, + nonblocking); + switch (ret) { + case 0: + goto retry; + case -EFAULT: + case -ENOMEM: + case -EHWPOISON: + return i ? i : ret; + case -EBUSY: + return i; + case -ENOENT: + goto next_page; } + BUG(); } - - if (is_vm_hugetlb_page(vma)) { - i = follow_hugetlb_page(mm, vma, pages, vmas, - &start, &nr_pages, i, gup_flags); - continue; + if (IS_ERR(page)) + return i ? i : PTR_ERR(page); + if (pages) { + pages[i] = page; + flush_anon_page(vma, page, start); + flush_dcache_page(page); + page_mask = 0; } - - do { - struct page *page; - unsigned int foll_flags = gup_flags; - unsigned int page_increm; - - /* - * If we have a pending SIGKILL, don't keep faulting - * pages and potentially allocating memory. - */ - if (unlikely(fatal_signal_pending(current))) - return i ? i : -ERESTARTSYS; - - cond_resched(); - while (!(page = follow_page_mask(vma, start, - foll_flags, &page_mask))) { - int ret; - ret = faultin_page(tsk, vma, start, &foll_flags, - nonblocking); - switch (ret) { - case 0: - break; - case -EFAULT: - case -ENOMEM: - case -EHWPOISON: - return i ? 
i : ret; - case -EBUSY: - return i; - case -ENOENT: - goto next_page; - default: - BUG(); - } - cond_resched(); - } - if (IS_ERR(page)) - return i ? i : PTR_ERR(page); - if (pages) { - pages[i] = page; - - flush_anon_page(vma, page, start); - flush_dcache_page(page); - page_mask = 0; - } next_page: - if (vmas) { - vmas[i] = vma; - page_mask = 0; - } - page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); - if (page_increm > nr_pages) - page_increm = nr_pages; - i += page_increm; - start += page_increm * PAGE_SIZE; - nr_pages -= page_increm; - } while (nr_pages && start < vma->vm_end); + if (vmas) { + vmas[i] = vma; + page_mask = 0; + } + page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask); + if (page_increm > nr_pages) + page_increm = nr_pages; + i += page_increm; + start += page_increm * PAGE_SIZE; + nr_pages -= page_increm; } while (nr_pages); return i; -efault: - return i ? : -EFAULT; } EXPORT_SYMBOL(__get_user_pages); -- cgit v0.10.2 From 2373eaecff33db5972bde9418f92d6401b4a945c Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:08:14 -0700 Subject: mm: x86 pgtable: drop unneeded preprocessor ifdef _PAGE_BIT_FILE (bit 6) is always less than _PAGE_BIT_PROTNONE (bit 8), so drop redundant #ifdef. 
Signed-off-by: Cyrill Gorcunov Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Anvin Cc: Ingo Molnar Cc: Steven Noonan Cc: Rik van Riel Cc: David Vrabel Cc: Peter Zijlstra Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 0d193e2..eec82d4 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -115,13 +115,8 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off) */ #define PTE_FILE_MAX_BITS 29 #define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) #define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#else -#define PTE_FILE_SHIFT2 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_FILE + 1) -#endif #define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) #define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) @@ -153,13 +148,8 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off) #endif /* CONFIG_MEM_SOFT_DIRTY */ /* Encode and de-code a swap entry */ -#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) -#else -#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) -#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 6d6ecd0..5be9063 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -143,7 +143,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_unmap(pte) ((void)(pte))/* NOP */ /* Encode and de-code a swap entry */ -#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) #ifdef CONFIG_NUMA_BALANCING /* Automatic NUMA 
balancing needs to be distinguishable from swap entries */ @@ -151,13 +150,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #else #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) #endif -#else -#ifdef CONFIG_NUMA_BALANCING -#error Incompatible format for automatic NUMA balancing -#endif -#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1) -#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1) -#endif #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -- cgit v0.10.2 From 2bf01f9f0cf07b231c90e5d56266e84fe17cec79 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Wed, 4 Jun 2014 16:08:16 -0700 Subject: mm: x86 pgtable: require X86_64 for soft-dirty tracker Tracking dirty status on 2 level pages requires very ugly macros, and taking into account how old the machines that can only operate without PAE mode are, let's drop the soft-dirty tracker from them for code simplicity (note I can't drop all the macros from 2 level pages for now since _PAGE_BIT_PROTNONE and _PAGE_BIT_FILE are still used even without the tracker). Linus proposed to completely rip out softdirty support on x86-32 (even with PAE), and since for CRIU we're not planning to support native x86-32 mode, let's do that. (The softdirty tracker is a relatively new feature which is mostly used by CRIU, so I don't expect such an API change to cause problems for userspace).
Signed-off-by: Cyrill Gorcunov Cc: Linus Torvalds Cc: Mel Gorman Cc: Peter Anvin Cc: Ingo Molnar Cc: Steven Noonan Cc: Rik van Riel Cc: David Vrabel Cc: Peter Zijlstra Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4a0137f..69086a3 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -105,7 +105,7 @@ config X86 select HAVE_ARCH_SECCOMP_FILTER select BUILDTIME_EXTABLE_SORT select GENERIC_CMOS_UPDATE - select HAVE_ARCH_SOFT_DIRTY + select HAVE_ARCH_SOFT_DIRTY if X86_64 select CLOCKSOURCE_WATCHDOG select GENERIC_CLOCKEVENTS select ARCH_CLOCKSOURCE_DATA diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index eec82d4..206a87f 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -62,53 +62,6 @@ static inline unsigned long pte_bitop(unsigned long value, unsigned int rightshi return ((value >> rightshift) & mask) << leftshift; } -#ifdef CONFIG_MEM_SOFT_DIRTY - -/* - * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and - * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset - * into this range. 
- */ -#define PTE_FILE_MAX_BITS 28 -#define PTE_FILE_SHIFT1 (_PAGE_BIT_PRESENT + 1) -#define PTE_FILE_SHIFT2 (_PAGE_BIT_FILE + 1) -#define PTE_FILE_SHIFT3 (_PAGE_BIT_PROTNONE + 1) -#define PTE_FILE_SHIFT4 (_PAGE_BIT_SOFT_DIRTY + 1) -#define PTE_FILE_BITS1 (PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1) -#define PTE_FILE_BITS2 (PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1) -#define PTE_FILE_BITS3 (PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1) - -#define PTE_FILE_MASK1 ((1U << PTE_FILE_BITS1) - 1) -#define PTE_FILE_MASK2 ((1U << PTE_FILE_BITS2) - 1) -#define PTE_FILE_MASK3 ((1U << PTE_FILE_BITS3) - 1) - -#define PTE_FILE_LSHIFT2 (PTE_FILE_BITS1) -#define PTE_FILE_LSHIFT3 (PTE_FILE_BITS1 + PTE_FILE_BITS2) -#define PTE_FILE_LSHIFT4 (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3) - -static __always_inline pgoff_t pte_to_pgoff(pte_t pte) -{ - return (pgoff_t) - (pte_bitop(pte.pte_low, PTE_FILE_SHIFT1, PTE_FILE_MASK1, 0) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT2, PTE_FILE_MASK2, PTE_FILE_LSHIFT2) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT3, PTE_FILE_MASK3, PTE_FILE_LSHIFT3) + - pte_bitop(pte.pte_low, PTE_FILE_SHIFT4, -1UL, PTE_FILE_LSHIFT4)); -} - -static __always_inline pte_t pgoff_to_pte(pgoff_t off) -{ - return (pte_t){ - .pte_low = - pte_bitop(off, 0, PTE_FILE_MASK1, PTE_FILE_SHIFT1) + - pte_bitop(off, PTE_FILE_LSHIFT2, PTE_FILE_MASK2, PTE_FILE_SHIFT2) + - pte_bitop(off, PTE_FILE_LSHIFT3, PTE_FILE_MASK3, PTE_FILE_SHIFT3) + - pte_bitop(off, PTE_FILE_LSHIFT4, -1UL, PTE_FILE_SHIFT4) + - _PAGE_FILE, - }; -} - -#else /* CONFIG_MEM_SOFT_DIRTY */ - /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, * split up the 29 bits of offset into this range. 
@@ -145,8 +98,6 @@ static __always_inline pte_t pgoff_to_pte(pgoff_t off) }; } -#endif /* CONFIG_MEM_SOFT_DIRTY */ - /* Encode and de-code a swap entry */ #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 66276c1..0ec0560 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -297,6 +297,7 @@ static inline pmd_t pmd_mknotpresent(pmd_t pmd) return pmd_clear_flags(pmd, _PAGE_PRESENT); } +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline int pte_soft_dirty(pte_t pte) { return pte_flags(pte) & _PAGE_SOFT_DIRTY; @@ -332,6 +333,8 @@ static inline int pte_file_soft_dirty(pte_t pte) return pte_flags(pte) & _PAGE_SOFT_DIRTY; } +#endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */ + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. @@ -865,6 +868,7 @@ static inline void update_mmu_cache_pmd(struct vm_area_struct *vma, { } +#ifdef CONFIG_HAVE_ARCH_SOFT_DIRTY static inline pte_t pte_swp_mksoft_dirty(pte_t pte) { VM_BUG_ON(pte_present_nonuma(pte)); @@ -882,6 +886,7 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte) VM_BUG_ON(pte_present_nonuma(pte)); return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY); } +#endif #include #endif /* __ASSEMBLY__ */ -- cgit v0.10.2 From ac7695012a6f3269acd80d6c2b2218a6769edbf3 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:17 -0700 Subject: mm/rmap.c: make page_referenced_one() and try_to_unmap_one() static KSM was converted to use rmap_walk() and now nobody uses these functions outside mm/rmap.c. Let's convert them back to static. Signed-off-by: Kirill A.
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/rmap.h b/include/linux/rmap.h index b66c211..9be55c7 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -183,14 +183,10 @@ static inline void page_dup_rmap(struct page *page) */ int page_referenced(struct page *, int is_locked, struct mem_cgroup *memcg, unsigned long *vm_flags); -int page_referenced_one(struct page *, struct vm_area_struct *, - unsigned long address, void *arg); #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) int try_to_unmap(struct page *, enum ttu_flags flags); -int try_to_unmap_one(struct page *, struct vm_area_struct *, - unsigned long address, void *arg); /* * Called from mm/filemap_xip.c to unmap empty zero page diff --git a/mm/rmap.c b/mm/rmap.c index 7da400d..8754e1f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -671,7 +671,7 @@ struct page_referenced_arg { /* * arg: page_referenced_arg will be passed */ -int page_referenced_one(struct page *page, struct vm_area_struct *vma, +static int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; @@ -1114,7 +1114,7 @@ out: /* * @arg: enum ttu_flags will be passed to this argument */ -int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, +static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, unsigned long address, void *arg) { struct mm_struct *mm = vma->vm_mm; -- cgit v0.10.2 From b46e14acb816038bda92f6aa0dd2c4554fe64d24 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:08:18 -0700 Subject: mm/mempolicy.c: parameter doc uniformization Also fixes kernel-doc warning Signed-off-by: Fabian Frederick Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b09586d..7f7864b 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1606,9 +1606,9 @@ COMPAT_SYSCALL_DEFINE6(mbind, compat_ulong_t, start, 
compat_ulong_t, len, /* * get_vma_policy(@task, @vma, @addr) - * @task - task for fallback if vma policy == default - * @vma - virtual memory area whose policy is sought - * @addr - address in @vma for shared policy lookup + * @task: task for fallback if vma policy == default + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup * * Returns effective policy for a VMA at specified address. * Falls back to @task or system default policy, as necessary. @@ -1854,11 +1854,11 @@ int node_random(const nodemask_t *maskp) #ifdef CONFIG_HUGETLBFS /* * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) - * @vma = virtual memory area whose policy is sought - * @addr = address in @vma for shared policy lookup and interleave policy - * @gfp_flags = for requested zone - * @mpol = pointer to mempolicy pointer for reference counted mempolicy - * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask + * @vma: virtual memory area whose policy is sought + * @addr: address in @vma for shared policy lookup and interleave policy + * @gfp_flags: for requested zone + * @mpol: pointer to mempolicy pointer for reference counted mempolicy + * @nodemask: pointer to nodemask pointer for MPOL_BIND nodemask * * Returns a zonelist suitable for a huge page allocation and a pointer * to the struct mempolicy for conditional unref after allocation. @@ -2270,9 +2270,9 @@ static void sp_free(struct sp_node *n) /** * mpol_misplaced - check whether current page node is valid in policy * - * @page - page to be checked - * @vma - vm area where page mapped - * @addr - virtual address where page mapped + * @page: page to be checked + * @vma: vm area where page mapped + * @addr: virtual address where page mapped * * Lookup current policy node id for vma,addr and "compare to" page's * node id. 
-- cgit v0.10.2 From af4459d3636790735fccd83f0337c8380a0a4cc2 Mon Sep 17 00:00:00 2001 From: Emil Medve Date: Wed, 4 Jun 2014 16:08:19 -0700 Subject: arch/x86/mm/numa.c: use for_each_memblock() Signed-off-by: Emil Medve Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Yinghai Lu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1d045f9..a32b706 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -559,7 +559,7 @@ static void __init numa_clear_kernel_node_hotplug(void) int i, nid; nodemask_t numa_kernel_nodes = NODE_MASK_NONE; unsigned long start, end; - struct memblock_type *type = &memblock.reserved; + struct memblock_region *r; /* * At this time, all memory regions reserved by memblock are @@ -573,8 +573,8 @@ static void __init numa_clear_kernel_node_hotplug(void) } /* Mark all kernel nodes. */ - for (i = 0; i < type->cnt; i++) - node_set(type->regions[i].nid, numa_kernel_nodes); + for_each_memblock(reserved, r) + node_set(r->nid, numa_kernel_nodes); /* Clear MEMBLOCK_HOTPLUG flag for memory in kernel nodes. */ for (i = 0; i < numa_meminfo.nr_blks; i++) { -- cgit v0.10.2 From 3fb1c8dcfcda2f5bfb7d79d8b08bf2f04b1eed8f Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:08:20 -0700 Subject: mm: update comment for DEFAULT_MAX_MAP_COUNT With ELF extended numbering 16-bit bound is not hard limit any more. [akpm@linux-foundation.org: fix typo] Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 8045a55..596a0e0 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -25,6 +25,10 @@ enum { sysctl_hung_task_timeout_secs = 0 }; * Because the kernel adds some informative sections to a image of program at * generating coredump, we need some margin. The number of extra sections is * 1-3 now and depends on arch. 
We use "5" as safe margin, here. + * + * ELF extended numbering allows more than 65535 sections, so 16-bit bound is + * not a hard limit any more. Although some userspace tools can be surprised by + * that. */ #define MAPCOUNT_ELF_CORE_MARGIN (5) #define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN) -- cgit v0.10.2 From bdcbb659fe630fc64f6604e99a180bb2ccc630c2 Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Wed, 4 Jun 2014 16:08:21 -0700 Subject: memcg: fold mem_cgroup_stolen It is only used in __mem_cgroup_begin_update_page_stat(); the name is confusing, and having 2 routines for one thing also confuses people, so folding this function into its only caller seems clearer. [akpm@linux-foundation.org: fix typo, per Michal] Signed-off-by: Qiang Huang Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 03d7662..4a9dfc8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1594,23 +1594,12 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg) } /* - * 2 routines for checking "mem" is under move_account() or not. + * A routine for checking "mem" is under move_account() or not. * - * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This - * is used for avoiding races in accounting. If true, - * pc->mem_cgroup may be overwritten. - * - * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or - * under hierarchy of moving cgroups. This is for - * waiting at hith-memory prressure caused by "move". + * Checking a cgroup is mc.from or mc.to or under hierarchy of + * moving cgroups. This is for waiting at high-memory pressure + * caused by "move".
*/ - -static bool mem_cgroup_stolen(struct mem_cgroup *memcg) -{ - VM_BUG_ON(!rcu_read_lock_held()); - return atomic_read(&memcg->moving_account) > 0; -} - static bool mem_cgroup_under_move(struct mem_cgroup *memcg) { struct mem_cgroup *from; @@ -1653,7 +1642,6 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) * Take this lock when * - a code tries to modify page's memcg while it's USED. * - a code tries to modify page state accounting in a memcg. - * see mem_cgroup_stolen(), too. */ static void move_lock_mem_cgroup(struct mem_cgroup *memcg, unsigned long *flags) @@ -2326,9 +2314,10 @@ again: * If this memory cgroup is not under account moving, we don't * need to take move_lock_mem_cgroup(). Because we already hold * rcu_read_lock(), any calls to move_account will be delayed until - * rcu_read_unlock() if mem_cgroup_stolen() == true. + * rcu_read_unlock(). */ - if (!mem_cgroup_stolen(memcg)) + VM_BUG_ON(!rcu_read_lock_held()); + if (atomic_read(&memcg->moving_account) <= 0) return; move_lock_mem_cgroup(memcg, flags); -- cgit v0.10.2 From b5ffc8560cf758422e85b786cca32cd7e1513a7f Mon Sep 17 00:00:00 2001 From: Qiang Huang Date: Wed, 4 Jun 2014 16:08:22 -0700 Subject: memcg: correct comments for __mem_cgroup_begin_update_page_stat Signed-off-by: Qiang Huang Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4a9dfc8..971d7b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2276,12 +2276,11 @@ cleanup: } /* - * Currently used to update mapped file statistics, but the routine can be - * generalized to update other statistics as well. + * Used to update mapped file or writeback or other statistics. * * Notes: Race condition * - * We usually use page_cgroup_lock() for accessing page_cgroup member but + * We usually use lock_page_cgroup() for accessing page_cgroup member but * it tends to be costly. 
But considering some conditions, we doesn't need * to do so _always_. * @@ -2295,8 +2294,8 @@ cleanup: * by flags. * * Considering "move", this is an only case we see a race. To make the race - * small, we check mm->moving_account and detect there are possibility of race - * If there is, we take a lock. + * small, we check memcg->moving_account and detect there are possibility + * of race or not. If there is, we take a lock. */ void __mem_cgroup_begin_update_page_stat(struct page *page, -- cgit v0.10.2 From 073ee1c6cd11cd190f4d0da84d9b4ba79d7b9e70 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:08:23 -0700 Subject: memcg: get rid of memcg_create_cache_name Instead of calling back to memcontrol.c from kmem_cache_create_memcg in order to just create the name of a per memcg cache, let's allocate it in place. We only need to pass the memcg name to kmem_cache_create_memcg for that - everything else can be done in slab_common.c. Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1fa2324..dfc2929 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -492,8 +492,6 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order); int memcg_cache_id(struct mem_cgroup *memcg); -char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache); int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache); void memcg_free_cache_params(struct kmem_cache *s); diff --git a/include/linux/slab.h b/include/linux/slab.h index ecbec9c..86e5b26 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -117,7 +117,8 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *, - struct kmem_cache 
*); + struct kmem_cache *, + const char *); #endif void kmem_cache_destroy(struct kmem_cache *); int kmem_cache_shrink(struct kmem_cache *); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 971d7b6..7df7f59 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3095,29 +3095,6 @@ int memcg_update_cache_size(struct kmem_cache *s, int num_groups) return 0; } -char *memcg_create_cache_name(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) -{ - static char *buf; - - /* - * We need a mutex here to protect the shared buffer. Since this is - * expected to be called only on cache creation, we can employ the - * slab_mutex for that purpose. - */ - lockdep_assert_held(&slab_mutex); - - if (!buf) { - buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!buf) - return NULL; - } - - cgroup_name(memcg->css.cgroup, buf, NAME_MAX + 1); - return kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, - memcg_cache_id(memcg), buf); -} - int memcg_alloc_cache_params(struct mem_cgroup *memcg, struct kmem_cache *s, struct kmem_cache *root_cache) { @@ -3158,6 +3135,7 @@ void memcg_free_cache_params(struct kmem_cache *s) static void memcg_kmem_create_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { + static char *memcg_name_buf; /* protected by memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -3173,7 +3151,14 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - cachep = kmem_cache_create_memcg(memcg, root_cache); + if (!memcg_name_buf) { + memcg_name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!memcg_name_buf) + return; + } + + cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); + cachep = kmem_cache_create_memcg(memcg, root_cache, memcg_name_buf); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root diff --git a/mm/slab_common.c b/mm/slab_common.c index 7e348cf..3217561 100644 --- a/mm/slab_common.c 
+++ b/mm/slab_common.c @@ -264,13 +264,15 @@ EXPORT_SYMBOL(kmem_cache_create); * kmem_cache_create_memcg - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. + * @memcg_name: The name of the memory cgroup (used for naming the new cache). * * This function attempts to create a kmem cache that will serve allocation * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. */ struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) + struct kmem_cache *root_cache, + const char *memcg_name) { struct kmem_cache *s = NULL; char *cache_name; @@ -280,7 +282,8 @@ struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, mutex_lock(&slab_mutex); - cache_name = memcg_create_cache_name(memcg, root_cache); + cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name, + memcg_cache_id(memcg), memcg_name); if (!cache_name) goto out_unlock; -- cgit v0.10.2 From 93f39eea9c229778361ae7ecf5f5e95d291757da Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:08:24 -0700 Subject: memcg: memcg_kmem_create_cache: make memcg_name_buf statically allocated It isn't worth complicating the code by allocating it on the first access, because it only takes 256 bytes. 
Signed-off-by: Vladimir Davydov Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7df7f59..5e2bfcc 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3135,7 +3135,8 @@ void memcg_free_cache_params(struct kmem_cache *s) static void memcg_kmem_create_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache) { - static char *memcg_name_buf; /* protected by memcg_slab_mutex */ + static char memcg_name_buf[NAME_MAX + 1]; /* protected by + memcg_slab_mutex */ struct kmem_cache *cachep; int id; @@ -3151,12 +3152,6 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, if (cache_from_memcg_idx(root_cache, id)) return; - if (!memcg_name_buf) { - memcg_name_buf = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!memcg_name_buf) - return; - } - cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); cachep = kmem_cache_create_memcg(memcg, root_cache, memcg_name_buf); /* -- cgit v0.10.2 From 68711a746345c44ae00c64d8dbac6a9ce13ac54a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:25 -0700 Subject: mm, migration: add destination page freeing callback Memory migration uses a callback defined by the caller to determine how to allocate destination pages. When migration fails for a source page, however, it frees the destination page back to the system. This patch adds a memory migration callback defined by the caller to determine how to free destination pages. If a caller, such as memory compaction, builds its own freelist for migration targets, this can reuse already freed memory instead of scanning additional memory. If the caller provides a function to handle freeing of destination pages, it is called when page migration fails. If the caller passes NULL then freeing back to the system will be handled as usual. This patch introduces no functional change. 
Signed-off-by: David Rientjes Reviewed-by: Naoya Horiguchi Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Greg Thelen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 84a31ad..a2901c4 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -5,7 +5,9 @@ #include #include -typedef struct page *new_page_t(struct page *, unsigned long private, int **); +typedef struct page *new_page_t(struct page *page, unsigned long private, + int **reason); +typedef void free_page_t(struct page *page, unsigned long private); /* * Return values from addresss_space_operations.migratepage(): @@ -38,7 +40,7 @@ enum migrate_reason { extern void putback_movable_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *, enum migrate_mode); -extern int migrate_pages(struct list_head *l, new_page_t x, +extern int migrate_pages(struct list_head *l, new_page_t new, free_page_t free, unsigned long private, enum migrate_mode mode, int reason); extern int migrate_prep(void); @@ -56,8 +58,9 @@ extern int migrate_page_move_mapping(struct address_space *mapping, #else static inline void putback_movable_pages(struct list_head *l) {} -static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, enum migrate_mode mode, int reason) +static inline int migrate_pages(struct list_head *l, new_page_t new, + free_page_t free, unsigned long private, enum migrate_mode mode, + int reason) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 6010aab..f74a362 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1016,7 +1016,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } nr_migrate = cc->nr_migratepages; - err = migrate_pages(&cc->migratepages, compaction_alloc, + err = migrate_pages(&cc->migratepages, compaction_alloc, NULL, (unsigned 
long)cc, cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, MR_COMPACTION); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index d50f17f..3cd1b65 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1503,7 +1503,7 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. */ list_move(&hpage->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", @@ -1584,7 +1584,7 @@ static int __soft_offline_page(struct page *page, int flags) inc_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + ret = migrate_pages(&pagelist, new_page, NULL, MPOL_MF_MOVE_ALL, MIGRATE_SYNC, MR_MEMORY_FAILURE); if (ret) { if (!list_empty(&pagelist)) { diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index cbb7ca0..469bbf5 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1394,7 +1394,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * alloc_migrate_target should be improooooved!! * migrate_pages returns # of failed pages. 
*/ - ret = migrate_pages(&source, alloc_migrate_target, 0, + ret = migrate_pages(&source, alloc_migrate_target, NULL, 0, MIGRATE_SYNC, MR_MEMORY_HOTPLUG); if (ret) putback_movable_pages(&source); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7f7864b..16bc9fa 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1028,7 +1028,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, + err = migrate_pages(&pagelist, new_node_page, NULL, dest, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1277,7 +1277,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { WARN_ON_ONCE(flags & MPOL_MF_LAZY); nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, + NULL, (unsigned long)vma, MIGRATE_SYNC, MR_MEMPOLICY_MBIND); if (nr_failed) putback_movable_pages(&pagelist); diff --git a/mm/migrate.c b/mm/migrate.c index 6247be7..2a45967 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -938,8 +938,9 @@ out: * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. */ -static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, enum migrate_mode mode) +static int unmap_and_move(new_page_t get_new_page, free_page_t put_new_page, + unsigned long private, struct page *page, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -983,11 +984,17 @@ out: page_is_file_cache(page)); putback_lru_page(page); } + /* - * Move the new page to the LRU. If migration was not successful - * then this will free the page. + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, putback_lru_page() will drop the reference grabbed + * during isolation. 
*/ - putback_lru_page(newpage); + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(newpage, private); + else + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -1016,8 +1023,9 @@ out: * will wait in the page fault for migration to complete. */ static int unmap_and_move_huge_page(new_page_t get_new_page, - unsigned long private, struct page *hpage, - int force, enum migrate_mode mode) + free_page_t put_new_page, unsigned long private, + struct page *hpage, int force, + enum migrate_mode mode) { int rc = 0; int *result = NULL; @@ -1056,20 +1064,30 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!page_mapped(hpage)) rc = move_to_new_page(new_hpage, hpage, 1, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) remove_migration_ptes(hpage, hpage); if (anon_vma) put_anon_vma(anon_vma); - if (!rc) + if (rc == MIGRATEPAGE_SUCCESS) hugetlb_cgroup_migrate(hpage, new_hpage); unlock_page(hpage); out: if (rc != -EAGAIN) putback_active_hugepage(hpage); - put_page(new_hpage); + + /* + * If migration was not successful and there's a freeing callback, use + * it. Otherwise, put_page() will drop the reference grabbed during + * isolation. + */ + if (rc != MIGRATEPAGE_SUCCESS && put_new_page) + put_new_page(new_hpage, private); + else + put_page(new_hpage); + if (result) { if (rc) *result = rc; @@ -1086,6 +1104,8 @@ out: * @from: The list of pages to be migrated. * @get_new_page: The function used to allocate free pages to be used * as the target of the page migration. + * @put_new_page: The function used to free target pages if migration + * fails, or NULL if no special handling is necessary. * @private: Private data to be passed on to get_new_page() * @mode: The migration mode that specifies the constraints for * page migration, if any. @@ -1099,7 +1119,8 @@ out: * Returns the number of pages that were not migrated, or an error code. 
*/ int migrate_pages(struct list_head *from, new_page_t get_new_page, - unsigned long private, enum migrate_mode mode, int reason) + free_page_t put_new_page, unsigned long private, + enum migrate_mode mode, int reason) { int retry = 1; int nr_failed = 0; @@ -1121,10 +1142,11 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, if (PageHuge(page)) rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, mode); + put_new_page, private, page, + pass > 2, mode); else - rc = unmap_and_move(get_new_page, private, - page, pass > 2, mode); + rc = unmap_and_move(get_new_page, put_new_page, + private, page, pass > 2, mode); switch(rc) { case -ENOMEM: @@ -1273,7 +1295,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_page_node, + err = migrate_pages(&pagelist, new_page_node, NULL, (unsigned long)pm, MIGRATE_SYNC, MR_SYSCALL); if (err) putback_movable_pages(&pagelist); @@ -1729,7 +1751,8 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma, list_add(&page->lru, &migratepages); nr_remaining = migrate_pages(&migratepages, alloc_misplaced_dst_page, - node, MIGRATE_ASYNC, MR_NUMA_MISPLACED); + NULL, node, MIGRATE_ASYNC, + MR_NUMA_MISPLACED); if (nr_remaining) { if (!list_empty(&migratepages)) { list_del(&page->lru); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 132c337..027d029 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6218,7 +6218,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, MIGRATE_SYNC, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); -- cgit v0.10.2 From d53aea3d46d64e95da9952887969f7533b9ab25e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:26 -0700 Subject: mm, compaction: return failed migration target pages back to freelist Greg reported that he found 
isolated free pages were returned back to the VM rather than the compaction freelist. This will cause holes behind the free scanner and cause it to reallocate additional memory if necessary later. He detected the problem at runtime seeing that ext4 metadata pages (esp the ones read by "sbi->s_group_desc[i] = sb_bread(sb, block)") were constantly visited by compaction calls of migrate_pages(). These pages had a non-zero b_count which caused fallback_migrate_page() -> try_to_release_page() -> try_to_free_buffers() to fail. Memory compaction works by having a "freeing scanner" scan from one end of a zone which isolates pages as migration targets while another "migrating scanner" scans from the other end of the same zone which isolates pages for migration. When page migration fails for an isolated page, the target page is returned to the system rather than the freelist built by the freeing scanner. This may require the freeing scanner to continue scanning memory after suitable migration targets have already been returned to the system needlessly. This patch returns destination pages to the freeing scanner freelist when page migration fails. This prevents unnecessary work done by the freeing scanner but also encourages memory to be as compacted as possible at the end of the zone. Signed-off-by: David Rientjes Reported-by: Greg Thelen Acked-by: Mel Gorman Acked-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index f74a362..d0c7c99 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -790,23 +790,32 @@ static struct page *compaction_alloc(struct page *migratepage, } /* - * We cannot control nr_migratepages and nr_freepages fully when migration is - * running as migrate_pages() has no knowledge of compact_control. When - * migration is complete, we count the number of pages on the lists by hand. 
+ * This is a migrate-callback that "frees" freepages back to the isolated + * freelist. All pages on the freelist are from the same zone, so there is no + * special handling needed for NUMA. + */ +static void compaction_free(struct page *page, unsigned long data) +{ + struct compact_control *cc = (struct compact_control *)data; + + list_add(&page->lru, &cc->freepages); + cc->nr_freepages++; +} + +/* + * We cannot control nr_migratepages fully when migration is running as + * migrate_pages() has no knowledge of of compact_control. When migration is + * complete, we count the number of pages on the list by hand. */ static void update_nr_listpages(struct compact_control *cc) { int nr_migratepages = 0; - int nr_freepages = 0; struct page *page; list_for_each_entry(page, &cc->migratepages, lru) nr_migratepages++; - list_for_each_entry(page, &cc->freepages, lru) - nr_freepages++; cc->nr_migratepages = nr_migratepages; - cc->nr_freepages = nr_freepages; } /* possible outcome of isolate_migratepages */ @@ -1016,8 +1025,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } nr_migrate = cc->nr_migratepages; - err = migrate_pages(&cc->migratepages, compaction_alloc, NULL, - (unsigned long)cc, + err = migrate_pages(&cc->migratepages, compaction_alloc, + compaction_free, (unsigned long)cc, cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, MR_COMPACTION); update_nr_listpages(cc); -- cgit v0.10.2 From 35979ef3393110ff3c12c6b94552208d3bdf1a36 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:27 -0700 Subject: mm, compaction: add per-zone migration pfn cache for async compaction Each zone has a cached migration scanner pfn for memory compaction so that subsequent calls to memory compaction can start where the previous call left off. Currently, the compaction migration scanner only updates the per-zone cached pfn when pageblocks were not skipped for async compaction. 
This creates a dependency on calling sync compaction to avoid having subsequent calls to async compaction from scanning an enormous amount of non-MOVABLE pageblocks each time it is called. On large machines, this could be potentially very expensive. This patch adds a per-zone cached migration scanner pfn only for async compaction. It is updated every time a pageblock has been scanned in its entirety and when no pages from it were successfully isolated. The cached migration scanner pfn for sync compaction is updated only when called for sync compaction. Signed-off-by: David Rientjes Acked-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Cc: Greg Thelen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index ae693e1..10a96ee 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -360,9 +360,10 @@ struct zone { /* Set to true when the PG_migrate_skip bits should be cleared */ bool compact_blockskip_flush; - /* pfns where compaction scanners should start */ + /* pfn where compaction free scanner should start */ unsigned long compact_cached_free_pfn; - unsigned long compact_cached_migrate_pfn; + /* pfn where async and sync compaction migration scanner should start */ + unsigned long compact_cached_migrate_pfn[2]; #endif #ifdef CONFIG_MEMORY_HOTPLUG /* see spanned/present_pages for more description */ diff --git a/mm/compaction.c b/mm/compaction.c index d0c7c99..70c0f8c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -89,7 +89,8 @@ static void __reset_isolation_suitable(struct zone *zone) unsigned long end_pfn = zone_end_pfn(zone); unsigned long pfn; - zone->compact_cached_migrate_pfn = start_pfn; + zone->compact_cached_migrate_pfn[0] = start_pfn; + zone->compact_cached_migrate_pfn[1] = start_pfn; zone->compact_cached_free_pfn = end_pfn; zone->compact_blockskip_flush = false; @@ -131,9 +132,10 @@ void reset_isolation_suitable(pg_data_t *pgdat) */ static void
update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { struct zone *zone = cc->zone; + unsigned long pfn; if (cc->ignore_skip_hint) return; @@ -141,20 +143,31 @@ static void update_pageblock_skip(struct compact_control *cc, if (!page) return; - if (!nr_isolated) { - unsigned long pfn = page_to_pfn(page); + if (nr_isolated) + return; + + /* + * Only skip pageblocks when all forms of compaction will be known to + * fail in the near future. + */ + if (set_unsuitable) set_pageblock_skip(page); - /* Update where compaction should restart */ - if (migrate_scanner) { - if (!cc->finished_update_migrate && - pfn > zone->compact_cached_migrate_pfn) - zone->compact_cached_migrate_pfn = pfn; - } else { - if (!cc->finished_update_free && - pfn < zone->compact_cached_free_pfn) - zone->compact_cached_free_pfn = pfn; - } + pfn = page_to_pfn(page); + + /* Update where async and sync compaction should restart */ + if (migrate_scanner) { + if (cc->finished_update_migrate) + return; + if (pfn > zone->compact_cached_migrate_pfn[0]) + zone->compact_cached_migrate_pfn[0] = pfn; + if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1]) + zone->compact_cached_migrate_pfn[1] = pfn; + } else { + if (cc->finished_update_free) + return; + if (pfn < zone->compact_cached_free_pfn) + zone->compact_cached_free_pfn = pfn; } } #else @@ -166,7 +179,7 @@ static inline bool isolation_suitable(struct compact_control *cc, static void update_pageblock_skip(struct compact_control *cc, struct page *page, unsigned long nr_isolated, - bool migrate_scanner) + bool set_unsuitable, bool migrate_scanner) { } #endif /* CONFIG_COMPACTION */ @@ -323,7 +336,8 @@ isolate_fail: /* Update the pageblock-skip if the whole pageblock was scanned */ if (blockpfn == end_pfn) - update_pageblock_skip(cc, valid_page, total_isolated, false); + update_pageblock_skip(cc, valid_page, total_isolated, true, + false); 
count_compact_events(COMPACTFREE_SCANNED, nr_scanned); if (total_isolated) @@ -458,7 +472,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, unsigned long flags; bool locked = false; struct page *page = NULL, *valid_page = NULL; - bool skipped_async_unsuitable = false; + bool set_unsuitable = true; const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | (unevictable ? ISOLATE_UNEVICTABLE : 0); @@ -535,8 +549,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ mt = get_pageblock_migratetype(page); if (!cc->sync && !migrate_async_suitable(mt)) { - cc->finished_update_migrate = true; - skipped_async_unsuitable = true; + set_unsuitable = false; goto next_pageblock; } } @@ -640,11 +653,10 @@ next_pageblock: /* * Update the pageblock-skip information and cached scanner pfn, * if the whole pageblock was scanned without isolating any page. - * This is not done when pageblock was skipped due to being unsuitable - * for async compaction, so that eventual sync compaction can try. */ - if (low_pfn == end_pfn && !skipped_async_unsuitable) - update_pageblock_skip(cc, valid_page, nr_isolated, true); + if (low_pfn == end_pfn) + update_pageblock_skip(cc, valid_page, nr_isolated, + set_unsuitable, true); trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); @@ -868,7 +880,8 @@ static int compact_finished(struct zone *zone, /* Compaction run completes if the migrate and free scanner meet */ if (cc->free_pfn <= cc->migrate_pfn) { /* Let the next compaction start anew. 
*/ - zone->compact_cached_migrate_pfn = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[0] = zone->zone_start_pfn; + zone->compact_cached_migrate_pfn[1] = zone->zone_start_pfn; zone->compact_cached_free_pfn = zone_end_pfn(zone); /* @@ -993,7 +1006,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. */ - cc->migrate_pfn = zone->compact_cached_migrate_pfn; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -1001,7 +1014,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } if (cc->migrate_pfn < start_pfn || cc->migrate_pfn > end_pfn) { cc->migrate_pfn = start_pfn; - zone->compact_cached_migrate_pfn = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[0] = cc->migrate_pfn; + zone->compact_cached_migrate_pfn[1] = cc->migrate_pfn; } trace_mm_compaction_begin(start_pfn, cc->migrate_pfn, cc->free_pfn, end_pfn); -- cgit v0.10.2 From e0b9daeb453e602a95ea43853dc12d385558ce1f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:28 -0700 Subject: mm, compaction: embed migration mode in compact_control We're going to want to manipulate the migration mode for compaction in the page allocator, and currently compact_control's sync field is only a bool. Currently, we only do MIGRATE_ASYNC or MIGRATE_SYNC_LIGHT compaction depending on the value of this bool. Convert the bool to enum migrate_mode and pass the migration mode in directly. Later, we'll want to avoid MIGRATE_SYNC_LIGHT for thp allocations in the pagefault path to avoid unnecessary latency. This also alters compaction triggered from sysfs, either for the entire system or for a node, to force MIGRATE_SYNC.
[akpm@linux-foundation.org: fix build] [iamjoonsoo.kim@lge.com: use MIGRATE_SYNC in alloc_contig_range()] Signed-off-by: David Rientjes Suggested-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Greg Thelen Cc: Naoya Horiguchi Signed-off-by: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 7e1c76e..01e3132 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,7 +22,7 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask, - bool sync, bool *contended); + enum migrate_mode mode, bool *contended); extern void compact_pgdat(pg_data_t *pgdat, int order); extern void reset_isolation_suitable(pg_data_t *pgdat); extern unsigned long compaction_suitable(struct zone *zone, int order); @@ -91,7 +91,7 @@ static inline bool compaction_restarting(struct zone *zone, int order) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum migrate_mode mode, bool *contended) { return COMPACT_CONTINUE; } diff --git a/mm/compaction.c b/mm/compaction.c index 70c0f8c..217a6ad 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -161,7 +161,8 @@ static void update_pageblock_skip(struct compact_control *cc, return; if (pfn > zone->compact_cached_migrate_pfn[0]) zone->compact_cached_migrate_pfn[0] = pfn; - if (cc->sync && pfn > zone->compact_cached_migrate_pfn[1]) + if (cc->mode != MIGRATE_ASYNC && + pfn > zone->compact_cached_migrate_pfn[1]) zone->compact_cached_migrate_pfn[1] = pfn; } else { if (cc->finished_update_free) @@ -208,7 +209,7 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, } /* async aborts if taking too long or contended 
*/ - if (!cc->sync) { + if (cc->mode == MIGRATE_ASYNC) { cc->contended = true; return false; } @@ -473,7 +474,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, bool locked = false; struct page *page = NULL, *valid_page = NULL; bool set_unsuitable = true; - const isolate_mode_t mode = (!cc->sync ? ISOLATE_ASYNC_MIGRATE : 0) | + const isolate_mode_t mode = (cc->mode == MIGRATE_ASYNC ? + ISOLATE_ASYNC_MIGRATE : 0) | (unevictable ? ISOLATE_UNEVICTABLE : 0); /* @@ -483,7 +485,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (!cc->sync) + if (cc->mode == MIGRATE_ASYNC) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -548,7 +550,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * the minimum amount of work satisfies the allocation */ mt = get_pageblock_migratetype(page); - if (!cc->sync && !migrate_async_suitable(mt)) { + if (cc->mode == MIGRATE_ASYNC && + !migrate_async_suitable(mt)) { set_unsuitable = false; goto next_pageblock; } @@ -981,6 +984,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) int ret; unsigned long start_pfn = zone->zone_start_pfn; unsigned long end_pfn = zone_end_pfn(zone); + const bool sync = cc->mode != MIGRATE_ASYNC; ret = compaction_suitable(zone, cc->order); switch (ret) { @@ -1006,7 +1010,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) * information on where the scanners should start but check that it * is initialised by ensuring the values are within zone boundaries. 
*/ - cc->migrate_pfn = zone->compact_cached_migrate_pfn[cc->sync]; + cc->migrate_pfn = zone->compact_cached_migrate_pfn[sync]; cc->free_pfn = zone->compact_cached_free_pfn; if (cc->free_pfn < start_pfn || cc->free_pfn > end_pfn) { cc->free_pfn = end_pfn & ~(pageblock_nr_pages-1); @@ -1040,8 +1044,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - compaction_free, (unsigned long)cc, - cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC, + compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -1074,9 +1077,8 @@ out: return ret; } -static unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync, bool *contended) +static unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask, enum migrate_mode mode, bool *contended) { unsigned long ret; struct compact_control cc = { @@ -1085,7 +1087,7 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .sync = sync, + .mode = mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -1107,7 +1109,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from - * @sync: Whether migration is synchronous or not + * @mode: The migration mode for async, sync light, or sync migration * @contended: Return value that is true if compaction was aborted due to lock contention * @page: Optionally capture a free page of the requested order during compaction * @@ -1115,7 +1117,7 @@ int sysctl_extfrag_threshold = 500; */ unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *nodemask, - bool sync, bool *contended) + enum 
migrate_mode mode, bool *contended) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -1140,7 +1142,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync, + status = compact_zone_order(zone, order, gfp_mask, mode, contended); rc = max(status, rc); @@ -1190,7 +1192,7 @@ void compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .sync = false, + .mode = MIGRATE_ASYNC, }; if (!order) @@ -1203,7 +1205,7 @@ static void compact_node(int nid) { struct compact_control cc = { .order = -1, - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; diff --git a/mm/internal.h b/mm/internal.h index 6ee580d..a25424a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -134,7 +134,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - bool sync; /* Synchronous migration */ + enum migrate_mode mode; /* Async or sync migration mode */ bool ignore_skip_hint; /* Scan blocks even if marked skip */ bool finished_update_free; /* True when the zone cached pfns are * no longer being updated diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 027d029..afb29da 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2217,7 +2217,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, + int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2231,7 +2231,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, current->flags |= PF_MEMALLOC; *did_some_progress 
= try_to_compact_pages(zonelist, order, gfp_mask, - nodemask, sync_migration, + nodemask, mode, contended_compaction); current->flags &= ~PF_MEMALLOC; @@ -2264,7 +2264,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, * As async compaction considers a subset of pageblocks, only * defer if the failure was a sync compaction failure. */ - if (sync_migration) + if (mode != MIGRATE_ASYNC) defer_compaction(preferred_zone, order); cond_resched(); @@ -2277,9 +2277,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, bool sync_migration, - bool *contended_compaction, bool *deferred_compaction, - unsigned long *did_some_progress) + int migratetype, enum migrate_mode mode, bool *contended_compaction, + bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; } @@ -2474,7 +2473,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - bool sync_migration = false; + enum migrate_mode migration_mode = MIGRATE_ASYNC; bool deferred_compaction = false; bool contended_compaction = false; @@ -2568,17 +2567,15 @@ rebalance: * Try direct compaction. The first pass is asynchronous. 
Subsequent * attempts after direct reclaim are synchronous */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) goto got_pg; - sync_migration = true; + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because @@ -2653,12 +2650,10 @@ rebalance: * direct reclaim and reclaim/compaction depends on compaction * being called after reclaim so call directly if necessary */ - page = __alloc_pages_direct_compact(gfp_mask, order, - zonelist, high_zoneidx, - nodemask, - alloc_flags, preferred_zone, - migratetype, sync_migration, - &contended_compaction, + page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, + high_zoneidx, nodemask, alloc_flags, + preferred_zone, migratetype, + migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); if (page) @@ -6218,7 +6213,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, cc->nr_migratepages -= nr_reclaimed; ret = migrate_pages(&cc->migratepages, alloc_migrate_target, - NULL, 0, MIGRATE_SYNC, MR_CMA); + NULL, 0, cc->mode, MR_CMA); } if (ret < 0) { putback_movable_pages(&cc->migratepages); @@ -6257,7 +6252,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .sync = true, + .mode = MIGRATE_SYNC, .ignore_skip_hint = true, }; INIT_LIST_HEAD(&cc.migratepages); -- cgit v0.10.2 From 75f30861a12a6b09b759dfeeb9290b681af89057 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:30 -0700 Subject: mm, thp: avoid excessive compaction latency during fault Synchronous memory 
compaction can be very expensive: it can iterate an enormous amount of memory without aborting, constantly rescheduling, waiting on page locks and lru_lock, etc, if a pageblock cannot be defragmented. Unfortunately, it's too expensive for transparent hugepage page faults and it's much better to simply fall back to pages. On 128GB machines, we find that synchronous memory compaction can take O(seconds) for a single thp fault. Now that async compaction remembers where it left off without strictly relying on sync compaction, this makes thp allocations best-effort without causing egregious latency during fault. We still need to retry async compaction after reclaim, but this won't stall for seconds. Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Greg Thelen Cc: Naoya Horiguchi Cc: Vlastimil Babka Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index afb29da..d88d675 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2575,7 +2575,14 @@ rebalance: &did_some_progress); if (page) goto got_pg; - migration_mode = MIGRATE_SYNC_LIGHT; + + /* + * It can become very expensive to allocate transparent hugepages at + * fault, so use asynchronous memory compaction for THP unless it is + * khugepaged trying to collapse. + */ + if (!(gfp_mask & __GFP_NO_KSWAPD) || (current->flags & PF_KTHREAD)) + migration_mode = MIGRATE_SYNC_LIGHT; /* * If compaction is deferred for high-order allocations, it is because -- cgit v0.10.2 From aeef4b83806f49a0c454b7d4578671b71045bee2 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 4 Jun 2014 16:08:31 -0700 Subject: mm, compaction: terminate async compaction when rescheduling Async compaction terminates prematurely when need_resched(), see compact_checklock_irqsave(). This can never trigger, however, if the cond_resched() in isolate_migratepages_range() always takes care of the scheduling.
If the cond_resched() actually triggers, then terminate this pageblock scan for async compaction as well. Signed-off-by: David Rientjes Acked-by: Mel Gorman Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 217a6ad..56331f5 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -494,8 +494,13 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, return 0; } + if (cond_resched()) { + /* Async terminates prematurely on need_resched() */ + if (cc->mode == MIGRATE_ASYNC) + return 0; + } + /* Time to isolate some pages for migration */ - cond_resched(); for (; low_pfn < end_pfn; low_pfn++) { /* give a chance to irqs before checking need_resched() */ if (locked && !(low_pfn % SWAP_CLUSTER_MAX)) { -- cgit v0.10.2 From f8c9301fa5a2a8b873c67f2a3d8230d5c13f61b7 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:08:32 -0700 Subject: mm/compaction: do not count migratepages when unnecessary During compaction, update_nr_listpages() has been used to count remaining non-migrated and free pages after a call to migrate_pages(). The freepages counting has become unnecessary, and it turns out that migratepages counting is also unnecessary in most cases. The only situation when it's needed to count cc->migratepages is when migrate_pages() returns with a negative error code. Otherwise, the non-negative return value is the number of pages that were not migrated, which is exactly the count of remaining pages in the cc->migratepages list. Furthermore, any non-zero count is only interesting for the tracepoint of mm_compaction_migratepages events, because after that all remaining unmigrated pages are put back and their count is set to 0.
This patch therefore removes update_nr_listpages() completely, and changes the tracepoint definition so that the manual counting is done only when the tracepoint is enabled, and only when migrate_pages() returns a negative error code. Furthermore, migrate_pages() and the tracepoints won't be called when there's nothing to migrate. This potentially avoids some wasted cycles and reduces the volume of uninteresting mm_compaction_migratepages events where "nr_migrated=0 nr_failed=0". In the stress-highalloc mmtest, this was about 75% of the events. The mm_compaction_isolate_migratepages event is better for determining that nothing was isolated for migration, and this one was just duplicating the info. Signed-off-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Acked-by: Michal Nazarewicz Cc: Christoph Lameter Cc: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 06f544e..c6814b9 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -5,6 +5,7 @@ #define _TRACE_COMPACTION_H #include +#include #include #include @@ -47,10 +48,11 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TRACE_EVENT(mm_compaction_migratepages, - TP_PROTO(unsigned long nr_migrated, - unsigned long nr_failed), + TP_PROTO(unsigned long nr_all, + int migrate_rc, + struct list_head *migratepages), - TP_ARGS(nr_migrated, nr_failed), + TP_ARGS(nr_all, migrate_rc, migratepages), TP_STRUCT__entry( __field(unsigned long, nr_migrated) @@ -58,7 +60,22 @@ TRACE_EVENT(mm_compaction_migratepages, ), TP_fast_assign( - __entry->nr_migrated = nr_migrated; + unsigned long nr_failed = 0; + struct list_head *page_lru; + + /* + * migrate_pages() returns either a non-negative number + * with the number of pages that failed migration, or an + * 
error code, in which case we need to count the remaining + * pages manually + */ + if (migrate_rc >= 0) + nr_failed = migrate_rc; + else + list_for_each(page_lru, migratepages) + nr_failed++; + + __entry->nr_migrated = nr_all - nr_failed; __entry->nr_failed = nr_failed; ), diff --git a/mm/compaction.c b/mm/compaction.c index 56331f5..3c60e3d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -822,22 +822,6 @@ static void compaction_free(struct page *page, unsigned long data) cc->nr_freepages++; } -/* - * We cannot control nr_migratepages fully when migration is running as - * migrate_pages() has no knowledge of of compact_control. When migration is - * complete, we count the number of pages on the list by hand. - */ -static void update_nr_listpages(struct compact_control *cc) -{ - int nr_migratepages = 0; - struct page *page; - - list_for_each_entry(page, &cc->migratepages, lru) - nr_migratepages++; - - cc->nr_migratepages = nr_migratepages; -} - /* possible outcome of isolate_migratepages */ typedef enum { ISOLATE_ABORT, /* Abort compaction now */ @@ -1032,7 +1016,6 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) migrate_prep_local(); while ((ret = compact_finished(zone, cc)) == COMPACT_CONTINUE) { - unsigned long nr_migrate, nr_remaining; int err; switch (isolate_migratepages(zone, cc)) { @@ -1047,20 +1030,20 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) ; } - nr_migrate = cc->nr_migratepages; + if (!cc->nr_migratepages) + continue; + err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, MR_COMPACTION); - update_nr_listpages(cc); - nr_remaining = cc->nr_migratepages; - trace_mm_compaction_migratepages(nr_migrate - nr_remaining, - nr_remaining); + trace_mm_compaction_migratepages(cc->nr_migratepages, err, + &cc->migratepages); - /* Release isolated pages not migrated */ + /* All pages were either migrated or will be released */ + cc->nr_migratepages = 0; if 
(err) { putback_movable_pages(&cc->migratepages); - cc->nr_migratepages = 0; /* * migrate_pages() may return -ENOMEM when scanners meet * and we want compact_finished() to detect it -- cgit v0.10.2 From e9ade569910a82614ff5f2c2cea2b65a8d785da4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:08:34 -0700 Subject: mm/compaction: avoid rescanning pageblocks in isolate_freepages The compaction free scanner in isolate_freepages() currently remembers PFN of the highest pageblock where it successfully isolates, to be used as the starting pageblock for the next invocation. The rationale behind this is that page migration might return free pages to the allocator when migration fails and we don't want to skip them if the compaction continues. Since migration now returns free pages back to compaction code where they can be reused, this is no longer a concern. This patch changes isolate_freepages() so that the PFN for restarting is updated with each pageblock where isolation is attempted. Using stress-highalloc from mmtests, this resulted in 10% reduction of the pages scanned by the free scanner. Note that the somewhat similar functionality that records highest successful pageblock in zone->compact_cached_free_pfn, remains unchanged. This cache is used when the whole compaction is restarted, not for multiple invocations of the free scanner during single compaction. 
Signed-off-by: Vlastimil Babka Cc: Minchan Kim Cc: Mel Gorman Cc: Joonsoo Kim Cc: Bartlomiej Zolnierkiewicz Acked-by: Michal Nazarewicz Reviewed-by: Naoya Horiguchi Cc: Christoph Lameter Cc: Rik van Riel Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 3c60e3d..5844122 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -688,7 +688,6 @@ static void isolate_freepages(struct zone *zone, unsigned long block_start_pfn; /* start of current pageblock */ unsigned long block_end_pfn; /* end of current pageblock */ unsigned long low_pfn; /* lowest pfn scanner is able to scan */ - unsigned long next_free_pfn; /* start pfn for scaning at next round */ int nr_freepages = cc->nr_freepages; struct list_head *freelist = &cc->freepages; @@ -709,12 +708,6 @@ static void isolate_freepages(struct zone *zone, low_pfn = ALIGN(cc->migrate_pfn + 1, pageblock_nr_pages); /* - * If no pages are isolated, the block_start_pfn < low_pfn check - * will kick in. - */ - next_free_pfn = 0; - - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate * and free page scanners meet or enough free pages are isolated. @@ -754,19 +747,19 @@ static void isolate_freepages(struct zone *zone, continue; /* Found a block suitable for isolating free pages from */ + cc->free_pfn = block_start_pfn; isolated = isolate_freepages_block(cc, block_start_pfn, block_end_pfn, freelist, false); nr_freepages += isolated; /* - * Record the highest PFN we isolated pages from. When next - * looking for free pages, the search will restart here as - * page migration may have returned some pages to the allocator + * Set a flag that we successfully isolated in this pageblock. + * In the next loop iteration, zone->compact_cached_free_pfn + * will not be updated and thus it will effectively contain the + * highest pageblock we isolated pages from. 
*/ - if (isolated && next_free_pfn == 0) { + if (isolated) cc->finished_update_free = true; - next_free_pfn = block_start_pfn; - } } /* split_free_page does not map the pages */ @@ -777,9 +770,8 @@ static void isolate_freepages(struct zone *zone, * so that compact_finished() may detect this */ if (block_start_pfn < low_pfn) - next_free_pfn = cc->migrate_pfn; + cc->free_pfn = cc->migrate_pfn; - cc->free_pfn = next_free_pfn; cc->nr_freepages = nr_freepages; } -- cgit v0.10.2 From bea04b073292b2acb522c7c1aa67a4fc58151530 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:09:51 -0700 Subject: mm: use the light version __mod_zone_page_state in mlocked_vma_newpage() mlocked_vma_newpage() is called with pte lock held(a spinlock), which implies preemtion disabled, and the vm stat counter is not modified from interrupt context, so we need not use an irq-safe mod_zone_page_state() here, using a light-weight version __mod_zone_page_state() would be OK. This patch also documents __mod_zone_page_state() and some of its callsites. The comment above __mod_zone_page_state() is from Hugh Dickins, and acked by Christoph. Most credits to Hugh and Christoph for the clarification on the usage of the __mod_zone_page_state(). [akpm@linux-foundation.org: coding-style fixes] Suggested-by: Andrew Morton Acked-by: Hugh Dickins Signed-off-by: Jianyu Zhan Reviewed-by: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index a25424a..e067984 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -201,7 +201,12 @@ static inline int mlocked_vma_newpage(struct vm_area_struct *vma, return 0; if (!TestSetPageMlocked(page)) { - mod_zone_page_state(page_zone(page), NR_MLOCK, + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. 
+ */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, hpage_nr_pages(page)); count_vm_event(UNEVICTABLE_PGMLOCKED); } diff --git a/mm/rmap.c b/mm/rmap.c index 8754e1f..4644e10 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -988,6 +988,12 @@ void do_page_add_anon_rmap(struct page *page, { int first = atomic_inc_and_test(&page->_mapcount); if (first) { + /* + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption + * disabled. + */ if (PageTransHuge(page)) __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); @@ -1079,6 +1085,11 @@ void page_remove_rmap(struct page *page) /* * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED * and not charged by memcg for now. + * + * We use the irq-unsafe __{inc|mod}_zone_page_stat because + * these counters are not modified in interrupt context, and + * these counters are not modified in interrupt context, and + * pte lock(a spinlock) is held, which implies preemption disabled. */ if (unlikely(PageHuge(page))) goto out; diff --git a/mm/vmstat.c b/mm/vmstat.c index 376bd2d..b37bd49 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -207,7 +207,9 @@ void set_pgdat_percpu_threshold(pg_data_t *pgdat, } /* - * For use when we know that interrupts are disabled. + * For use when we know that interrupts are disabled, + * or when we know that preemption is disabled and that + * particular counter cannot be updated from interrupt context. */ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item, int delta) -- cgit v0.10.2 From 7ee07a44eb53374a73544ae14c71366a02d462e0 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:09:52 -0700 Subject: mm: fold mlocked_vma_newpage() into its only call site In previous commit(mm: use the light version __mod_zone_page_state in mlocked_vma_newpage()) a irq-unsafe __mod_zone_page_state is used. 
And as suggested by Andrew, to reduce the risk of new call sites incorrectly using mlocked_vma_newpage() without knowing they are introducing a race, this patch folds mlocked_vma_newpage() into its only call site, page_add_new_anon_rmap, to make it open-coded so that people know what is going on. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jianyu Zhan Suggested-by: Andrew Morton Suggested-by: Hugh Dickins Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/internal.h b/mm/internal.h index e067984..802c3a4 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -189,31 +189,6 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma) } /* - * Called only in fault path, to determine if a new page is being - * mapped into a LOCKED vma. If it is, mark page as mlocked. - */ -static inline int mlocked_vma_newpage(struct vm_area_struct *vma, - struct page *page) -{ - VM_BUG_ON_PAGE(PageLRU(page), page); - - if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) - return 0; - - if (!TestSetPageMlocked(page)) { - /* - * We use the irq-unsafe __mod_zone_page_stat because this - * counter is not modified from interrupt context, and the pte - * lock is held(spinlock), which implies preemption disabled. - */ - __mod_zone_page_state(page_zone(page), NR_MLOCK, - hpage_nr_pages(page)); - count_vm_event(UNEVICTABLE_PGMLOCKED); - } - return 1; -} - -/* * must be called with vma's mmap_sem held for read or write, and page locked.
*/ extern void mlock_vma_page(struct page *page); @@ -255,10 +230,6 @@ extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma); #endif #else /* !CONFIG_MMU */ -static inline int mlocked_vma_newpage(struct vm_area_struct *v, struct page *p) -{ - return 0; -} static inline void clear_page_mlock(struct page *page) { } static inline void mlock_vma_page(struct page *page) { } static inline void mlock_migrate_page(struct page *new, struct page *old) { } diff --git a/mm/rmap.c b/mm/rmap.c index 4644e10..e375ce4 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1032,11 +1032,25 @@ void page_add_new_anon_rmap(struct page *page, __mod_zone_page_state(page_zone(page), NR_ANON_PAGES, hpage_nr_pages(page)); __page_set_anon_rmap(page, vma, address, 1); - if (!mlocked_vma_newpage(vma, page)) { + + VM_BUG_ON_PAGE(PageLRU(page), page); + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) { SetPageActive(page); lru_cache_add(page); - } else - add_page_to_unevictable_list(page); + return; + } + + if (!TestSetPageMlocked(page)) { + /* + * We use the irq-unsafe __mod_zone_page_stat because this + * counter is not modified from interrupt context, and the pte + * lock is held(spinlock), which implies preemption disabled. + */ + __mod_zone_page_state(page_zone(page), NR_MLOCK, + hpage_nr_pages(page)); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + add_page_to_unevictable_list(page); } /** -- cgit v0.10.2 From adfab836f4908deb049a5128082719e689eed964 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:09:53 -0700 Subject: swap: change swap_info singly-linked list to list_head The logic controlling the singly-linked list of swap_info_struct entries for all active, i.e. 
swapon'ed, swap targets is rather complex, because: - it stores the entries in priority order - there is a pointer to the highest priority entry - there is a pointer to the highest priority not-full entry - there is a highest_priority_index variable set outside the swap_lock - swap entries of equal priority should be used equally this complexity leads to bugs such as: https://lkml.org/lkml/2014/2/13/181 where different priority swap targets are incorrectly used equally. That bug probably could be solved with the existing singly-linked lists, but I think it would only add more complexity to the already difficult to understand get_swap_page() swap_list iteration logic. The first patch changes from a singly-linked list to a doubly-linked list using list_heads; the highest_priority_index and related code are removed and get_swap_page() starts each iteration at the highest priority swap_info entry, even if it's full. While this does introduce unnecessary list iteration (i.e. Schlemiel the painter's algorithm) in the case where one or more of the highest priority entries are full, the iteration and manipulation code is much simpler and behaves correctly re: the above bug; and the fourth patch removes the unnecessary iteration. The second patch adds some minor plist helper functions; nothing new really, just functions to match existing regular list functions. These are used by the next two patches. The third patch adds plist_requeue(), which is used by get_swap_page() in the next patch - it performs the requeueing of same-priority entries (which moves the entry to the end of its priority in the plist), so that all equal-priority swap_info_structs get used equally. The fourth patch converts the main list into a plist, and adds a new plist that contains only swap_info entries that are both active and not full. As Mel suggested using plists allows removing all the ordering code from swap - plists handle ordering automatically. 
The list naming is also clarified now that there are two lists, with the original list changed from swap_list_head to swap_active_head and the new list named swap_avail_head. A new spinlock is also added for the new list, so swap_info entries can be added or removed from the new list immediately as they become full or not full. This patch (of 4): Replace the singly-linked list tracking active, i.e. swapon'ed, swap_info_struct entries with a doubly-linked list using struct list_heads. Simplify the logic iterating and manipulating the list of entries, especially get_swap_page(), by using standard list_head functions, and removing the highest priority iteration logic. The change fixes the bug: https://lkml.org/lkml/2014/2/13/181 in which different priority swap entries after the highest priority entry are incorrectly used equally in pairs. The swap behavior is now as advertised, i.e. different priority swap entries are used in order, and equal priority swap targets are used concurrently. Signed-off-by: Dan Streetman Acked-by: Mel Gorman Cc: Shaohua Li Cc: Hugh Dickins Cc: Dan Streetman Cc: Michal Hocko Cc: Christian Ehrhardt Cc: Weijie Yang Cc: Rik van Riel Cc: Johannes Weiner Cc: Bob Liu Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Paul Gortmaker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index 5a14b92..8bb85d6d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -214,8 +214,8 @@ struct percpu_cluster { struct swap_info_struct { unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ + struct list_head list; /* entry in swap list */ signed char type; /* strange name for an index */ - signed char next; /* next type on the swap list */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ struct swap_cluster_info *cluster_info; /* cluster info. 
Only for SSD */ @@ -255,11 +255,6 @@ struct swap_info_struct { struct swap_cluster_info discard_cluster_tail; /* list tail of discard clusters */ }; -struct swap_list_t { - int head; /* head of priority-ordered swapfile list */ - int next; /* swapfile to be used next */ -}; - /* linux/mm/workingset.c */ void *workingset_eviction(struct address_space *mapping, struct page *page); bool workingset_refault(void *shadow); diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index e282624..2eab382 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -6,7 +6,7 @@ * want to expose them to the dozens of source files that include swap.h */ extern spinlock_t swap_lock; -extern struct swap_list_t swap_list; +extern struct list_head swap_list_head; extern struct swap_info_struct *swap_info[]; extern int try_to_unuse(unsigned int, bool, unsigned long); diff --git a/mm/frontswap.c b/mm/frontswap.c index 1b24bdc..fae1160 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area); static unsigned long __frontswap_curr_pages(void) { - int type; unsigned long totalpages = 0; struct swap_info_struct *si = NULL; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + list_for_each_entry(si, &swap_list_head, list) totalpages += atomic_read(&si->frontswap_pages); - } return totalpages; } @@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, int si_frontswap_pages; unsigned long total_pages_to_unuse = total; unsigned long pages = 0, pages_to_unuse = 0; - int type; assert_spin_locked(&swap_lock); - for (type = swap_list.head; type >= 0; type = si->next) { - si = swap_info[type]; + list_for_each_entry(si, &swap_list_head, list) { si_frontswap_pages = atomic_read(&si->frontswap_pages); if (total_pages_to_unuse < si_frontswap_pages) { pages = pages_to_unuse = total_pages_to_unuse; @@ -366,7 +361,7 @@ 
static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, } vm_unacct_memory(pages); *unused = pages_to_unuse; - *swapid = type; + *swapid = si->type; ret = 0; break; } @@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) /* * we don't want to hold swap_lock while doing a very * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_list.head each time + * so restart scan from swap_list_head each time */ spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); diff --git a/mm/swapfile.c b/mm/swapfile.c index 4a7f7e6..6c95a8c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages; /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; static int least_priority; -static atomic_t highest_priority_index = ATOMIC_INIT(-1); static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -struct swap_list_t swap_list = {-1, -1}; +/* + * all active swap_info_structs + * protected with swap_lock, and ordered by priority. + */ +LIST_HEAD(swap_list_head); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -640,66 +643,54 @@ no_page: swp_entry_t get_swap_page(void) { - struct swap_info_struct *si; + struct swap_info_struct *si, *next; pgoff_t offset; - int type, next; - int wrapped = 0; - int hp_index; + struct list_head *tmp; spin_lock(&swap_lock); if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; atomic_long_dec(&nr_swap_pages); - for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) { - hp_index = atomic_xchg(&highest_priority_index, -1); - /* - * highest_priority_index records current highest priority swap - * type which just frees swap entries. 
If its priority is - * higher than that of swap_list.next swap type, we use it. It - * isn't protected by swap_lock, so it can be an invalid value - * if the corresponding swap type is swapoff. We double check - * the flags here. It's even possible the swap type is swapoff - * and swapon again and its priority is changed. In such rare - * case, low prority swap type might be used, but eventually - * high priority swap will be used after several rounds of - * swap. - */ - if (hp_index != -1 && hp_index != type && - swap_info[type]->prio < swap_info[hp_index]->prio && - (swap_info[hp_index]->flags & SWP_WRITEOK)) { - type = hp_index; - swap_list.next = type; - } - - si = swap_info[type]; - next = si->next; - if (next < 0 || - (!wrapped && si->prio != swap_info[next]->prio)) { - next = swap_list.head; - wrapped++; - } - + list_for_each(tmp, &swap_list_head) { + si = list_entry(tmp, typeof(*si), list); spin_lock(&si->lock); - if (!si->highest_bit) { - spin_unlock(&si->lock); - continue; - } - if (!(si->flags & SWP_WRITEOK)) { + if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_unlock(&si->lock); continue; } - swap_list.next = next; + /* + * rotate the current swap_info that we're going to use + * to after any other swap_info that have the same prio, + * so that all equal-priority swap_info get used equally + */ + next = si; + list_for_each_entry_continue(next, &swap_list_head, list) { + if (si->prio != next->prio) + break; + list_rotate_left(&si->list); + next = si; + } spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); spin_unlock(&si->lock); if (offset) - return swp_entry(type, offset); + return swp_entry(si->type, offset); spin_lock(&swap_lock); - next = swap_list.next; + /* + * if we got here, it's likely that si was almost full before, + * and since scan_swap_map() can drop the si->lock, multiple + * callers probably all tried to get a page from the same si + * and it filled up 
before we could get one. So we need to + * try again. Since we dropped the swap_lock, there may now + * be non-full higher priority swap_infos, and this si may have + * even been removed from the list (although very unlikely). + * Let's start over. + */ + tmp = &swap_list_head; } atomic_long_inc(&nr_swap_pages); @@ -766,27 +757,6 @@ out: return NULL; } -/* - * This swap type frees swap entry, check if it is the highest priority swap - * type which just frees swap entry. get_swap_page() uses - * highest_priority_index to search highest priority swap type. The - * swap_info_struct.lock can't protect us if there are multiple swap types - * active, so we use atomic_cmpxchg. - */ -static void set_highest_priority_index(int type) -{ - int old_hp_index, new_hp_index; - - do { - old_hp_index = atomic_read(&highest_priority_index); - if (old_hp_index != -1 && - swap_info[old_hp_index]->prio >= swap_info[type]->prio) - break; - new_hp_index = type; - } while (atomic_cmpxchg(&highest_priority_index, - old_hp_index, new_hp_index) != old_hp_index); -} - static unsigned char swap_entry_free(struct swap_info_struct *p, swp_entry_t entry, unsigned char usage) { @@ -830,7 +800,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, p->lowest_bit = offset; if (offset > p->highest_bit) p->highest_bit = offset; - set_highest_priority_index(p->type); atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); @@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - int i, prev; + struct swap_info_struct *si; if (prio >= 0) p->prio = prio; @@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, atomic_long_add(p->pages, &nr_swap_pages); total_swap_pages += p->pages; - /* insert swap space into swap_list: */ - prev = -1; - for (i = swap_list.head; i >= 0; i = swap_info[i]->next) { - if (p->prio >= 
swap_info[i]->prio) - break; - prev = i; + assert_spin_locked(&swap_lock); + BUG_ON(!list_empty(&p->list)); + /* + * insert into swap list; the list is in priority order, + * so that get_swap_page() can get a page from the highest + * priority swap_info_struct with available page(s), and + * swapoff can adjust the auto-assigned (i.e. negative) prio + * values for any lower-priority swap_info_structs when + * removing a negative-prio swap_info_struct + */ + list_for_each_entry(si, &swap_list_head, list) { + if (p->prio >= si->prio) { + list_add_tail(&p->list, &si->list); + return; + } } - p->next = i; - if (prev < 0) - swap_list.head = swap_list.next = p->type; - else - swap_info[prev]->next = p->type; + /* + * this covers two cases: + * 1) p->prio is less than all existing prio + * 2) the swap list is empty + */ + list_add_tail(&p->list, &swap_list_head); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - int i, type, prev; - int err; + int err, found = 0; unsigned int old_block_size; if (!capable(CAP_SYS_ADMIN)) @@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) goto out; mapping = victim->f_mapping; - prev = -1; spin_lock(&swap_lock); - for (type = swap_list.head; type >= 0; type = swap_info[type]->next) { - p = swap_info[type]; + list_for_each_entry(p, &swap_list_head, list) { if (p->flags & SWP_WRITEOK) { - if (p->swap_file->f_mapping == mapping) + if (p->swap_file->f_mapping == mapping) { + found = 1; break; + } } - prev = type; } - if (type < 0) { + if (!found) { err = -EINVAL; spin_unlock(&swap_lock); goto out_dput; @@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - if (prev < 0) - swap_list.head = p->next; - else - swap_info[prev]->next = p->next; - if (type == 
swap_list.next) { - /* just pick something that's safe... */ - swap_list.next = swap_list.head; - } spin_lock(&p->lock); if (p->prio < 0) { - for (i = p->next; i >= 0; i = swap_info[i]->next) - swap_info[i]->prio = p->prio--; + struct swap_info_struct *si = p; + + list_for_each_entry_continue(si, &swap_list_head, list) { + si->prio++; + } least_priority++; } + list_del_init(&p->list); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); set_current_oom_origin(); - err = try_to_unuse(type, false, 0); /* force all pages to be unused */ + err = try_to_unuse(p->type, false, 0); /* force unuse all pages */ clear_current_oom_origin(); if (err) { @@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) frontswap_map = frontswap_map_get(p); spin_unlock(&p->lock); spin_unlock(&swap_lock); - frontswap_invalidate_area(type); + frontswap_invalidate_area(p->type); frontswap_map_set(p, NULL); mutex_unlock(&swapon_mutex); free_percpu(p->percpu_cluster); @@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) vfree(cluster_info); vfree(frontswap_map); /* Destroy swap account information */ - swap_cgroup_swapoff(type); + swap_cgroup_swapoff(p->type); inode = mapping->host; if (S_ISBLK(inode->i_mode)) { @@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_swap_info(void) */ } INIT_LIST_HEAD(&p->first_swap_extent.list); + INIT_LIST_HEAD(&p->list); p->flags = SWP_USED; - p->next = -1; spin_unlock(&swap_lock); spin_lock_init(&p->lock); -- cgit v0.10.2 From fd16618e12a05df79a3439d72d5ffdac5d34f3da Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:09:55 -0700 Subject: lib/plist: add helper functions Add PLIST_HEAD() to plist.h, equivalent to LIST_HEAD() from list.h, to define and initialize a struct plist_head. 
Add plist_for_each_continue() and plist_for_each_entry_continue(), equivalent to list_for_each_continue() and list_for_each_entry_continue(), to iterate over a plist continuing after the current position. Add plist_prev() and plist_next(), equivalent to (struct list_head*)->prev and ->next, implemented by list_prev_entry() and list_next_entry(), to access the prev/next struct plist_node entry. These are needed because unlike struct list_head, direct access of the prev/next struct plist_node isn't possible; the list must be navigated via the contained struct list_head. e.g. instead of accessing the prev by list_prev_entry(node, node_list) it can be accessed by plist_prev(node). Signed-off-by: Dan Streetman Acked-by: Mel Gorman Cc: Paul Gortmaker Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Shaohua Li Cc: Hugh Dickins Cc: Dan Streetman Cc: Michal Hocko Cc: Christian Ehrhardt Cc: Weijie Yang Cc: Rik van Riel Cc: Johannes Weiner Cc: Bob Liu Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/plist.h b/include/linux/plist.h index aa0fb39..c815491 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -98,6 +98,13 @@ struct plist_node { } /** + * PLIST_HEAD - declare and init plist_head + * @head: name for struct plist_head variable + */ +#define PLIST_HEAD(head) \ + struct plist_head head = PLIST_HEAD_INIT(head) + +/** * PLIST_NODE_INIT - static struct plist_node initializer * @node: struct plist_node variable name * @__prio: initial node priority @@ -143,6 +150,16 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); list_for_each_entry(pos, &(head)->node_list, node_list) /** + * plist_for_each_continue - continue iteration over the plist + * @pos: the type * to use as a loop cursor + * @head: the head for your list + * + * Continue to iterate over plist, continuing after the current position. 
+ */ +#define plist_for_each_continue(pos, head) \ + list_for_each_entry_continue(pos, &(head)->node_list, node_list) + +/** * plist_for_each_safe - iterate safely over a plist of given type * @pos: the type * to use as a loop counter * @n: another type * to use as temporary storage @@ -163,6 +180,18 @@ extern void plist_del(struct plist_node *node, struct plist_head *head); list_for_each_entry(pos, &(head)->node_list, mem.node_list) /** + * plist_for_each_entry_continue - continue iteration over list of given type + * @pos: the type * to use as a loop cursor + * @head: the head for your list + * @m: the name of the list_struct within the struct + * + * Continue to iterate over list of given type, continuing after + * the current position. + */ +#define plist_for_each_entry_continue(pos, head, m) \ + list_for_each_entry_continue(pos, &(head)->node_list, m.node_list) + +/** * plist_for_each_entry_safe - iterate safely over list of given type * @pos: the type * to use as a loop counter * @n: another type * to use as temporary storage @@ -229,6 +258,20 @@ static inline int plist_node_empty(const struct plist_node *node) #endif /** + * plist_next - get the next entry in list + * @pos: the type * to cursor + */ +#define plist_next(pos) \ + list_next_entry(pos, node_list) + +/** + * plist_prev - get the prev entry in list + * @pos: the type * to cursor + */ +#define plist_prev(pos) \ + list_prev_entry(pos, node_list) + +/** * plist_first - return the first node (and thus, highest priority) * @head: the &struct plist_head pointer * -- cgit v0.10.2 From a75f232ce0fe38bd01301899ecd97ffd0254316a Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:09:57 -0700 Subject: lib/plist: add plist_requeue Add plist_requeue(), which moves the specified plist_node after all other same-priority plist_nodes in the list. This is essentially an optimized plist_del() followed by plist_add(). 
This is needed by swap, which (with the next patch in this set) uses a plist of available swap devices. When a swap device (either a swap partition or swap file) is added to the system with swapon(), the device is added to a plist, ordered by the swap device's priority. When swap needs to allocate a page from one of the swap devices, it takes the page from the first swap device on the plist, which is the highest priority swap device. The swap device is left in the plist until all its pages are used, and then removed from the plist when it becomes full. However, as described in man 2 swapon, swap must allocate pages from swap devices with the same priority in round-robin order; to do this, on each swap page allocation, swap uses a page from the first swap device in the plist, and then calls plist_requeue() to move that swap device entry to after any other same-priority swap devices. The next swap page allocation will again use a page from the first swap device in the plist and requeue it, and so on, resulting in round-robin usage of equal-priority swap devices. Also add the plist_test_requeue() test function, for use by plist_test() to test the plist_requeue() function.
Signed-off-by: Dan Streetman Cc: Steven Rostedt Cc: Peter Zijlstra Acked-by: Mel Gorman Cc: Paul Gortmaker Cc: Thomas Gleixner Cc: Shaohua Li Cc: Hugh Dickins Cc: Dan Streetman Cc: Michal Hocko Cc: Christian Ehrhardt Cc: Weijie Yang Cc: Rik van Riel Cc: Johannes Weiner Cc: Bob Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/plist.h b/include/linux/plist.h index c815491..8b6c970 100644 --- a/include/linux/plist.h +++ b/include/linux/plist.h @@ -141,6 +141,8 @@ static inline void plist_node_init(struct plist_node *node, int prio) extern void plist_add(struct plist_node *node, struct plist_head *head); extern void plist_del(struct plist_node *node, struct plist_head *head); +extern void plist_requeue(struct plist_node *node, struct plist_head *head); + /** * plist_for_each - iterate over the plist * @pos: the type * to use as a loop counter diff --git a/lib/plist.c b/lib/plist.c index 1ebc95f..0f2084d 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -134,6 +134,46 @@ void plist_del(struct plist_node *node, struct plist_head *head) plist_check_head(head); } +/** + * plist_requeue - Requeue @node at end of same-prio entries. + * + * This is essentially an optimized plist_del() followed by + * plist_add(). It moves an entry already in the plist to + * after any other same-priority entries. 
+ * + * @node: &struct plist_node pointer - entry to be moved + * @head: &struct plist_head pointer - list head + */ +void plist_requeue(struct plist_node *node, struct plist_head *head) +{ + struct plist_node *iter; + struct list_head *node_next = &head->node_list; + + plist_check_head(head); + BUG_ON(plist_head_empty(head)); + BUG_ON(plist_node_empty(node)); + + if (node == plist_last(head)) + return; + + iter = plist_next(node); + + if (node->prio != iter->prio) + return; + + plist_del(node, head); + + plist_for_each_continue(iter, head) { + if (node->prio != iter->prio) { + node_next = &iter->node_list; + break; + } + } + list_add_tail(&node->node_list, node_next); + + plist_check_head(head); +} + #ifdef CONFIG_DEBUG_PI_LIST #include #include @@ -170,6 +210,14 @@ static void __init plist_test_check(int nr_expect) BUG_ON(prio_pos->prio_list.next != &first->prio_list); } +static void __init plist_test_requeue(struct plist_node *node) +{ + plist_requeue(node, &test_head); + + if (node != plist_last(&test_head)) + BUG_ON(node->prio == plist_next(node)->prio); +} + static int __init plist_test(void) { int nr_expect = 0, i, loop; @@ -193,6 +241,10 @@ static int __init plist_test(void) nr_expect--; } plist_test_check(nr_expect); + if (!plist_node_empty(test_node + i)) { + plist_test_requeue(test_node + i); + plist_test_check(nr_expect); + } } for (i = 0; i < ARRAY_SIZE(test_node); i++) { -- cgit v0.10.2 From 18ab4d4ced0817421e6db6940374cc39d28d65da Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:09:59 -0700 Subject: swap: change swap_list_head to plist, add swap_avail_head Originally get_swap_page() started iterating through the singly-linked list of swap_info_structs using swap_list.next or highest_priority_index, which both were intended to point to the highest priority active swap target that was not full. 
The first patch in this series changed the singly-linked list to a doubly-linked list, and removed the logic to start at the highest priority non-full entry; it starts scanning at the highest priority entry each time, even if the entry is full. Replace the manually ordered swap_list_head with a plist, swap_active_head. Add a new plist, swap_avail_head. The original swap_active_head plist contains all active swap_info_structs, as before, while the new swap_avail_head plist contains only swap_info_structs that are active and available, i.e. not full. Add a new spinlock, swap_avail_lock, to protect the swap_avail_head list. Mel Gorman suggested using plists since they internally handle ordering the list entries based on priority, which is exactly what swap was doing manually. All the ordering code is now removed, and swap_info_struct entries are simply added to their corresponding plist and automatically ordered correctly. Using a new plist for available swap_info_structs simplifies and optimizes get_swap_page(), which no longer has to iterate over full swap_info_structs. Using a new spinlock for swap_avail_head plist allows each swap_info_struct to add or remove themselves from the plist when they become full or not-full; previously they could not do so because the swap_info_struct->lock is held when they change from full<->not-full, and the swap_lock protecting the main swap_active_head must be ordered before any swap_info_struct->lock.
Signed-off-by: Dan Streetman Acked-by: Mel Gorman Cc: Shaohua Li Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Hugh Dickins Cc: Dan Streetman Cc: Michal Hocko Cc: Christian Ehrhardt Cc: Weijie Yang Cc: Rik van Riel Cc: Johannes Weiner Cc: Bob Liu Cc: Paul Gortmaker Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index 8bb85d6d..9155bcd 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -214,7 +214,8 @@ struct percpu_cluster { struct swap_info_struct { unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ - struct list_head list; /* entry in swap list */ + struct plist_node list; /* entry in swap_active_head */ + struct plist_node avail_list; /* entry in swap_avail_head */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h index 2eab382..388293a 100644 --- a/include/linux/swapfile.h +++ b/include/linux/swapfile.h @@ -6,7 +6,7 @@ * want to expose them to the dozens of source files that include swap.h */ extern spinlock_t swap_lock; -extern struct list_head swap_list_head; +extern struct plist_head swap_active_head; extern struct swap_info_struct *swap_info[]; extern int try_to_unuse(unsigned int, bool, unsigned long); diff --git a/mm/frontswap.c b/mm/frontswap.c index fae1160..c30eec5 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -331,7 +331,7 @@ static unsigned long __frontswap_curr_pages(void) struct swap_info_struct *si = NULL; assert_spin_locked(&swap_lock); - list_for_each_entry(si, &swap_list_head, list) + plist_for_each_entry(si, &swap_active_head, list) totalpages += atomic_read(&si->frontswap_pages); return totalpages; } @@ -346,7 +346,7 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused, unsigned long pages 
= 0, pages_to_unuse = 0; assert_spin_locked(&swap_lock); - list_for_each_entry(si, &swap_list_head, list) { + plist_for_each_entry(si, &swap_active_head, list) { si_frontswap_pages = atomic_read(&si->frontswap_pages); if (total_pages_to_unuse < si_frontswap_pages) { pages = pages_to_unuse = total_pages_to_unuse; @@ -408,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages) /* * we don't want to hold swap_lock while doing a very * lengthy try_to_unuse, but swap_list may change - * so restart scan from swap_list_head each time + * so restart scan from swap_active_head each time */ spin_lock(&swap_lock); ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type); diff --git a/mm/swapfile.c b/mm/swapfile.c index 6c95a8c..beeeef8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -61,7 +61,22 @@ static const char Unused_offset[] = "Unused swap offset entry "; * all active swap_info_structs * protected with swap_lock, and ordered by priority. */ -LIST_HEAD(swap_list_head); +PLIST_HEAD(swap_active_head); + +/* + * all available (active, not full) swap_info_structs + * protected with swap_avail_lock, ordered by priority. + * This is used by get_swap_page() instead of swap_active_head + * because swap_active_head includes all swap_info_structs, + * but get_swap_page() doesn't need to look at full ones. + * This uses its own lock instead of swap_lock because when a + * swap_info_struct changes between not-full/full, it needs to + * add/remove itself to/from this list, but the swap_info_struct->lock + * is held and the locking order requires swap_lock to be taken + * before any swap_info_struct->lock. 
+ */ +static PLIST_HEAD(swap_avail_head); +static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -594,6 +609,9 @@ checks: if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; + spin_lock(&swap_avail_lock); + plist_del(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } si->swap_map[offset] = usage; inc_cluster_info_page(si, si->cluster_info, offset); @@ -645,57 +663,63 @@ swp_entry_t get_swap_page(void) { struct swap_info_struct *si, *next; pgoff_t offset; - struct list_head *tmp; - spin_lock(&swap_lock); if (atomic_long_read(&nr_swap_pages) <= 0) goto noswap; atomic_long_dec(&nr_swap_pages); - list_for_each(tmp, &swap_list_head) { - si = list_entry(tmp, typeof(*si), list); + spin_lock(&swap_avail_lock); + +start_over: + plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + /* requeue si to after same-priority siblings */ + plist_requeue(&si->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + if (plist_node_empty(&si->avail_list)) { + spin_unlock(&si->lock); + goto nextsi; + } + WARN(!si->highest_bit, + "swap_info %d in list but !highest_bit\n", + si->type); + WARN(!(si->flags & SWP_WRITEOK), + "swap_info %d in list but !SWP_WRITEOK\n", + si->type); + plist_del(&si->avail_list, &swap_avail_head); spin_unlock(&si->lock); - continue; + goto nextsi; } - /* - * rotate the current swap_info that we're going to use - * to after any other swap_info that have the same prio, - * so that all equal-priority swap_info get used equally - */ - next = si; - list_for_each_entry_continue(next, &swap_list_head, list) { - if (si->prio != next->prio) - break; - list_rotate_left(&si->list); - next = si; - } - - spin_unlock(&swap_lock); /* This is called for allocating swap entry for cache */ offset = scan_swap_map(si, SWAP_HAS_CACHE); spin_unlock(&si->lock); if 
(offset) return swp_entry(si->type, offset); - spin_lock(&swap_lock); + pr_debug("scan_swap_map of si %d failed to find offset\n", + si->type); + spin_lock(&swap_avail_lock); +nextsi: /* * if we got here, it's likely that si was almost full before, * and since scan_swap_map() can drop the si->lock, multiple * callers probably all tried to get a page from the same si - * and it filled up before we could get one. So we need to - * try again. Since we dropped the swap_lock, there may now - * be non-full higher priority swap_infos, and this si may have - * even been removed from the list (although very unlikely). - * Let's start over. + * and it filled up before we could get one; or, the si filled + * up between us dropping swap_avail_lock and taking si->lock. + * Since we dropped the swap_avail_lock, the swap_avail_head + * list may have been modified; so if next is still in the + * swap_avail_head list then try it, otherwise start over. */ - tmp = &swap_list_head; + if (plist_node_empty(&next->avail_list)) + goto start_over; } + spin_unlock(&swap_avail_lock); + atomic_long_inc(&nr_swap_pages); noswap: - spin_unlock(&swap_lock); return (swp_entry_t) {0}; } @@ -798,8 +822,18 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, dec_cluster_info_page(p, p->cluster_info, offset); if (offset < p->lowest_bit) p->lowest_bit = offset; - if (offset > p->highest_bit) + if (offset > p->highest_bit) { + bool was_full = !p->highest_bit; p->highest_bit = offset; + if (was_full && (p->flags & SWP_WRITEOK)) { + spin_lock(&swap_avail_lock); + WARN_ON(!plist_node_empty(&p->avail_list)); + if (plist_node_empty(&p->avail_list)) + plist_add(&p->avail_list, + &swap_avail_head); + spin_unlock(&swap_avail_lock); + } + } atomic_long_inc(&nr_swap_pages); p->inuse_pages--; frontswap_invalidate_page(p->type, offset); @@ -1734,12 +1768,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { - 
struct swap_info_struct *si; - if (prio >= 0) p->prio = prio; else p->prio = --least_priority; + /* + * the plist prio is negated because plist ordering is + * low-to-high, while swap ordering is high-to-low + */ + p->list.prio = -p->prio; + p->avail_list.prio = -p->prio; p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; @@ -1747,27 +1785,20 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, total_swap_pages += p->pages; assert_spin_locked(&swap_lock); - BUG_ON(!list_empty(&p->list)); - /* - * insert into swap list; the list is in priority order, - * so that get_swap_page() can get a page from the highest - * priority swap_info_struct with available page(s), and - * swapoff can adjust the auto-assigned (i.e. negative) prio - * values for any lower-priority swap_info_structs when - * removing a negative-prio swap_info_struct - */ - list_for_each_entry(si, &swap_list_head, list) { - if (p->prio >= si->prio) { - list_add_tail(&p->list, &si->list); - return; - } - } /* - * this covers two cases: - * 1) p->prio is less than all existing prio - * 2) the swap list is empty + * both lists are plists, and thus priority ordered. + * swap_active_head needs to be priority ordered for swapoff(), + * which on removal of any swap_info_struct with an auto-assigned + * (i.e. negative) priority increments the auto-assigned priority + * of any lower-priority swap_info_structs. + * swap_avail_head needs to be priority ordered for get_swap_page(), + * which allocates swap pages from the highest available priority + * swap_info_struct. 
*/ - list_add_tail(&p->list, &swap_list_head); + plist_add(&p->list, &swap_active_head); + spin_lock(&swap_avail_lock); + plist_add(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1821,7 +1852,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) mapping = victim->f_mapping; spin_lock(&swap_lock); - list_for_each_entry(p, &swap_list_head, list) { + plist_for_each_entry(p, &swap_active_head, list) { if (p->flags & SWP_WRITEOK) { if (p->swap_file->f_mapping == mapping) { found = 1; @@ -1841,16 +1872,21 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } + spin_lock(&swap_avail_lock); + plist_del(&p->avail_list, &swap_avail_head); + spin_unlock(&swap_avail_lock); spin_lock(&p->lock); if (p->prio < 0) { struct swap_info_struct *si = p; - list_for_each_entry_continue(si, &swap_list_head, list) { + plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; + si->list.prio--; + si->avail_list.prio--; } least_priority++; } - list_del_init(&p->list); + plist_del(&p->list, &swap_active_head); atomic_long_sub(p->pages, &nr_swap_pages); total_swap_pages -= p->pages; p->flags &= ~SWP_WRITEOK; @@ -2115,7 +2151,8 @@ static struct swap_info_struct *alloc_swap_info(void) */ } INIT_LIST_HEAD(&p->first_swap_extent.list); - INIT_LIST_HEAD(&p->list); + plist_node_init(&p->list, 0); + plist_node_init(&p->avail_list, 0); p->flags = SWP_USED; spin_unlock(&swap_lock); spin_lock_init(&p->lock); -- cgit v0.10.2 From fe54b1fd49b712cd94a1846e993a515fc9394dcb Mon Sep 17 00:00:00 2001 From: Marc Carino Date: Wed, 4 Jun 2014 16:10:01 -0700 Subject: cma: increase CMA_ALIGNMENT upper limit to 12 Some systems require a larger maximum PAGE_SIZE order for CMA allocations. To accommodate such systems, increase the upper-bound of the CMA_ALIGNMENT range to 12 (which ends up being 16MB on systems with 4K pages). 
Signed-off-by: Marc Carino Cc: Marek Szyprowski Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 4b7b452..23b8726 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -258,7 +258,7 @@ endchoice config CMA_ALIGNMENT int "Maximum PAGE_SIZE order of alignment for contiguous buffers" - range 4 9 + range 4 12 default 8 help DMA mapping framework by default aligns all buffers to the smallest -- cgit v0.10.2 From 172cb4b3d49a1339dd67ee05e3f47972a70f556f Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Jun 2014 16:10:02 -0700 Subject: mm/dmapool.c: reuse devres_release() to free resources Instead of calling an additional routine in dmam_pool_destroy() rely on what dmam_pool_release() is doing. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/dmapool.c b/mm/dmapool.c index a3a1bfe..306baa5 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -500,7 +500,6 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); - dma_pool_destroy(pool); + WARN_ON(devres_release(dev, dmam_pool_release, dmam_pool_match, pool)); } EXPORT_SYMBOL(dmam_pool_destroy); -- cgit v0.10.2 From 776ed0f0377914d1e65fed903c052e9eef3f4cc3 Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:10:02 -0700 Subject: memcg: cleanup kmem cache creation/destruction functions naming Current names are rather inconsistent. Let's try to improve them. 
Brief change log: ** old name ** ** new name ** kmem_cache_create_memcg memcg_create_kmem_cache memcg_kmem_create_cache memcg_register_cache memcg_kmem_destroy_cache memcg_unregister_cache kmem_cache_destroy_memcg_children memcg_cleanup_cache_params mem_cgroup_destroy_all_caches memcg_unregister_all_caches create_work memcg_register_cache_work memcg_create_cache_work_func memcg_register_cache_func memcg_create_cache_enqueue memcg_schedule_register_cache Signed-off-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dfc2929..eb65d29 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -505,7 +505,7 @@ __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp); int __memcg_charge_slab(struct kmem_cache *cachep, gfp_t gfp, int order); void __memcg_uncharge_slab(struct kmem_cache *cachep, int order); -int __kmem_cache_destroy_memcg_children(struct kmem_cache *s); +int __memcg_cleanup_cache_params(struct kmem_cache *s); /** * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
diff --git a/include/linux/slab.h b/include/linux/slab.h index 86e5b26..1d9abb7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -116,7 +116,7 @@ struct kmem_cache *kmem_cache_create(const char *, size_t, size_t, unsigned long, void (*)(void *)); #ifdef CONFIG_MEMCG_KMEM -struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *, +struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *, const char *); #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5e2bfcc..d176edb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3132,8 +3132,8 @@ void memcg_free_cache_params(struct kmem_cache *s) kfree(s->memcg_params); } -static void memcg_kmem_create_cache(struct mem_cgroup *memcg, - struct kmem_cache *root_cache) +static void memcg_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *root_cache) { static char memcg_name_buf[NAME_MAX + 1]; /* protected by memcg_slab_mutex */ @@ -3153,7 +3153,7 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, return; cgroup_name(memcg->css.cgroup, memcg_name_buf, NAME_MAX + 1); - cachep = kmem_cache_create_memcg(memcg, root_cache, memcg_name_buf); + cachep = memcg_create_kmem_cache(memcg, root_cache, memcg_name_buf); /* * If we could not create a memcg cache, do not complain, because * that's not critical at all as we can always proceed with the root @@ -3175,7 +3175,7 @@ static void memcg_kmem_create_cache(struct mem_cgroup *memcg, root_cache->memcg_params->memcg_caches[id] = cachep; } -static void memcg_kmem_destroy_cache(struct kmem_cache *cachep) +static void memcg_unregister_cache(struct kmem_cache *cachep) { struct kmem_cache *root_cache; struct mem_cgroup *memcg; @@ -3228,7 +3228,7 @@ static inline void memcg_resume_kmem_account(void) current->memcg_kmem_skip_account--; } -int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) +int __memcg_cleanup_cache_params(struct kmem_cache *s) { struct kmem_cache *c; int i, failed = 0; @@ -3239,7 +3239,7 
@@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) if (!c) continue; - memcg_kmem_destroy_cache(c); + memcg_unregister_cache(c); if (cache_from_memcg_idx(s, i)) failed++; @@ -3248,7 +3248,7 @@ int __kmem_cache_destroy_memcg_children(struct kmem_cache *s) return failed; } -static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static void memcg_unregister_all_caches(struct mem_cgroup *memcg) { struct kmem_cache *cachep; struct memcg_cache_params *params, *tmp; @@ -3261,25 +3261,26 @@ static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) cachep = memcg_params_to_cache(params); kmem_cache_shrink(cachep); if (atomic_read(&cachep->memcg_params->nr_pages) == 0) - memcg_kmem_destroy_cache(cachep); + memcg_unregister_cache(cachep); } mutex_unlock(&memcg_slab_mutex); } -struct create_work { +struct memcg_register_cache_work { struct mem_cgroup *memcg; struct kmem_cache *cachep; struct work_struct work; }; -static void memcg_create_cache_work_func(struct work_struct *w) +static void memcg_register_cache_func(struct work_struct *w) { - struct create_work *cw = container_of(w, struct create_work, work); + struct memcg_register_cache_work *cw = + container_of(w, struct memcg_register_cache_work, work); struct mem_cgroup *memcg = cw->memcg; struct kmem_cache *cachep = cw->cachep; mutex_lock(&memcg_slab_mutex); - memcg_kmem_create_cache(memcg, cachep); + memcg_register_cache(memcg, cachep); mutex_unlock(&memcg_slab_mutex); css_put(&memcg->css); @@ -3289,12 +3290,12 @@ static void memcg_create_cache_work_func(struct work_struct *w) /* * Enqueue the creation of a per-memcg kmem_cache. 
*/ -static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void __memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { - struct create_work *cw; + struct memcg_register_cache_work *cw; - cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); + cw = kmalloc(sizeof(*cw), GFP_NOWAIT); if (cw == NULL) { css_put(&memcg->css); return; @@ -3303,17 +3304,17 @@ static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, cw->memcg = memcg; cw->cachep = cachep; - INIT_WORK(&cw->work, memcg_create_cache_work_func); + INIT_WORK(&cw->work, memcg_register_cache_func); schedule_work(&cw->work); } -static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, - struct kmem_cache *cachep) +static void memcg_schedule_register_cache(struct mem_cgroup *memcg, + struct kmem_cache *cachep) { /* * We need to stop accounting when we kmalloc, because if the * corresponding kmalloc cache is not yet created, the first allocation - * in __memcg_create_cache_enqueue will recurse. + * in __memcg_schedule_register_cache will recurse. * * However, it is better to enclose the whole function. Depending on * the debugging options enabled, INIT_WORK(), for instance, can @@ -3322,7 +3323,7 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, * the safest choice is to do it like this, wrapping the whole function. */ memcg_stop_kmem_account(); - __memcg_create_cache_enqueue(memcg, cachep); + __memcg_schedule_register_cache(memcg, cachep); memcg_resume_kmem_account(); } @@ -3393,16 +3394,11 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, * * However, there are some clashes that can arrive from locking. * For instance, because we acquire the slab_mutex while doing - * kmem_cache_dup, this means no further allocation could happen - * with the slab_mutex held. 
- * - * Also, because cache creation issue get_online_cpus(), this - * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, - * that ends up reversed during cpu hotplug. (cpuset allocates - * a bunch of GFP_KERNEL memory during cpuup). Due to all that, - * better to defer everything. + * memcg_create_kmem_cache, this means no further allocation + * could happen with the slab_mutex held. So it's better to + * defer everything. */ - memcg_create_cache_enqueue(memcg, cachep); + memcg_schedule_register_cache(memcg, cachep); return cachep; out: rcu_read_unlock(); @@ -3526,7 +3522,7 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order) memcg_uncharge_kmem(memcg, PAGE_SIZE << order); } #else -static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) +static inline void memcg_unregister_all_caches(struct mem_cgroup *memcg) { } #endif /* CONFIG_MEMCG_KMEM */ @@ -6372,7 +6368,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) css_for_each_descendant_post(iter, css) mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); - mem_cgroup_destroy_all_caches(memcg); + memcg_unregister_all_caches(memcg); vmpressure_cleanup(&memcg->vmpressure); } diff --git a/mm/slab_common.c b/mm/slab_common.c index 3217561..48fafb6 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -261,7 +261,7 @@ EXPORT_SYMBOL(kmem_cache_create); #ifdef CONFIG_MEMCG_KMEM /* - * kmem_cache_create_memcg - Create a cache for a memory cgroup. + * memcg_create_kmem_cache - Create a cache for a memory cgroup. * @memcg: The memory cgroup the new cache is for. * @root_cache: The parent of the new cache. * @memcg_name: The name of the memory cgroup (used for naming the new cache). @@ -270,7 +270,7 @@ EXPORT_SYMBOL(kmem_cache_create); * requests going from @memcg to @root_cache. The new cache inherits properties * from its parent. 
*/ -struct kmem_cache *kmem_cache_create_memcg(struct mem_cgroup *memcg, +struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, struct kmem_cache *root_cache, const char *memcg_name) { @@ -305,7 +305,7 @@ out_unlock: return s; } -static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +static int memcg_cleanup_cache_params(struct kmem_cache *s) { int rc; @@ -314,13 +314,13 @@ static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) return 0; mutex_unlock(&slab_mutex); - rc = __kmem_cache_destroy_memcg_children(s); + rc = __memcg_cleanup_cache_params(s); mutex_lock(&slab_mutex); return rc; } #else -static int kmem_cache_destroy_memcg_children(struct kmem_cache *s) +static int memcg_cleanup_cache_params(struct kmem_cache *s) { return 0; } @@ -343,7 +343,7 @@ void kmem_cache_destroy(struct kmem_cache *s) if (s->refcount) goto out_unlock; - if (kmem_cache_destroy_memcg_children(s) != 0) + if (memcg_cleanup_cache_params(s) != 0) goto out_unlock; list_del(&s->list); -- cgit v0.10.2 From 0bd62b1190607e4f1b3c2927ba48672a1cf2a83d Mon Sep 17 00:00:00 2001 From: Vladimir Davydov Date: Wed, 4 Jun 2014 16:10:03 -0700 Subject: slab: delete cache from list after __kmem_cache_shutdown succeeds Currently, on kmem_cache_destroy we delete the cache from the slab_list before __kmem_cache_shutdown, inserting it back to the list on failure. Initially, this was done, because we could release the slab_mutex in __kmem_cache_shutdown to delete sysfs slub entry, but since commit 41a212859a4d ("slub: use sysfs'es release mechanism for kmem_cache") we remove sysfs entry later in kmem_cache_destroy after dropping the slab_mutex, so that no implementation of __kmem_cache_shutdown can ever release the lock. Therefore we can simplify the code a bit by moving list_del after __kmem_cache_shutdown. 
Signed-off-by: Vladimir Davydov Cc: Christoph Lameter Cc: Pekka Enberg Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/slab_common.c b/mm/slab_common.c index 48fafb6..735e01a 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -346,15 +346,15 @@ void kmem_cache_destroy(struct kmem_cache *s) if (memcg_cleanup_cache_params(s) != 0) goto out_unlock; - list_del(&s->list); if (__kmem_cache_shutdown(s) != 0) { - list_add(&s->list, &slab_caches); printk(KERN_ERR "kmem_cache_destroy %s: " "Slab cache still has objects\n", s->name); dump_stack(); goto out_unlock; } + list_del(&s->list); + mutex_unlock(&slab_mutex); if (s->flags & SLAB_DESTROY_BY_RCU) rcu_barrier(); -- cgit v0.10.2 From 65bb371984d6a2c909244eb749e482bb40b72e36 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:05 -0700 Subject: mm: page_alloc: do not update zlc unless the zlc is active The zlc is used on NUMA machines to quickly skip over zones that are full. However it is always updated, even for the first zone scanned when the zlc might not even be active. As it's a write to a bitmap that potentially bounces cache line it's deceptively expensive and most machines will not care. Only update the zlc if it was active. 
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d88d675..8e76624 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2035,7 +2035,7 @@ try_this_zone: if (page) break; this_zone_full: - if (IS_ENABLED(CONFIG_NUMA)) + if (IS_ENABLED(CONFIG_NUMA) && zlc_active) zlc_mark_zone_full(zonelist, z); } -- cgit v0.10.2 From 800a1e750c7b04c2aa2459afca77e936e01c0029 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:06 -0700 Subject: mm: page_alloc: do not treat a zone that cannot be used for dirty pages as "full" If a zone cannot be used for a dirty page then it gets marked "full" which is cached in the zlc and later potentially skipped by allocation requests that have nothing to do with dirty zones. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8e76624..b4381ea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1967,7 +1967,7 @@ zonelist_scan: */ if ((alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) - goto this_zone_full; + continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; if (!zone_watermark_ok(zone, order, mark, -- cgit v0.10.2 From ea5e9539abf1258f23e725cb9cb25aa74efa29eb Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:07 -0700 Subject: include/linux/jump_label.h: expose the reference count This patch exposes the jump_label reference count in preparation for the next patch. cpusets cares about both the jump_label being enabled and how many users of the cpusets there currently are. Signed-off-by: Peter Zijlstra Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 5c1dfb2..784304b 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -69,6 +69,10 @@ struct static_key { # include # define HAVE_JUMP_LABEL +#else +struct static_key { + atomic_t enabled; +}; #endif /* CC_HAVE_ASM_GOTO && CONFIG_JUMP_LABEL */ enum jump_label_type { @@ -79,6 +83,12 @@ enum jump_label_type { struct module; #include + +static inline int static_key_count(struct static_key *key) +{ + return atomic_read(&key->enabled); +} + #ifdef HAVE_JUMP_LABEL #define JUMP_LABEL_TYPE_FALSE_BRANCH 0UL @@ -134,10 +144,6 @@ extern void jump_label_apply_nops(struct module *mod); #else /* !HAVE_JUMP_LABEL */ -struct static_key { - atomic_t enabled; -}; - static __always_inline void jump_label_init(void) { static_key_initialized = true; @@ -145,14 +151,14 @@ static __always_inline void jump_label_init(void) static __always_inline bool static_key_false(struct static_key *key) { - if (unlikely(atomic_read(&key->enabled) > 0)) + if (unlikely(static_key_count(key) > 0)) return true; return false; } static __always_inline bool static_key_true(struct static_key *key) { - if (likely(atomic_read(&key->enabled) > 0)) + if (likely(static_key_count(key) > 0)) return true; return false; } @@ -194,7 +200,7 @@ static inline int jump_label_apply_nops(struct module *mod) static inline bool static_key_enabled(struct static_key *key) { - return (atomic_read(&key->enabled) > 0); + return static_key_count(key) > 0; } #endif /* _LINUX_JUMP_LABEL_H */ -- cgit v0.10.2 From 664eeddeef6539247691197c1ac124d4aa872ab6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:08 -0700 Subject: mm: page_alloc: use jump labels to avoid checking number_of_cpusets If cpusets are not in use then we still check a global variable on every page allocation. Use jump labels to avoid the overhead. 
Signed-off-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index b19d3dc..ade2390 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -12,10 +12,31 @@ #include #include #include +#include #ifdef CONFIG_CPUSETS -extern int number_of_cpusets; /* How many cpusets are defined in system? */ +extern struct static_key cpusets_enabled_key; +static inline bool cpusets_enabled(void) +{ + return static_key_false(&cpusets_enabled_key); +} + +static inline int nr_cpusets(void) +{ + /* jump label reference count + the top-level cpuset */ + return static_key_count(&cpusets_enabled_key) + 1; +} + +static inline void cpuset_inc(void) +{ + static_key_slow_inc(&cpusets_enabled_key); +} + +static inline void cpuset_dec(void) +{ + static_key_slow_dec(&cpusets_enabled_key); +} extern int cpuset_init(void); extern void cpuset_init_smp(void); @@ -32,13 +53,13 @@ extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask); static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || + return nr_cpusets() <= 1 || __cpuset_node_allowed_softwall(node, gfp_mask); } static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask) { - return number_of_cpusets <= 1 || + return nr_cpusets() <= 1 || __cpuset_node_allowed_hardwall(node, gfp_mask); } @@ -124,6 +145,8 @@ static inline void set_mems_allowed(nodemask_t nodemask) #else /* !CONFIG_CPUSETS */ +static inline bool cpusets_enabled(void) { return false; } + static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3d54c41..1300178 100644 --- a/kernel/cpuset.c +++ 
b/kernel/cpuset.c @@ -61,12 +61,7 @@ #include #include -/* - * Tracks how many cpusets are currently defined in system. - * When there is only one cpuset (the root cpuset) we can - * short circuit some hooks. - */ -int number_of_cpusets __read_mostly; +struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; /* See "Frequency meter" comments, below. */ @@ -611,7 +606,7 @@ static int generate_sched_domains(cpumask_var_t **domains, goto done; } - csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); + csa = kmalloc(nr_cpusets() * sizeof(cp), GFP_KERNEL); if (!csa) goto done; csn = 0; @@ -1888,7 +1883,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) if (is_spread_slab(parent)) set_bit(CS_SPREAD_SLAB, &cs->flags); - number_of_cpusets++; + cpuset_inc(); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -1939,7 +1934,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css) if (is_sched_load_balance(cs)) update_flag(CS_SCHED_LOAD_BALANCE, cs, 0); - number_of_cpusets--; + cpuset_dec(); clear_bit(CS_ONLINE, &cs->flags); mutex_unlock(&cpuset_mutex); @@ -1992,7 +1987,6 @@ int __init cpuset_init(void) if (!alloc_cpumask_var(&cpus_attach, GFP_KERNEL)) BUG(); - number_of_cpusets = 1; return 0; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b4381ea..a2955e1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1921,7 +1921,8 @@ zonelist_scan: if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; - if ((alloc_flags & ALLOC_CPUSET) && + if (cpusets_enabled() && + (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); -- cgit v0.10.2 From d34c5fa06fade08a689fc171bf756fba2858ae73 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:10 -0700 Subject: mm: page_alloc: only check the zone id check if pages are buddies A node/zone index is used 
to check if pages are compatible for merging but this happens unconditionally even if the buddy page is not free. Defer the calculation as long as possible. Ideally we would check the zone boundary but nodes can overlap. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a2955e1..da52690 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -509,16 +509,26 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (!pfn_valid_within(page_to_pfn(buddy))) return 0; - if (page_zone_id(page) != page_zone_id(buddy)) - return 0; - if (page_is_guard(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON_PAGE(page_count(buddy) != 0, buddy); + + /* + * zone check is done late to avoid uselessly + * calculating zone/node ids for pages that could + * never merge. + */ + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + return 1; } return 0; -- cgit v0.10.2 From a6e21b14f22041382e832d30deda6f26f37b1097 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:12 -0700 Subject: mm: page_alloc: only check the alloc flags and gfp_mask for dirty once Currently it's calculated once per zone in the zonelist. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index da52690..30f327a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1917,6 +1917,8 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ + bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE); classzone_idx = zone_idx(preferred_zone); zonelist_scan: @@ -1976,8 +1978,7 @@ zonelist_scan: * will require awareness of zones in the * dirty-throttling and the flusher threads. */ - if ((alloc_flags & ALLOC_WMARK_LOW) && - (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) + if (consider_zone_dirty && !zone_dirty_ok(zone)) continue; mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; -- cgit v0.10.2 From 5dab29113ca56335c78be3f98bf5ddf2ef8eb6a6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:14 -0700 Subject: mm: page_alloc: take the ALLOC_NO_WATERMARK check out of the fast path ALLOC_NO_WATERMARK is set in a few cases. Always by kswapd, always for __GFP_MEMALLOC, sometimes for swap-over-nfs, tasks etc. Each of these cases are relatively rare events but the ALLOC_NO_WATERMARK check is an unlikely branch in the fast path. This patch moves the check out of the fast path and after it has been determined that the watermarks have not been met. This helps the common fast path at the cost of making the slow path slower and hitting kswapd with a performance cost. It's a reasonable tradeoff. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Reviewed-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 30f327a..485932c57 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1937,9 +1937,6 @@ zonelist_scan: (alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; - BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); - if (unlikely(alloc_flags & ALLOC_NO_WATERMARKS)) - goto try_this_zone; /* * Distribute pages in proportion to the individual * zone size to ensure fair page aging. The zone a @@ -1986,6 +1983,11 @@ zonelist_scan: classzone_idx, alloc_flags)) { int ret; + /* Checked here to keep the fast path fast */ + BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); + if (alloc_flags & ALLOC_NO_WATERMARKS) + goto try_this_zone; + if (IS_ENABLED(CONFIG_NUMA) && !did_zlc_setup && nr_online_nodes > 1) { /* -- cgit v0.10.2 From e58469bafd0524e848c3733bc3918d854595e20f Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:16 -0700 Subject: mm: page_alloc: use word-based accesses for get/set pageblock bitmaps The test_bit operations in get/set pageblock flags are expensive. This patch reads the bitmap on a word basis and use shifts and masks to isolate the bits of interest. Similarly masks are used to set a local copy of the bitmap and then use cmpxchg to update the bitmap if there have been no other changes made in parallel. In a test running dd onto tmpfs the overhead of the pageblock-related functions went from 1.27% in profiles to 0.5%. In addition to the performance benefits, this patch closes races that are possible between: a) get_ and set_pageblock_migratetype(), where get_pageblock_migratetype() reads part of the bits before and other part of the bits after set_pageblock_migratetype() has updated them. 
b) set_pageblock_migratetype() and set_pageblock_skip(), where the non-atomic read-modify-update set bit operation in set_pageblock_skip() will cause lost updates to some bits changed in the set_pageblock_migratetype(). Joonsoo Kim first reported the case a) via code inspection. Vlastimil Babka's testing with a debug patch showed that either a) or b) occurs roughly once per mmtests' stress-highalloc benchmark (although not necessarily in the same pageblock). Furthermore, during development of unrelated compaction patches, it was observed that with frequent calls to {start,undo}_isolate_page_range() the race occurs several thousands of times and has resulted in NULL pointer dereferences in move_freepages() and free_one_page() in places where free_list[migratetype] is manipulated by e.g. list_move(). Further debugging confirmed that migratetype had an invalid value of 6, causing out of bounds access to the free_list array. That confirmed that the race exists, although it may be extremely rare, and currently only fatal where page isolation is performed due to memory hot remove. Races on pageblocks being updated by set_pageblock_migratetype(), where both old and new migratetype are lower than MIGRATE_RESERVE, currently cannot result in an invalid value being observed, although theoretically they may still lead to unexpected creation or destruction of MIGRATE_RESERVE pageblocks. Furthermore, things could get suddenly worse when memory isolation is used more, or when new migratetypes are added. After this patch, the race has no longer been observed in testing. Signed-off-by: Mel Gorman Acked-by: Vlastimil Babka Reported-by: Joonsoo Kim Reported-and-tested-by: Vlastimil Babka Cc: Johannes Weiner Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Cc: Peter Zijlstra Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 10a96ee..8ef1e3f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -75,9 +75,13 @@ enum { extern int page_group_by_mobility_disabled; +#define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1) +#define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1) + static inline int get_pageblock_migratetype(struct page *page) { - return get_pageblock_flags_group(page, PB_migrate, PB_migrate_end); + BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); + return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK); } struct free_area { diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 2ee8cd2..c08730c 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -30,9 +30,12 @@ enum pageblock_bits { PB_migrate, PB_migrate_end = PB_migrate + 3 - 1, /* 3 bits required for migrate types */ -#ifdef CONFIG_COMPACTION PB_migrate_skip,/* If set the block is skipped by compaction */ -#endif /* CONFIG_COMPACTION */ + + /* + * Assume the bits will always align on a word. If this assumption + * changes then get/set pageblock needs updating. + */ NR_PAGEBLOCK_BITS }; @@ -62,11 +65,33 @@ extern int pageblock_order; /* Forward declaration */ struct page; +unsigned long get_pageblock_flags_mask(struct page *page, + unsigned long end_bitidx, + unsigned long mask); +void set_pageblock_flags_mask(struct page *page, + unsigned long flags, + unsigned long end_bitidx, + unsigned long mask); + /* Declarations for getting and setting flags. 
See mm/page_alloc.c */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx); -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx); +static inline unsigned long get_pageblock_flags_group(struct page *page, + int start_bitidx, int end_bitidx) +{ + unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; + unsigned long mask = (1 << nr_flag_bits) - 1; + + return get_pageblock_flags_mask(page, end_bitidx, mask); +} + +static inline void set_pageblock_flags_group(struct page *page, + unsigned long flags, + int start_bitidx, int end_bitidx) +{ + unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; + unsigned long mask = (1 << nr_flag_bits) - 1; + + set_pageblock_flags_mask(page, flags, end_bitidx, mask); +} #ifdef CONFIG_COMPACTION #define get_pageblock_skip(page) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 485932c57..6e93780 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6028,53 +6028,65 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) * @end_bitidx: The last bit of interest * returns pageblock_bits flags */ -unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) +unsigned long get_pageblock_flags_mask(struct page *page, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long flags = 0; - unsigned long value = 1; + unsigned long pfn, bitidx, word_bitidx; + unsigned long word; zone = page_zone(page); pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (test_bit(bitidx + start_bitidx, bitmap)) - flags |= value; - - return flags; + word = bitmap[word_bitidx]; + bitidx += end_bitidx; + return (word >> (BITS_PER_LONG 
- bitidx - 1)) & mask; } /** - * set_pageblock_flags_group - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @start_bitidx: The first bit of interest * @end_bitidx: The last bit of interest * @flags: The flags to set */ -void set_pageblock_flags_group(struct page *page, unsigned long flags, - int start_bitidx, int end_bitidx) +void set_pageblock_flags_mask(struct page *page, unsigned long flags, + unsigned long end_bitidx, + unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx; - unsigned long value = 1; + unsigned long pfn, bitidx, word_bitidx; + unsigned long old_word, word; + + BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); + word_bitidx = bitidx / BITS_PER_LONG; + bitidx &= (BITS_PER_LONG-1); + VM_BUG_ON_PAGE(!zone_spans_pfn(zone, pfn), page); - for (; start_bitidx <= end_bitidx; start_bitidx++, value <<= 1) - if (flags & value) - __set_bit(bitidx + start_bitidx, bitmap); - else - __clear_bit(bitidx + start_bitidx, bitmap); + bitidx += end_bitidx; + mask <<= (BITS_PER_LONG - bitidx - 1); + flags <<= (BITS_PER_LONG - bitidx - 1); + + word = ACCESS_ONCE(bitmap[word_bitidx]); + for (;;) { + old_word = cmpxchg(&bitmap[word_bitidx], word, (word & ~mask) | flags); + if (word == old_word) + break; + word = old_word; + } } /* -- cgit v0.10.2 From dc4b0caff24d9b2918e9f27bc65499ee63187eba Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:17 -0700 Subject: mm: page_alloc: reduce number of times page_to_pfn is called In the free path we calculate page_to_pfn multiple times. Reduce that. 
Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Acked-by: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 8ef1e3f..472426a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -78,10 +78,15 @@ extern int page_group_by_mobility_disabled; #define NR_MIGRATETYPE_BITS (PB_migrate_end - PB_migrate + 1) #define MIGRATETYPE_MASK ((1UL << NR_MIGRATETYPE_BITS) - 1) -static inline int get_pageblock_migratetype(struct page *page) +#define get_pageblock_migratetype(page) \ + get_pfnblock_flags_mask(page, page_to_pfn(page), \ + PB_migrate_end, MIGRATETYPE_MASK) + +static inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn) { BUILD_BUG_ON(PB_migrate_end - PB_migrate != 2); - return get_pageblock_flags_mask(page, PB_migrate_end, MIGRATETYPE_MASK); + return get_pfnblock_flags_mask(page, pfn, PB_migrate_end, + MIGRATETYPE_MASK); } struct free_area { diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index c08730c..2baeee1 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -65,33 +65,26 @@ extern int pageblock_order; /* Forward declaration */ struct page; -unsigned long get_pageblock_flags_mask(struct page *page, +unsigned long get_pfnblock_flags_mask(struct page *page, + unsigned long pfn, unsigned long end_bitidx, unsigned long mask); -void set_pageblock_flags_mask(struct page *page, + +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, unsigned long end_bitidx, unsigned long mask); /* Declarations for getting and setting flags. 
See mm/page_alloc.c */ -static inline unsigned long get_pageblock_flags_group(struct page *page, - int start_bitidx, int end_bitidx) -{ - unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; - unsigned long mask = (1 << nr_flag_bits) - 1; - - return get_pageblock_flags_mask(page, end_bitidx, mask); -} - -static inline void set_pageblock_flags_group(struct page *page, - unsigned long flags, - int start_bitidx, int end_bitidx) -{ - unsigned long nr_flag_bits = end_bitidx - start_bitidx + 1; - unsigned long mask = (1 << nr_flag_bits) - 1; - - set_pageblock_flags_mask(page, flags, end_bitidx, mask); -} +#define get_pageblock_flags_group(page, start_bitidx, end_bitidx) \ + get_pfnblock_flags_mask(page, page_to_pfn(page), \ + end_bitidx, \ + (1 << (end_bitidx - start_bitidx + 1)) - 1) +#define set_pageblock_flags_group(page, flags, start_bitidx, end_bitidx) \ + set_pfnblock_flags_mask(page, flags, page_to_pfn(page), \ + end_bitidx, \ + (1 << (end_bitidx - start_bitidx + 1)) - 1) #ifdef CONFIG_COMPACTION #define get_pageblock_skip(page) \ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e93780..6cadc86 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -560,6 +560,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, */ static inline void __free_one_page(struct page *page, + unsigned long pfn, struct zone *zone, unsigned int order, int migratetype) { @@ -576,7 +577,7 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(migratetype == -1); - page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); + page_idx = pfn & ((1 << MAX_ORDER) - 1); VM_BUG_ON_PAGE(page_idx & ((1 << order) - 1), page); VM_BUG_ON_PAGE(bad_range(zone, page), page); @@ -711,7 +712,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, list_del(&page->lru); mt = get_freepage_migratetype(page); /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ - __free_one_page(page, zone, 0, mt); + __free_one_page(page, page_to_pfn(page), zone, 0, mt); 
trace_mm_page_pcpu_drain(page, 0, mt); if (likely(!is_migrate_isolate_page(page))) { __mod_zone_page_state(zone, NR_FREE_PAGES, 1); @@ -723,13 +724,15 @@ static void free_pcppages_bulk(struct zone *zone, int count, spin_unlock(&zone->lock); } -static void free_one_page(struct zone *zone, struct page *page, int order, +static void free_one_page(struct zone *zone, + struct page *page, unsigned long pfn, + int order, int migratetype) { spin_lock(&zone->lock); zone->pages_scanned = 0; - __free_one_page(page, zone, order, migratetype); + __free_one_page(page, pfn, zone, order, migratetype); if (unlikely(!is_migrate_isolate(migratetype))) __mod_zone_freepage_state(zone, 1 << order, migratetype); spin_unlock(&zone->lock); @@ -766,15 +769,16 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int migratetype; + unsigned long pfn = page_to_pfn(page); if (!free_pages_prepare(page, order)) return; local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); - free_one_page(page_zone(page), page, order, migratetype); + free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); } @@ -1380,12 +1384,13 @@ void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; + unsigned long pfn = page_to_pfn(page); int migratetype; if (!free_pages_prepare(page, 0)) return; - migratetype = get_pageblock_migratetype(page); + migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); local_irq_save(flags); __count_vm_event(PGFREE); @@ -1399,7 +1404,7 @@ void free_hot_cold_page(struct page *page, int cold) */ if (migratetype >= MIGRATE_PCPTYPES) { if (unlikely(is_migrate_isolate(migratetype))) { - free_one_page(zone, page, 0, migratetype); + free_one_page(zone, page, 
pfn, 0, migratetype); goto out; } migratetype = MIGRATE_MOVABLE; @@ -6028,17 +6033,16 @@ static inline int pfn_to_bitidx(struct zone *zone, unsigned long pfn) * @end_bitidx: The last bit of interest * returns pageblock_bits flags */ -unsigned long get_pageblock_flags_mask(struct page *page, +unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx, word_bitidx; + unsigned long bitidx, word_bitidx; unsigned long word; zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; @@ -6050,25 +6054,25 @@ unsigned long get_pageblock_flags_mask(struct page *page, } /** - * set_pageblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages + * set_pfnblock_flags_mask - Set the requested group of flags for a pageblock_nr_pages block of pages * @page: The page within the block of interest * @start_bitidx: The first bit of interest * @end_bitidx: The last bit of interest * @flags: The flags to set */ -void set_pageblock_flags_mask(struct page *page, unsigned long flags, +void set_pfnblock_flags_mask(struct page *page, unsigned long flags, + unsigned long pfn, unsigned long end_bitidx, unsigned long mask) { struct zone *zone; unsigned long *bitmap; - unsigned long pfn, bitidx, word_bitidx; + unsigned long bitidx, word_bitidx; unsigned long old_word, word; BUILD_BUG_ON(NR_PAGEBLOCK_BITS != 4); zone = page_zone(page); - pfn = page_to_pfn(page); bitmap = get_pageblock_bitmap(zone, pfn); bitidx = pfn_to_bitidx(zone, pfn); word_bitidx = bitidx / BITS_PER_LONG; -- cgit v0.10.2 From cfc47a2803db42140167b92d991ef04018e162c7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:19 -0700 Subject: mm: page_alloc: lookup pageblock migratetype with IRQs enabled during free get_pageblock_migratetype() is 
called during free with IRQs disabled. This is unnecessary and disables IRQs for longer than necessary. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Acked-by: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6cadc86..ce4d371 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -774,9 +774,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) if (!free_pages_prepare(page, order)) return; + migratetype = get_pfnblock_migratetype(page, pfn); local_irq_save(flags); __count_vm_events(PGFREE, 1 << order); - migratetype = get_pfnblock_migratetype(page, pfn); set_freepage_migratetype(page, migratetype); free_one_page(page_zone(page), page, pfn, order, migratetype); local_irq_restore(flags); -- cgit v0.10.2 From 7aeb09f9104b760fc53c98cb7d20d06640baf9e6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:21 -0700 Subject: mm: page_alloc: use unsigned int for order in more places X86 prefers the use of unsigned types for iterators and there is a tendency to mix whether a signed or unsigned type is used for page order. This converts a number of sites in mm/page_alloc.c to use unsigned int for order where possible. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 472426a..6cbd1b6 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -817,10 +817,10 @@ static inline bool pgdat_is_empty(pg_data_t *pgdat) extern struct mutex zonelists_mutex; void build_all_zonelists(pg_data_t *pgdat, struct zone *zone); void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags); -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags); +bool zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, MEMMAP_HOTPLUG, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ce4d371..37ef1b87 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -409,7 +409,8 @@ static int destroy_compound_page(struct page *page, unsigned long order) return bad; } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +static inline void prep_zero_page(struct page *page, unsigned int order, + gfp_t gfp_flags) { int i; @@ -453,7 +454,7 @@ static inline void set_page_guard_flag(struct page *page) { } static inline void clear_page_guard_flag(struct page *page) { } #endif -static inline void set_page_order(struct page *page, int order) +static inline void set_page_order(struct page *page, unsigned int order) { set_page_private(page, order); __SetPageBuddy(page); @@ -504,7 +505,7 @@ __find_buddy_index(unsigned long page_idx, unsigned int order) * For recording page's order, we use page_private(page). 
*/ static inline int page_is_buddy(struct page *page, struct page *buddy, - int order) + unsigned int order) { if (!pfn_valid_within(page_to_pfn(buddy))) return 0; @@ -726,7 +727,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, static void free_one_page(struct zone *zone, struct page *page, unsigned long pfn, - int order, + unsigned int order, int migratetype) { spin_lock(&zone->lock); @@ -897,7 +898,7 @@ static inline int check_new_page(struct page *page) return 0; } -static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) +static int prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags) { int i; @@ -1108,16 +1109,17 @@ static int try_to_steal_freepages(struct zone *zone, struct page *page, /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype) +__rmqueue_fallback(struct zone *zone, unsigned int order, int start_migratetype) { struct free_area *area; - int current_order; + unsigned int current_order; struct page *page; int migratetype, new_type, i; /* Find the largest possible block of pages in the other list */ - for (current_order = MAX_ORDER-1; current_order >= order; - --current_order) { + for (current_order = MAX_ORDER-1; + current_order >= order && current_order <= MAX_ORDER-1; + --current_order) { for (i = 0;; i++) { migratetype = fallbacks[start_migratetype][i]; @@ -1345,7 +1347,7 @@ void mark_free_pages(struct zone *zone) { unsigned long pfn, max_zone_pfn; unsigned long flags; - int order, t; + unsigned int order, t; struct list_head *curr; if (zone_is_empty(zone)) @@ -1541,8 +1543,8 @@ int split_free_page(struct page *page) */ static inline struct page *buffered_rmqueue(struct zone *preferred_zone, - struct zone *zone, int order, gfp_t gfp_flags, - int migratetype) + struct zone *zone, unsigned int order, + gfp_t gfp_flags, int migratetype) { unsigned long flags; struct page *page; @@ 
-1691,8 +1693,9 @@ static inline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags, long free_pages) +static bool __zone_watermark_ok(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags, + long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; @@ -1726,15 +1729,15 @@ static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, return true; } -bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark, int classzone_idx, int alloc_flags) { return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, zone_page_state(z, NR_FREE_PAGES)); } -bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +bool zone_watermark_ok_safe(struct zone *z, unsigned int order, + unsigned long mark, int classzone_idx, int alloc_flags) { long free_pages = zone_page_state(z, NR_FREE_PAGES); @@ -4121,7 +4124,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, static void __meminit zone_init_free_lists(struct zone *zone) { - int order, t; + unsigned int order, t; for_each_migratetype_order(order, t) { INIT_LIST_HEAD(&zone->free_area[order].free_list[t]); zone->free_area[order].nr_free = 0; @@ -6444,7 +6447,7 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) { struct page *page; struct zone *zone; - int order, i; + unsigned int order, i; unsigned long pfn; unsigned long flags; /* find the first valid pfn */ @@ -6496,7 +6499,7 @@ bool is_free_buddy_page(struct page *page) struct zone *zone = page_zone(page); unsigned long pfn = page_to_pfn(page); unsigned long flags; 
- int order; + unsigned int order; spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { -- cgit v0.10.2 From b745bc85f21ea707e4ea1a91948055fa3e72c77b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:22 -0700 Subject: mm: page_alloc: convert hot/cold parameter and immediate callers to bool cold is a bool, make it one. Make the likely case the "if" part of the block instead of the else as according to the optimisation manual this is preferred. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/tile/mm/homecache.c b/arch/tile/mm/homecache.c index 004ba56..33294fd 100644 --- a/arch/tile/mm/homecache.c +++ b/arch/tile/mm/homecache.c @@ -417,7 +417,7 @@ void __homecache_free_pages(struct page *page, unsigned int order) if (put_page_testzero(page)) { homecache_change_page_home(page, order, PAGE_HOME_HASH); if (order == 0) { - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); } else { init_page_count(page); __free_pages(page, order); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index aac71ce..098f97b 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1614,7 +1614,7 @@ out_finish: static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) { - release_pages(req->pages, req->num_pages, 0); + release_pages(req->pages, req->num_pages, false); } static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d382db7..454c99f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -371,8 +371,8 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned 
int order); -extern void free_hot_cold_page(struct page *page, int cold); -extern void free_hot_cold_page_list(struct list_head *list, int cold); +extern void free_hot_cold_page(struct page *page, bool cold); +extern void free_hot_cold_page_list(struct list_head *list, bool cold); extern void __free_kmem_pages(struct page *page, unsigned int order); extern void free_kmem_pages(unsigned long addr, unsigned int order); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 718214c..c16fb6d 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -110,7 +110,7 @@ static inline void mapping_set_gfp_mask(struct address_space *m, gfp_t mask) #define page_cache_get(page) get_page(page) #define page_cache_release(page) put_page(page) -void release_pages(struct page **pages, int nr, int cold); +void release_pages(struct page **pages, int nr, bool cold); /* * speculatively take a reference to a page. diff --git a/include/linux/swap.h b/include/linux/swap.h index 9155bcd..97cf161 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -477,7 +477,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) #define free_page_and_swap_cache(page) \ page_cache_release(page) #define free_pages_and_swap_cache(pages, nr) \ - release_pages((pages), (nr), 0); + release_pages((pages), (nr), false); static inline void show_swap_cache_info(void) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 37ef1b87..09345ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1199,7 +1199,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype, int cold) + int migratetype, bool cold) { int i; @@ -1218,7 +1218,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. 
*/ - if (likely(cold == 0)) + if (likely(!cold)) list_add(&page->lru, list); else list_add_tail(&page->lru, list); @@ -1379,9 +1379,9 @@ void mark_free_pages(struct zone *zone) /* * Free a 0-order page - * cold == 1 ? free a cold page : free a hot page + * cold == true ? free a cold page : free a hot page */ -void free_hot_cold_page(struct page *page, int cold) +void free_hot_cold_page(struct page *page, bool cold) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -1413,10 +1413,10 @@ void free_hot_cold_page(struct page *page, int cold) } pcp = &this_cpu_ptr(zone->pageset)->pcp; - if (cold) - list_add_tail(&page->lru, &pcp->lists[migratetype]); - else + if (!cold) list_add(&page->lru, &pcp->lists[migratetype]); + else + list_add_tail(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { unsigned long batch = ACCESS_ONCE(pcp->batch); @@ -1431,7 +1431,7 @@ out: /* * Free a list of 0-order pages */ -void free_hot_cold_page_list(struct list_head *list, int cold) +void free_hot_cold_page_list(struct list_head *list, bool cold) { struct page *page, *next; @@ -1548,7 +1548,7 @@ struct page *buffered_rmqueue(struct zone *preferred_zone, { unsigned long flags; struct page *page; - int cold = !!(gfp_flags & __GFP_COLD); + bool cold = ((gfp_flags & __GFP_COLD) != 0); again: if (likely(order == 0)) { @@ -2823,7 +2823,7 @@ void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { if (order == 0) - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); else __free_pages_ok(page, order); } diff --git a/mm/swap.c b/mm/swap.c index c8d6df5..11ebb97 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -67,7 +67,7 @@ static void __page_cache_release(struct page *page) static void __put_single_page(struct page *page) { __page_cache_release(page); - free_hot_cold_page(page, 0); + free_hot_cold_page(page, false); } static void __put_compound_page(struct page *page) @@ -860,7 +860,7 @@ void 
lru_add_drain_all(void) * grabbed the page via the LRU. If it did, give up: shrink_inactive_list() * will free it. */ -void release_pages(struct page **pages, int nr, int cold) +void release_pages(struct page **pages, int nr, bool cold) { int i; LIST_HEAD(pages_to_free); diff --git a/mm/swap_state.c b/mm/swap_state.c index e76ace3..2972eee 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -270,7 +270,7 @@ void free_pages_and_swap_cache(struct page **pages, int nr) for (i = 0; i < todo; i++) free_swap_cache(pagep[i]); - release_pages(pagep, todo, 0); + release_pages(pagep, todo, false); pagep += todo; nr -= todo; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 9253e18..494cd63 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1121,7 +1121,7 @@ keep: VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page); } - free_hot_cold_page_list(&free_pages, 1); + free_hot_cold_page_list(&free_pages, true); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); @@ -1532,7 +1532,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&page_list, 1); + free_hot_cold_page_list(&page_list, true); /* * If reclaim is isolating dirty pages under writeback, it implies @@ -1755,7 +1755,7 @@ static void shrink_active_list(unsigned long nr_to_scan, __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - free_hot_cold_page_list(&l_hold, 1); + free_hot_cold_page_list(&l_hold, true); } #ifdef CONFIG_SWAP -- cgit v0.10.2 From 07a427884348d38a6fd56fa4d78249c407196650 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:24 -0700 Subject: mm: shmem: avoid atomic operation during shmem_getpage_gfp shmem_getpage_gfp uses an atomic operation to set the SwapBacked field before it's even added to the LRU or visible. This is unnecessary as what could it possible race against? Use an unlocked variant. 
Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Acked-by: Rik van Riel Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index d1fe1a7..4d4b39a 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -208,6 +208,7 @@ PAGEFLAG(Pinned, pinned) TESTSCFLAG(Pinned, pinned) /* Xen */ PAGEFLAG(SavePinned, savepinned); /* Xen */ PAGEFLAG(Reserved, reserved) __CLEARPAGEFLAG(Reserved, reserved) PAGEFLAG(SwapBacked, swapbacked) __CLEARPAGEFLAG(SwapBacked, swapbacked) + __SETPAGEFLAG(SwapBacked, swapbacked) __PAGEFLAG(SlobFree, slob_free) diff --git a/mm/shmem.c b/mm/shmem.c index 9f70e02..f47fb38 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1132,7 +1132,7 @@ repeat: goto decused; } - SetPageSwapBacked(page); + __SetPageSwapBacked(page); __set_page_locked(page); error = mem_cgroup_charge_file(page, current->mm, gfp & GFP_RECLAIM_MASK); -- cgit v0.10.2 From e3741b506c5088fa8c911bb5884c430f770fb49d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:26 -0700 Subject: mm: do not use atomic operations when releasing pages There should be no references to it any more and a parallel mark should not be reordered against us. Use non-locked varient to clear page active. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swap.c b/mm/swap.c index 11ebb97..30b6a37 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -901,7 +901,7 @@ void release_pages(struct page **pages, int nr, bool cold) } /* Clear Active bit in case of parallel mark_page_accessed */ - ClearPageActive(page); + __ClearPageActive(page); list_add(&page->lru, &pages_to_free); } -- cgit v0.10.2 From 6fb81a17d21f2a138b8f424af4cf379f2b694060 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:28 -0700 Subject: mm: do not use unnecessary atomic operations when adding pages to the LRU When adding pages to the LRU we clear the active bit unconditionally. As the page could be reachable from other paths we cannot use unlocked operations without risk of corruption such as a parallel mark_page_accessed. This patch tests if it is necessary to clear the active flag before using an atomic operation. This potentially opens a tiny race when PageActive is checked as mark_page_accessed could be called after PageActive was checked. The race already exists but this patch changes it slightly. The consequence is that the page may be promoted to the active list that might have been left on the inactive list before the patch. It's too tiny a race and too marginal a consequence to always use atomic operations for. Signed-off-by: Mel Gorman Acked-by: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E.
McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swap.c b/mm/swap.c index 30b6a37..1fb25f8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -631,13 +631,15 @@ static void __lru_cache_add(struct page *page) */ void lru_cache_add_anon(struct page *page) { - ClearPageActive(page); + if (PageActive(page)) + ClearPageActive(page); __lru_cache_add(page); } void lru_cache_add_file(struct page *page) { - ClearPageActive(page); + if (PageActive(page)) + ClearPageActive(page); __lru_cache_add(page); } EXPORT_SYMBOL(lru_cache_add_file); -- cgit v0.10.2 From e7470ee89f003634a88e7b5e5a7b65b3025987de Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:29 -0700 Subject: fs: buffer: do not use unnecessary atomic operations when discarding buffers Discarding buffers uses a bunch of atomic operations when discarding buffers because ...... I can't think of a reason. Use a cmpxchg loop to clear all the necessary flags. In most (all?) cases this will be a single atomic operations. [akpm@linux-foundation.org: move BUFFER_FLAGS_DISCARD into the .c file] Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/buffer.c b/fs/buffer.c index e33f8d5..0d3e8d5 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1483,16 +1483,27 @@ EXPORT_SYMBOL(set_bh_page); /* * Called when truncating a buffer on a page completely. 
*/ + +/* Bits that are cleared during an invalidate */ +#define BUFFER_FLAGS_DISCARD \ + (1 << BH_Mapped | 1 << BH_New | 1 << BH_Req | \ + 1 << BH_Delay | 1 << BH_Unwritten) + static void discard_buffer(struct buffer_head * bh) { + unsigned long b_state, b_state_old; + lock_buffer(bh); clear_buffer_dirty(bh); bh->b_bdev = NULL; - clear_buffer_mapped(bh); - clear_buffer_req(bh); - clear_buffer_new(bh); - clear_buffer_delay(bh); - clear_buffer_unwritten(bh); + b_state = bh->b_state; + for (;;) { + b_state_old = cmpxchg(&bh->b_state, b_state, + (b_state & ~BUFFER_FLAGS_DISCARD)); + if (b_state_old == b_state) + break; + b_state = b_state_old; + } unlock_buffer(bh); } -- cgit v0.10.2 From 2457aec63745e235bcafb7ef312b182d8682f0fc Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:31 -0700 Subject: mm: non-atomically mark page accessed during page cache allocation where possible aops->write_begin may allocate a new page and make it visible only to have mark_page_accessed called almost immediately after. Once the page is visible the atomic operations are necessary which is noticeable overhead when writing to an in-memory filesystem like tmpfs but should also be noticeable with fast storage. The objective of the patch is to initialise the accessed information with non-atomic operations before the page is visible. The bulk of filesystems directly or indirectly use grab_cache_page_write_begin or find_or_create_page for the initial allocation of a page cache page. This patch adds an init_page_accessed() helper which behaves like the first call to mark_page_accessed() but may be called before the page is visible and can be done non-atomically. The primary APIs of concern in this case are the following and are used by most filesystems.
find_get_page find_lock_page find_or_create_page grab_cache_page_nowait grab_cache_page_write_begin All of them are very similar in detail so the patch creates a core helper pagecache_get_page() which takes a flags parameter that affects its behavior such as whether the page should be marked accessed or not. The old API is preserved but is basically a thin wrapper around this core function. Each of the filesystems are then updated to avoid calling mark_page_accessed when it is known that the VM interfaces have already done the job. There is a slight snag in that the timing of the mark_page_accessed() has now changed so in rare cases it's possible a page gets to the end of the LRU as PageReferenced whereas previously it might have been repromoted. This is expected to be rare but it's worth the filesystem people thinking about it in case they see a problem with the timing change. It is also the case that some filesystems may be marking pages accessed that previously did not but it makes sense that filesystems have consistent behaviour in this regard. The test case used to evaluate this is a simple dd of a large file done multiple times with the file deleted on each iteration. The size of the file is 1/10th physical memory to avoid dirty page balancing. In the async case it will be possible that the workload completes without even hitting the disk and will have variable results but highlight the impact of mark_page_accessed for async IO. The sync results are expected to be more stable. The exception is tmpfs where the normal case is for the "IO" to not hit the disk. The test machine was single socket and UMA to avoid any scheduling or NUMA artifacts. Throughput and wall times are presented for sync IO, only wall times are shown for async as the granularity reported by dd and the variability is unsuitable for comparison. As async results were variable due to writeback timings, I'm only reporting the maximum figures.
The sync results were stable enough to make the mean and stddev uninteresting. The performance results are reported based on a run with no profiling. Profile data is based on a separate run with oprofile running. async dd 3.15.0-rc3 3.15.0-rc3 vanilla accessed-v2 ext3 Max elapsed 13.9900 ( 0.00%) 11.5900 ( 17.16%) tmpfs Max elapsed 0.5100 ( 0.00%) 0.4900 ( 3.92%) btrfs Max elapsed 12.8100 ( 0.00%) 12.7800 ( 0.23%) ext4 Max elapsed 18.6000 ( 0.00%) 13.3400 ( 28.28%) xfs Max elapsed 12.5600 ( 0.00%) 2.0900 ( 83.36%) The XFS figure is a bit strange as it managed to avoid a worst case by sheer luck but the average figures looked reasonable. samples percentage ext3 86107 0.9783 vmlinux-3.15.0-rc4-vanilla mark_page_accessed ext3 23833 0.2710 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed ext3 5036 0.0573 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed ext4 64566 0.8961 vmlinux-3.15.0-rc4-vanilla mark_page_accessed ext4 5322 0.0713 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed ext4 2869 0.0384 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed xfs 62126 1.7675 vmlinux-3.15.0-rc4-vanilla mark_page_accessed xfs 1904 0.0554 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed xfs 103 0.0030 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed btrfs 10655 0.1338 vmlinux-3.15.0-rc4-vanilla mark_page_accessed btrfs 2020 0.0273 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed btrfs 587 0.0079 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed tmpfs 59562 3.2628 vmlinux-3.15.0-rc4-vanilla mark_page_accessed tmpfs 1210 0.0696 vmlinux-3.15.0-rc4-accessed-v3r25 init_page_accessed tmpfs 94 0.0054 vmlinux-3.15.0-rc4-accessed-v3r25 mark_page_accessed [akpm@linux-foundation.org: don't run init_page_accessed() against an uninitialised pointer] Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Rik van Riel Cc: Peter Zijlstra Tested-by: Prabhakar Lad Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f29a54e..4cd0ac9 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4510,7 +4510,8 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb) +static void mark_extent_buffer_accessed(struct extent_buffer *eb, + struct page *accessed) { unsigned long num_pages, i; @@ -4519,7 +4520,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb) num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { struct page *p = extent_buffer_page(eb, i); - mark_page_accessed(p); + if (p != accessed) + mark_page_accessed(p); } } @@ -4533,7 +4535,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, start >> PAGE_CACHE_SHIFT); if (eb && atomic_inc_not_zero(&eb->refs)) { rcu_read_unlock(); - mark_extent_buffer_accessed(eb); + mark_extent_buffer_accessed(eb, NULL); return eb; } rcu_read_unlock(); @@ -4581,7 +4583,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, spin_unlock(&mapping->private_lock); unlock_page(p); page_cache_release(p); - mark_extent_buffer_accessed(exists); + mark_extent_buffer_accessed(exists, p); goto free_eb; } @@ -4596,7 +4598,6 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, attach_extent_buffer_page(eb, p); spin_unlock(&mapping->private_lock); WARN_ON(PageDirty(p)); - mark_page_accessed(p); eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ae6af07..74272a3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -470,11 +470,12 @@ static void btrfs_drop_pages(struct page **pages, size_t num_pages) for (i = 0; i < num_pages; i++) { /* page checked is some magic around 
finding pages that * have been modified without going through btrfs_set_page_dirty - * clear it here + * clear it here. There should be no need to mark the pages + * accessed as prepare_pages should have marked them accessed + * in prepare_pages via find_or_create_page() */ ClearPageChecked(pages[i]); unlock_page(pages[i]); - mark_page_accessed(pages[i]); page_cache_release(pages[i]); } } diff --git a/fs/buffer.c b/fs/buffer.c index 0d3e8d5..eba6e4f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -227,7 +227,7 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) int all_mapped = 1; index = block >> (PAGE_CACHE_SHIFT - bd_inode->i_blkbits); - page = find_get_page(bd_mapping, index); + page = find_get_page_flags(bd_mapping, index, FGP_ACCESSED); if (!page) goto out; @@ -1366,12 +1366,13 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { + /* __find_get_block_slow will mark the page accessed */ bh = __find_get_block_slow(bdev, block); if (bh) bh_lru_install(bh); - } - if (bh) + } else touch_buffer(bh); + return bh; } EXPORT_SYMBOL(__find_get_block); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c8238a2..afe8a13 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1044,6 +1044,8 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) * allocating. If we are looking at the buddy cache we would * have taken a reference using ext4_mb_load_buddy and that * would have pinned buddy page to page cache. + * The call to ext4_mb_get_buddy_page_lock will mark the + * page accessed. 
*/ ret = ext4_mb_get_buddy_page_lock(sb, group, &e4b); if (ret || !EXT4_MB_GRP_NEED_INIT(this_grp)) { @@ -1062,7 +1064,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ret = -EIO; goto err; } - mark_page_accessed(page); if (e4b.bd_buddy_page == NULL) { /* @@ -1082,7 +1083,6 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) ret = -EIO; goto err; } - mark_page_accessed(page); err: ext4_mb_put_buddy_page_lock(&e4b); return ret; @@ -1141,7 +1141,7 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, /* we could use find_or_create_page(), but it locks page * what we'd like to avoid in fast path ... */ - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) /* @@ -1176,15 +1176,16 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_bitmap_page = page; e4b->bd_bitmap = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); block++; pnum = block / blocks_per_page; poff = block % blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); + page = find_get_page_flags(inode->i_mapping, pnum, FGP_ACCESSED); if (page == NULL || !PageUptodate(page)) { if (page) page_cache_release(page); @@ -1209,9 +1210,10 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, ret = -EIO; goto err; } + + /* Pages marked accessed already */ e4b->bd_buddy_page = page; e4b->bd_buddy = page_address(page) + (poff * sb->s_blocksize); - mark_page_accessed(page); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 4aa521a..c405b8f 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -69,7 +69,6 @@ repeat: goto repeat; } out: - mark_page_accessed(page); return page; } @@ -137,13 +136,11 @@ int ra_meta_pages(struct 
f2fs_sb_info *sbi, int start, int nrpages, int type) if (!page) continue; if (PageUptodate(page)) { - mark_page_accessed(page); f2fs_put_page(page, 1); continue; } f2fs_submit_page_mbio(sbi, page, blk_addr, &fio); - mark_page_accessed(page); f2fs_put_page(page, 0); } out: diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a161e95..57caa6e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -967,7 +967,6 @@ repeat: goto repeat; } got_it: - mark_page_accessed(page); return page; } @@ -1022,7 +1021,6 @@ page_hit: f2fs_put_page(page, 1); return ERR_PTR(-EIO); } - mark_page_accessed(page); return page; } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index f680d2c..903cbc9 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1089,8 +1089,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); flush_dcache_page(page); - mark_page_accessed(page); - if (!tmp) { unlock_page(page); page_cache_release(page); diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5a49b03..492123c 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -577,7 +577,6 @@ int gfs2_internal_read(struct gfs2_inode *ip, char *buf, loff_t *pos, p = kmap_atomic(page); memcpy(buf + copied, p + offset, amt); kunmap_atomic(p); - mark_page_accessed(page); page_cache_release(page); copied += amt; index++; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 2cf09b6..b984a6e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -136,7 +136,8 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) yield(); } } else { - page = find_lock_page(mapping, index); + page = find_get_page_flags(mapping, index, + FGP_LOCK|FGP_ACCESSED); if (!page) return NULL; } @@ -153,7 +154,6 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) map_bh(bh, sdp->sd_vfs, blkno); unlock_page(page); - mark_page_accessed(page); page_cache_release(page); return bh; diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c index 
a27e3fe..250ed5b 100644 --- a/fs/ntfs/attrib.c +++ b/fs/ntfs/attrib.c @@ -1748,7 +1748,6 @@ int ntfs_attr_make_non_resident(ntfs_inode *ni, const u32 data_size) if (page) { set_page_dirty(page); unlock_page(page); - mark_page_accessed(page); page_cache_release(page); } ntfs_debug("Done."); diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index db9bd8a..86ddab9 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -2060,7 +2060,6 @@ static ssize_t ntfs_file_buffered_write(struct kiocb *iocb, } do { unlock_page(pages[--do_pages]); - mark_page_accessed(pages[do_pages]); page_cache_release(pages[do_pages]); } while (do_pages); if (unlikely(status)) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4d4b39a..2093eb7 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,6 +198,7 @@ struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) + __SETPAGEFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) PAGEFLAG(Active, active) __CLEARPAGEFLAG(Active, active) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c16fb6d..0a97b58 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -259,12 +259,109 @@ pgoff_t page_cache_next_hole(struct address_space *mapping, pgoff_t page_cache_prev_hole(struct address_space *mapping, pgoff_t index, unsigned long max_scan); +#define FGP_ACCESSED 0x00000001 +#define FGP_LOCK 0x00000002 +#define FGP_CREAT 0x00000004 +#define FGP_WRITE 0x00000008 +#define FGP_NOFS 0x00000010 +#define FGP_NOWAIT 0x00000020 + +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, + int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask); + +/** + * find_get_page - find and get a page reference + * @mapping: the address_space to 
search + * @offset: the page index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned with an increased refcount. + * + * Otherwise, %NULL is returned. + */ +static inline struct page *find_get_page(struct address_space *mapping, + pgoff_t offset) +{ + return pagecache_get_page(mapping, offset, 0, 0, 0); +} + +static inline struct page *find_get_page_flags(struct address_space *mapping, + pgoff_t offset, int fgp_flags) +{ + return pagecache_get_page(mapping, offset, fgp_flags, 0, 0); +} + +/** + * find_lock_page - locate, pin and lock a pagecache page + * pagecache_get_page - find and get a page reference + * @mapping: the address_space to search + * @offset: the page index + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * Otherwise, %NULL is returned. + * + * find_lock_page() may sleep. + */ +static inline struct page *find_lock_page(struct address_space *mapping, + pgoff_t offset) +{ + return pagecache_get_page(mapping, offset, FGP_LOCK, 0, 0); +} + +/** + * find_or_create_page - locate or add a pagecache page + * @mapping: the page's address_space + * @index: the page's index into the mapping + * @gfp_mask: page allocation mode + * + * Looks up the page cache slot at @mapping & @offset. If there is a + * page cache page, it is returned locked and with an increased + * refcount. + * + * If the page is not present, a new page is allocated using @gfp_mask + * and added to the page cache and the VM's LRU list. The page is + * returned locked and with an increased refcount. + * + * On memory exhaustion, %NULL is returned. + * + * find_or_create_page() may sleep, even if @gfp_flags specifies an + * atomic allocation! 
+ */ +static inline struct page *find_or_create_page(struct address_space *mapping, + pgoff_t offset, gfp_t gfp_mask) +{ + return pagecache_get_page(mapping, offset, + FGP_LOCK|FGP_ACCESSED|FGP_CREAT, + gfp_mask, gfp_mask & GFP_RECLAIM_MASK); +} + +/** + * grab_cache_page_nowait - returns locked page at given index in given cache + * @mapping: target address_space + * @index: the page index + * + * Same as grab_cache_page(), but do not wait if the page is unavailable. + * This is intended for speculative data generators, where the data can + * be regenerated if the page couldn't be grabbed. This routine should + * be safe to call while holding the lock for another page. + * + * Clear __GFP_FS when allocating the page to avoid recursion into the fs + * and deadlock against the caller's locked page. + */ +static inline struct page *grab_cache_page_nowait(struct address_space *mapping, + pgoff_t index) +{ + return pagecache_get_page(mapping, index, + FGP_LOCK|FGP_CREAT|FGP_NOFS|FGP_NOWAIT, + mapping_gfp_mask(mapping), + GFP_NOFS); +} + struct page *find_get_entry(struct address_space *mapping, pgoff_t offset); -struct page *find_get_page(struct address_space *mapping, pgoff_t offset); struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset); -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset); -struct page *find_or_create_page(struct address_space *mapping, pgoff_t index, - gfp_t gfp_mask); unsigned find_get_entries(struct address_space *mapping, pgoff_t start, unsigned int nr_entries, struct page **entries, pgoff_t *indices); @@ -287,8 +384,6 @@ static inline struct page *grab_cache_page(struct address_space *mapping, return find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); } -extern struct page * grab_cache_page_nowait(struct address_space *mapping, - pgoff_t index); extern struct page * read_cache_page(struct address_space *mapping, pgoff_t index, filler_t *filler, void *data); extern struct page * 
read_cache_page_gfp(struct address_space *mapping, diff --git a/include/linux/swap.h b/include/linux/swap.h index 97cf161..4348d95 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -311,6 +311,7 @@ extern void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *head); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); +extern void init_page_accessed(struct page *page); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); diff --git a/mm/filemap.c b/mm/filemap.c index 47d235b..0fcd792 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -982,26 +982,6 @@ out: EXPORT_SYMBOL(find_get_entry); /** - * find_get_page - find and get a page reference - * @mapping: the address_space to search - * @offset: the page index - * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned with an increased refcount. - * - * Otherwise, %NULL is returned. - */ -struct page *find_get_page(struct address_space *mapping, pgoff_t offset) -{ - struct page *page = find_get_entry(mapping, offset); - - if (radix_tree_exceptional_entry(page)) - page = NULL; - return page; -} -EXPORT_SYMBOL(find_get_page); - -/** * find_lock_entry - locate, pin and lock a page cache entry * @mapping: the address_space to search * @offset: the page cache index @@ -1038,66 +1018,84 @@ repeat: EXPORT_SYMBOL(find_lock_entry); /** - * find_lock_page - locate, pin and lock a pagecache page + * pagecache_get_page - find and get a page reference * @mapping: the address_space to search * @offset: the page index + * @fgp_flags: PCG flags + * @gfp_mask: gfp mask to use if a page is to be allocated * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. - * - * Otherwise, %NULL is returned. - * - * find_lock_page() may sleep. 
- */ -struct page *find_lock_page(struct address_space *mapping, pgoff_t offset) -{ - struct page *page = find_lock_entry(mapping, offset); - - if (radix_tree_exceptional_entry(page)) - page = NULL; - return page; -} -EXPORT_SYMBOL(find_lock_page); - -/** - * find_or_create_page - locate or add a pagecache page - * @mapping: the page's address_space - * @index: the page's index into the mapping - * @gfp_mask: page allocation mode + * Looks up the page cache slot at @mapping & @offset. * - * Looks up the page cache slot at @mapping & @offset. If there is a - * page cache page, it is returned locked and with an increased - * refcount. + * PCG flags modify how the page is returned * - * If the page is not present, a new page is allocated using @gfp_mask - * and added to the page cache and the VM's LRU list. The page is - * returned locked and with an increased refcount. + * FGP_ACCESSED: the page will be marked accessed + * FGP_LOCK: Page is return locked + * FGP_CREAT: If page is not present then a new page is allocated using + * @gfp_mask and added to the page cache and the VM's LRU + * list. The page is returned locked and with an increased + * refcount. Otherwise, %NULL is returned. * - * On memory exhaustion, %NULL is returned. + * If FGP_LOCK or FGP_CREAT are specified then the function may sleep even + * if the GFP flags specified for FGP_CREAT are atomic. * - * find_or_create_page() may sleep, even if @gfp_flags specifies an - * atomic allocation! + * If there is a page cache page, it is returned with an increased refcount. 
*/ -struct page *find_or_create_page(struct address_space *mapping, - pgoff_t index, gfp_t gfp_mask) +struct page *pagecache_get_page(struct address_space *mapping, pgoff_t offset, + int fgp_flags, gfp_t cache_gfp_mask, gfp_t radix_gfp_mask) { struct page *page; - int err; + repeat: - page = find_lock_page(mapping, index); - if (!page) { - page = __page_cache_alloc(gfp_mask); + page = find_get_entry(mapping, offset); + if (radix_tree_exceptional_entry(page)) + page = NULL; + if (!page) + goto no_page; + + if (fgp_flags & FGP_LOCK) { + if (fgp_flags & FGP_NOWAIT) { + if (!trylock_page(page)) { + page_cache_release(page); + return NULL; + } + } else { + lock_page(page); + } + + /* Has the page been truncated? */ + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + page_cache_release(page); + goto repeat; + } + VM_BUG_ON_PAGE(page->index != offset, page); + } + + if (page && (fgp_flags & FGP_ACCESSED)) + mark_page_accessed(page); + +no_page: + if (!page && (fgp_flags & FGP_CREAT)) { + int err; + if ((fgp_flags & FGP_WRITE) && mapping_cap_account_dirty(mapping)) + cache_gfp_mask |= __GFP_WRITE; + if (fgp_flags & FGP_NOFS) { + cache_gfp_mask &= ~__GFP_FS; + radix_gfp_mask &= ~__GFP_FS; + } + + page = __page_cache_alloc(cache_gfp_mask); if (!page) return NULL; - /* - * We want a regular kernel memory (not highmem or DMA etc) - * allocation for the radix tree nodes, but we need to honour - * the context-specific requirements the caller has asked for. - * GFP_RECLAIM_MASK collects those requirements. 
- */ - err = add_to_page_cache_lru(page, mapping, index, - (gfp_mask & GFP_RECLAIM_MASK)); + + if (WARN_ON_ONCE(!(fgp_flags & FGP_LOCK))) + fgp_flags |= FGP_LOCK; + + /* Init accessed so avoit atomic mark_page_accessed later */ + if (fgp_flags & FGP_ACCESSED) + init_page_accessed(page); + + err = add_to_page_cache_lru(page, mapping, offset, radix_gfp_mask); if (unlikely(err)) { page_cache_release(page); page = NULL; @@ -1105,9 +1103,10 @@ repeat: goto repeat; } } + return page; } -EXPORT_SYMBOL(find_or_create_page); +EXPORT_SYMBOL(pagecache_get_page); /** * find_get_entries - gang pagecache lookup @@ -1404,39 +1403,6 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); -/** - * grab_cache_page_nowait - returns locked page at given index in given cache - * @mapping: target address_space - * @index: the page index - * - * Same as grab_cache_page(), but do not wait if the page is unavailable. - * This is intended for speculative data generators, where the data can - * be regenerated if the page couldn't be grabbed. This routine should - * be safe to call while holding the lock for another page. - * - * Clear __GFP_FS when allocating the page to avoid recursion into the fs - * and deadlock against the caller's locked page. - */ -struct page * -grab_cache_page_nowait(struct address_space *mapping, pgoff_t index) -{ - struct page *page = find_get_page(mapping, index); - - if (page) { - if (trylock_page(page)) - return page; - page_cache_release(page); - return NULL; - } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); - if (page && add_to_page_cache_lru(page, mapping, index, GFP_NOFS)) { - page_cache_release(page); - page = NULL; - } - return page; -} -EXPORT_SYMBOL(grab_cache_page_nowait); - /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. 
Imagine the worst scenario: @@ -2406,7 +2372,6 @@ int pagecache_write_end(struct file *file, struct address_space *mapping, { const struct address_space_operations *aops = mapping->a_ops; - mark_page_accessed(page); return aops->write_end(file, mapping, pos, len, copied, page, fsdata); } EXPORT_SYMBOL(pagecache_write_end); @@ -2488,34 +2453,18 @@ EXPORT_SYMBOL(generic_file_direct_write); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags) { - int status; - gfp_t gfp_mask; struct page *page; - gfp_t gfp_notmask = 0; + int fgp_flags = FGP_LOCK|FGP_ACCESSED|FGP_WRITE|FGP_CREAT; - gfp_mask = mapping_gfp_mask(mapping); - if (mapping_cap_account_dirty(mapping)) - gfp_mask |= __GFP_WRITE; if (flags & AOP_FLAG_NOFS) - gfp_notmask = __GFP_FS; -repeat: - page = find_lock_page(mapping, index); + fgp_flags |= FGP_NOFS; + + page = pagecache_get_page(mapping, index, fgp_flags, + mapping_gfp_mask(mapping), + GFP_KERNEL); if (page) - goto found; + wait_for_stable_page(page); - page = __page_cache_alloc(gfp_mask & ~gfp_notmask); - if (!page) - return NULL; - status = add_to_page_cache_lru(page, mapping, index, - GFP_KERNEL & ~gfp_notmask); - if (unlikely(status)) { - page_cache_release(page); - if (status == -EEXIST) - goto repeat; - return NULL; - } -found: - wait_for_stable_page(page); return page; } EXPORT_SYMBOL(grab_cache_page_write_begin); @@ -2564,7 +2513,7 @@ again: status = a_ops->write_begin(file, mapping, pos, bytes, flags, &page, &fsdata); - if (unlikely(status)) + if (unlikely(status < 0)) break; if (mapping_writably_mapped(mapping)) @@ -2573,7 +2522,6 @@ again: copied = iov_iter_copy_from_user_atomic(page, i, offset, bytes); flush_dcache_page(page); - mark_page_accessed(page); status = a_ops->write_end(file, mapping, pos, bytes, copied, page, fsdata); if (unlikely(status < 0)) diff --git a/mm/shmem.c b/mm/shmem.c index f47fb38..5402481 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1372,9 +1372,13 @@ 
shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { + int ret; struct inode *inode = mapping->host; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); + if (ret == 0 && *pagep) + init_page_accessed(*pagep); + return ret; } static int diff --git a/mm/swap.c b/mm/swap.c index 1fb25f8..9e8e347 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -614,6 +614,17 @@ void mark_page_accessed(struct page *page) } EXPORT_SYMBOL(mark_page_accessed); +/* + * Used to mark_page_accessed(page) that is not visible yet and when it is + * still safe to use non-atomic ops + */ +void init_page_accessed(struct page *page) +{ + if (!PageReferenced(page)) + __SetPageReferenced(page); +} +EXPORT_SYMBOL(init_page_accessed); + static void __lru_cache_add(struct page *page) { struct pagevec *pvec = &get_cpu_var(lru_add_pvec); -- cgit v0.10.2 From d8846374a85f4290a473a4e2a64c1ba046c4a0e1 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:33 -0700 Subject: mm: page_alloc: calculate classzone_idx once from the zonelist ref There is no need to calculate zone_idx(preferred_zone) multiple times or use the pgdat to figure it out. Signed-off-by: Mel Gorman Acked-by: Rik van Riel Acked-by: David Rientjes Cc: Johannes Weiner Cc: Vlastimil Babka Cc: Jan Kara Cc: Michal Hocko Cc: Hugh Dickins Cc: Dave Hansen Cc: Theodore Ts'o Cc: "Paul E. 
McKenney" Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 09345ab..8f785b15 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1916,11 +1916,10 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) static struct page * get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, struct zonelist *zonelist, int high_zoneidx, int alloc_flags, - struct zone *preferred_zone, int migratetype) + struct zone *preferred_zone, int classzone_idx, int migratetype) { struct zoneref *z; struct page *page = NULL; - int classzone_idx; struct zone *zone; nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */ int zlc_active = 0; /* set if using zonelist_cache */ @@ -1928,7 +1927,6 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, bool consider_zone_dirty = (alloc_flags & ALLOC_WMARK_LOW) && (gfp_mask & __GFP_WRITE); - classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* * Scan zonelist, looking for a zone with enough free. 
@@ -2186,7 +2184,7 @@ static inline struct page * __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; @@ -2204,7 +2202,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, ALLOC_WMARK_HIGH|ALLOC_CPUSET, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto out; @@ -2239,7 +2237,7 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, enum migrate_mode mode, + int classzone_idx, int migratetype, enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { @@ -2267,7 +2265,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { preferred_zone->compact_blockskip_flush = false; compaction_defer_reset(preferred_zone, order, true); @@ -2299,7 +2297,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, enum migrate_mode mode, bool *contended_compaction, + int classzone_idx, int migratetype, + enum migrate_mode mode, bool *contended_compaction, bool *deferred_compaction, unsigned long *did_some_progress) { return NULL; @@ -2339,7 +2338,7 @@ static inline struct page * __alloc_pages_direct_reclaim(gfp_t gfp_mask, 
unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int classzone_idx, int migratetype, unsigned long *did_some_progress) { struct page *page = NULL; bool drained = false; @@ -2357,7 +2356,8 @@ retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, + migratetype); /* * If an allocation failed after direct reclaim, it could be because @@ -2380,14 +2380,14 @@ static inline struct page * __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { struct page *page; do { page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); @@ -2488,7 +2488,7 @@ static inline struct page * __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, struct zone *preferred_zone, - int migratetype) + int classzone_idx, int migratetype) { const gfp_t wait = gfp_mask & __GFP_WAIT; struct page *page = NULL; @@ -2537,15 +2537,18 @@ restart: * Find the true preferred zone if the allocation is unconstrained by * cpusets. 
*/ - if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) - first_zones_zonelist(zonelist, high_zoneidx, NULL, - &preferred_zone); + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) { + struct zoneref *preferred_zoneref; + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, + NULL, &preferred_zone); + classzone_idx = zonelist_zone_idx(preferred_zoneref); + } rebalance: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) goto got_pg; @@ -2560,7 +2563,7 @@ rebalance: page = __alloc_pages_high_priority(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (page) { goto got_pg; } @@ -2591,7 +2594,8 @@ rebalance: */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, - preferred_zone, migratetype, + preferred_zone, + classzone_idx, migratetype, migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); @@ -2621,7 +2625,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + classzone_idx, migratetype, + &did_some_progress); if (page) goto got_pg; @@ -2640,7 +2645,7 @@ rebalance: page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, - migratetype); + classzone_idx, migratetype); if (page) goto got_pg; @@ -2681,7 +2686,8 @@ rebalance: */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, - preferred_zone, migratetype, + preferred_zone, + classzone_idx, migratetype, migration_mode, &contended_compaction, &deferred_compaction, &did_some_progress); @@ -2708,10 +2714,12 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, { enum zone_type 
high_zoneidx = gfp_zone(gfp_mask); struct zone *preferred_zone; + struct zoneref *preferred_zoneref; struct page *page = NULL; int migratetype = allocflags_to_migratetype(gfp_mask); unsigned int cpuset_mems_cookie; int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR; + int classzone_idx; gfp_mask &= gfp_allowed_mask; @@ -2734,11 +2742,12 @@ retry_cpuset: cpuset_mems_cookie = read_mems_allowed_begin(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, + preferred_zoneref = first_zones_zonelist(zonelist, high_zoneidx, nodemask ? : &cpuset_current_mems_allowed, &preferred_zone); if (!preferred_zone) goto out; + classzone_idx = zonelist_zone_idx(preferred_zoneref); #ifdef CONFIG_CMA if (allocflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE) @@ -2748,7 +2757,7 @@ retry: /* First allocation attempt */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order, zonelist, high_zoneidx, alloc_flags, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); if (unlikely(!page)) { /* * The first pass makes sure allocations are spread @@ -2774,7 +2783,7 @@ retry: gfp_mask = memalloc_noio_flags(gfp_mask); page = __alloc_pages_slowpath(gfp_mask, order, zonelist, high_zoneidx, nodemask, - preferred_zone, migratetype); + preferred_zone, classzone_idx, migratetype); } trace_mm_page_alloc(page, order, gfp_mask, migratetype); -- cgit v0.10.2 From 888cf2db475a256fb0cda042140f73d7881f81fe Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:34 -0700 Subject: mm: avoid unnecessary atomic operations during end_page_writeback() If a page is marked for immediate reclaim then it is moved to the tail of the LRU list. This occurs when the system is under enough memory pressure for pages under writeback to reach the end of the LRU but we test for this using atomic operations on every writeback. This patch uses an optimistic non-atomic test first. 
It'll miss some pages in rare cases but the consequences are not severe enough to warrant such a penalty. While the function does not dominate profiles during a simple dd test the cost of it is reduced. 73048 0.7428 vmlinux-3.15.0-rc5-mmotm-20140513 end_page_writeback 23740 0.2409 vmlinux-3.15.0-rc5-lessatomic end_page_writeback Signed-off-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/filemap.c b/mm/filemap.c index 0fcd792..7fadf1c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -753,8 +753,17 @@ EXPORT_SYMBOL(unlock_page); */ void end_page_writeback(struct page *page) { - if (TestClearPageReclaim(page)) + /* + * TestClearPageReclaim could be used here but it is an atomic + * operation and overkill in this particular case. Failing to + * shuffle a page marked for immediate reclaim is too mild to + * justify taking an atomic operation penalty at the end of + * ever page writeback. + */ + if (PageReclaim(page)) { + ClearPageReclaim(page); rotate_reclaimable_page(page); + } if (!test_clear_page_writeback(page)) BUG(); -- cgit v0.10.2 From 6edd6cc66201e06a6cc34030462217e7f4d72f4f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:10:35 -0700 Subject: mm/memory-failure.c: move comment The comment about pages under writeback is far from the relevant code, so let's move it to the right place. Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3cd1b65..a047468 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1132,11 +1132,6 @@ int memory_failure(unsigned long pfn, int trapno, int flags) } } - /* - * Lock the page and wait for writeback to finish. - * It's very difficult to mess with pages currently under IO - * and in many cases impossible, so we just avoid it here. 
- */ lock_page(hpage); /* @@ -1186,6 +1181,10 @@ int memory_failure(unsigned long pfn, int trapno, int flags) if (PageHuge(p)) set_page_hwpoison_huge_page(hpage); + /* + * It's very difficult to mess with pages currently under IO + * and in many cases impossible, so we just avoid it here. + */ wait_on_page_writeback(p); /* -- cgit v0.10.2 From 8f34af6f93aee88291cec53ae8dff4989e58fbbd Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:10:36 -0700 Subject: mm, hugetlb: move the error handle logic out of normal code path alloc_huge_page() now mixes normal code path with error handle logic. This patch moves the error handle logic out, to make the normal code path cleaner and reduce code duplication. Signed-off-by: Jianyu Zhan Acked-by: Davidlohr Bueso Reviewed-by: Michal Hocko Reviewed-by: Aneesh Kumar K.V Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 98f0bc1..2441942 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1386,24 +1386,17 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return ERR_PTR(-ENOSPC); ret = hugetlb_cgroup_charge_cgroup(idx, pages_per_huge_page(h), &h_cg); - if (ret) { - if (chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); - return ERR_PTR(-ENOSPC); - } + if (ret) + goto out_subpool_put; + spin_lock(&hugetlb_lock); page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, chg); if (!page) { spin_unlock(&hugetlb_lock); page = alloc_buddy_huge_page(h, NUMA_NO_NODE); - if (!page) { - hugetlb_cgroup_uncharge_cgroup(idx, - pages_per_huge_page(h), - h_cg); - if (chg || avoid_reserve) - hugepage_subpool_put_pages(spool, 1); - return ERR_PTR(-ENOSPC); - } + if (!page) + goto out_uncharge_cgroup; + spin_lock(&hugetlb_lock); list_move(&page->lru, &h->hugepage_activelist); /* Fall through */ @@ -1415,6 +1408,13 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, vma_commit_reservation(h, vma, addr); return page; +
+out_uncharge_cgroup: + hugetlb_cgroup_uncharge_cgroup(idx, pages_per_huge_page(h), h_cg); +out_subpool_put: + if (chg || avoid_reserve) + hugepage_subpool_put_pages(spool, 1); + return ERR_PTR(-ENOSPC); } /* -- cgit v0.10.2 From b7596fb43aa786fb3ee5015a73034fbb9e80feaa Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 4 Jun 2014 16:10:37 -0700 Subject: include/linux/gfp.h: exclude duplicate header mmdebug.h is included twice. Signed-off-by: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 454c99f..6eb1fb3 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -6,7 +6,6 @@ #include #include #include -#include struct vm_area_struct; -- cgit v0.10.2 From 4be89a34609659042ef0bf883ad76388fb5251bb Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:10:38 -0700 Subject: mm/vmscan.c: use DIV_ROUND_UP for calculation of zone's balance_gap and correct comments. Currently, we use (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / KSWAPD_ZONE_BALANCE_GAP_RATIO to avoid a zero gap value. It's better to use DIV_ROUND_UP macro for neater code and clear meaning. Besides, the gap value is calculated against the per-zone "managed pages", not "present pages". This patch also corrects the comment and do some rephrasing. Signed-off-by: Jianyu Zhan Acked-by: Rik van Riel Acked-by: Rafael Aquini Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/swap.h b/include/linux/swap.h index 4348d95..4bdbee8 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -166,10 +166,10 @@ enum { #define COMPACT_CLUSTER_MAX SWAP_CLUSTER_MAX /* - * Ratio between the present memory in the zone and the "gap" that - * we're allowing kswapd to shrink in addition to the per-zone high - * wmark, even for zones that already have the high wmark satisfied, - * in order to provide better per-zone lru behavior. 
We are ok to + * Ratio between zone->managed_pages and the "gap" that above the per-zone + * "high_wmark". While balancing nodes, We allow kswapd to shrink zones that + * do not meet the (high_wmark + gap) watermark, even which already met the + * high_wmark, in order to provide better per-zone lru behavior. We are ok to * spend not more than 1% of the memory for this zone balancing "gap". */ #define KSWAPD_ZONE_BALANCE_GAP_RATIO 100 diff --git a/mm/vmscan.c b/mm/vmscan.c index 494cd63..cc29fca 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2295,9 +2295,8 @@ static inline bool compaction_ready(struct zone *zone, struct scan_control *sc) * there is a buffer of free pages available to give compaction * a reasonable chance of completing and allocating the page */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); watermark = high_wmark_pages(zone) + balance_gap + (2UL << sc->order); watermark_ok = zone_watermark_ok_safe(zone, 0, watermark, 0, 0); @@ -2949,9 +2948,8 @@ static bool kswapd_shrink_zone(struct zone *zone, * high wmark plus a "gap" where the gap is either the low * watermark or 1% of the zone, whichever is smaller. 
*/ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); + balance_gap = min(low_wmark_pages(zone), DIV_ROUND_UP( + zone->managed_pages, KSWAPD_ZONE_BALANCE_GAP_RATIO)); /* * If there is no low memory pressure or the zone is balanced then no -- cgit v0.10.2 From 422b2448fc86bf678f5d398389e41e0f169541d5 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:10:39 -0700 Subject: fs/hugetlbfs/inode.c: add static to hugetlbfs_i_mmap_mutex_key hugetlbfs_i_mmap_mutex_key is only used in inode.c Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 2611824..b242759 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -477,7 +477,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb, * annotation because huge_pmd_share() does an allocation under * i_mmap_mutex. */ -struct lock_class_key hugetlbfs_i_mmap_mutex_key; +static struct lock_class_key hugetlbfs_i_mmap_mutex_key; static struct inode *hugetlbfs_get_inode(struct super_block *sb, struct inode *dir, -- cgit v0.10.2 From be1d2cf5e36b1de3c8e17428885dcad4ca929886 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:10:39 -0700 Subject: fs/hugetlbfs/inode.c: use static const for dentry_operations ...like other filesystems. 
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index b242759..cc81d25 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -938,7 +938,7 @@ static int get_hstate_idx(int page_size_log) return h - hstates; } -static struct dentry_operations anon_ops = { +static const struct dentry_operations anon_ops = { .d_dname = simple_dname }; -- cgit v0.10.2 From 6e6870d4fd19e25332e7d975604497c8568949d9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:10:40 -0700 Subject: fs/hugetlbfs/inode.c: remove null test before kfree Fix checkpatch warning: WARNING: kfree(NULL) is safe this check is probably not required Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index cc81d25..1e2872b 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -901,8 +901,7 @@ hugetlbfs_fill_super(struct super_block *sb, void *data, int silent) goto out_free; return 0; out_free: - if (sbinfo->spool) - kfree(sbinfo->spool); + kfree(sbinfo->spool); kfree(sbinfo); return -ENOMEM; } -- cgit v0.10.2 From be9765722e6b7ece8263cbab857490332339bd6f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Wed, 4 Jun 2014 16:10:41 -0700 Subject: mm, compaction: properly signal and act upon lock and need_sched() contention Compaction uses compact_checklock_irqsave() function to periodically check for lock contention and need_resched() to either abort async compaction, or to free the lock, schedule and retake the lock. When aborting, cc->contended is set to signal the contended state to the caller. Two problems have been identified in this mechanism. First, compaction also calls directly cond_resched() in both scanners when no lock is yet taken. This call neither aborts async compaction nor sets cc->contended appropriately.
This patch introduces a new compact_should_abort() function to achieve both. In isolate_freepages(), the check frequency is reduced to once per SWAP_CLUSTER_MAX pageblocks to match what the migration scanner does in the preliminary page checks. In case a pageblock is found suitable for calling isolate_freepages_block(), the checks within it are done at a higher frequency. Second, isolate_freepages() does not check if isolate_freepages_block() aborted due to contention, and advances to the next pageblock. This violates the principle of aborting on contention, and might result in pageblocks not being scanned completely, since the scanning cursor is advanced. This problem was noticed in the code by Joonsoo Kim when reviewing related patches. This patch makes isolate_freepages() check the cc->contended flag after isolate_freepages_block() returns, and abort. In case isolate_freepages() has already isolated some pages before aborting due to contention, page migration will proceed, which is OK since we do not want to waste the work that has been done, and page migration has its own checks for contention. However, we do not want another isolation attempt by either of the scanners, so a cc->contended flag check is also added to compaction_alloc() and compact_finished() to make sure compaction is aborted right after the migration. The outcome of the patch should be reduced lock contention by async compaction and lower latencies for higher-order allocations where direct compaction is involved.
[akpm@linux-foundation.org: fix typo in comment] Reported-by: Joonsoo Kim Signed-off-by: Vlastimil Babka Reviewed-by: Naoya Horiguchi Cc: Minchan Kim Cc: Mel Gorman Cc: Bartlomiej Zolnierkiewicz Cc: Michal Nazarewicz Cc: Christoph Lameter Cc: Rik van Riel Acked-by: Michal Nazarewicz Tested-by: Shawn Guo Tested-by: Kevin Hilman Tested-by: Stephen Warren Tested-by: Fabio Estevam Cc: David Rientjes Cc: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 5844122..21bf292b 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -222,6 +222,30 @@ static bool compact_checklock_irqsave(spinlock_t *lock, unsigned long *flags, return true; } +/* + * Aside from avoiding lock contention, compaction also periodically checks + * need_resched() and either schedules in sync compaction or aborts async + * compaction. This is similar to what compact_checklock_irqsave() does, but + * is used where no lock is concerned. + * + * Returns false when no scheduling was needed, or sync compaction scheduled. + * Returns true when async compaction should abort. 
+ */ +static inline bool compact_should_abort(struct compact_control *cc) +{ + /* async compaction aborts if contended */ + if (need_resched()) { + if (cc->mode == MIGRATE_ASYNC) { + cc->contended = true; + return true; + } + + cond_resched(); + } + + return false; +} + /* Returns true if the page is within a block suitable for migration to */ static bool suitable_migration_target(struct page *page) { @@ -494,11 +518,8 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, return 0; } - if (cond_resched()) { - /* Async terminates prematurely on need_resched() */ - if (cc->mode == MIGRATE_ASYNC) - return 0; - } + if (compact_should_abort(cc)) + return 0; /* Time to isolate some pages for migration */ for (; low_pfn < end_pfn; low_pfn++) { @@ -720,9 +741,11 @@ static void isolate_freepages(struct zone *zone, /* * This can iterate a massively long zone without finding any * suitable migration targets, so periodically check if we need - * to schedule. + * to schedule, or even abort async compaction. */ - cond_resched(); + if (!(block_start_pfn % (SWAP_CLUSTER_MAX * pageblock_nr_pages)) + && compact_should_abort(cc)) + break; if (!pfn_valid(block_start_pfn)) continue; @@ -760,6 +783,13 @@ static void isolate_freepages(struct zone *zone, */ if (isolated) cc->finished_update_free = true; + + /* + * isolate_freepages_block() might have aborted due to async + * compaction being contended + */ + if (cc->contended) + break; } /* split_free_page does not map the pages */ @@ -786,9 +816,13 @@ static struct page *compaction_alloc(struct page *migratepage, struct compact_control *cc = (struct compact_control *)data; struct page *freepage; - /* Isolate free pages if necessary */ + /* + * Isolate free pages if necessary, and if we are not aborting due to + * contention. 
+ */ if (list_empty(&cc->freepages)) { - isolate_freepages(cc->zone, cc); + if (!cc->contended) + isolate_freepages(cc->zone, cc); if (list_empty(&cc->freepages)) return NULL; @@ -858,7 +892,7 @@ static int compact_finished(struct zone *zone, unsigned int order; unsigned long watermark; - if (fatal_signal_pending(current)) + if (cc->contended || fatal_signal_pending(current)) return COMPACT_PARTIAL; /* Compaction run completes if the migrate and free scanner meet */ diff --git a/mm/internal.h b/mm/internal.h index 802c3a4..7f22a11f 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -144,7 +144,10 @@ struct compact_control { int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - bool contended; /* True if a lock was contended */ + bool contended; /* True if a lock was contended, or + * need_resched() true during async + * compaction + */ }; unsigned long -- cgit v0.10.2 From 65eb71823b01051ca6e256e9cc8259141a849052 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Wed, 4 Jun 2014 16:10:43 -0700 Subject: hwpoison: remove unused global variable in do_machine_check() Remove an unused global variable mce_entry and relative operations in do_machine_check(). Signed-off-by: Chen Yucong Cc: Naoya Horiguchi Cc: Wu Fengguang Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 6e4ce2d..958b90f 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -176,8 +176,6 @@ int mce_available(struct cpuinfo_x86 *c); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); -extern atomic_t mce_entry; - typedef DECLARE_BITMAP(mce_banks_t, MAX_NR_BANKS); DECLARE_PER_CPU(mce_banks_t, mce_poll_banks); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 6cc8003..bb92f38 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -60,8 +60,6 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define SPINUNIT 100 /* 100ns */ -atomic_t mce_entry; - DEFINE_PER_CPU(unsigned, mce_exception_count); struct mce_bank *mce_banks __read_mostly; @@ -1040,8 +1038,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) DECLARE_BITMAP(valid_banks, MAX_NR_BANKS); char *msg = "Unknown"; - atomic_inc(&mce_entry); - this_cpu_inc(mce_exception_count); if (!cfg->banks) @@ -1171,7 +1167,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) mce_report_event(regs); mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); out: - atomic_dec(&mce_entry); sync_core(); } EXPORT_SYMBOL_GPL(do_machine_check); -- cgit v0.10.2 From 7fc34a62ca4434a79c68e23e70ed26111b7a4cf8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 4 Jun 2014 16:10:44 -0700 Subject: mm/msync.c: sync only the requested range in msync() msync() currently syncs more than POSIX requires or BSD or Solaris implement. It is supposed to be equivalent to fdatasync(), not fsync(), and it is only supposed to sync the portion of the file that overlaps the range passed to msync. If the VMA is non-linear, fall back to syncing the entire file, but we still optimise to only fdatasync() the entire file, not the full fsync(). 
akpm: there are obvious concerns with backward-compatibility: is anyone relying on the undocumented side-effect for their data integrity? And how would they ever know if this change broke their data integrity? We think the risk is reasonably low, and this patch brings the kernel into line with other OS's and with what the manpage has always said... Signed-off-by: Matthew Wilcox Reviewed-by: Christoph Hellwig Acked-by: Jeff Moyer Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/msync.c b/mm/msync.c index 632df45..a5c6736 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -58,6 +58,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) vma = find_vma(mm, start); for (;;) { struct file *file; + loff_t fstart, fend; /* Still start < end. */ error = -ENOMEM; @@ -77,12 +78,17 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) goto out_unlock; } file = vma->vm_file; + fstart = start + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + fend = fstart + (min(end, vma->vm_end) - start) - 1; start = vma->vm_end; if ((flags & MS_SYNC) && file && (vma->vm_flags & VM_SHARED)) { get_file(file); up_read(&mm->mmap_sem); - error = vfs_fsync(file, 0); + if (vma->vm_flags & VM_NONLINEAR) + error = vfs_fsync(file, 1); + else + error = vfs_fsync_range(file, fstart, fend, 1); fput(file); if (error || start >= end) goto out; -- cgit v0.10.2 From 850e9c69ca75f32aa9361a0edec6cad388a231b0 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:45 -0700 Subject: mm: fix typo in comment in do_fault_around() Signed-off-by: Kirill A.
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index 7049d39..e7ccbac 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2832,7 +2832,7 @@ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, /* * max_pgoff is either end of page table or end of vma - * or fault_around_pages() from pgoff, depending what is neast. + * or fault_around_pages() from pgoff, depending what is nearest. */ max_pgoff = pgoff - ((start_addr >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; -- cgit v0.10.2 From 28f2cd4f6da24a1aa06c226618ed5ad69e13df64 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Jun 2014 16:10:46 -0700 Subject: fs/superblock: unregister sb shrinker before ->kill_sb() This series is aimed at regressions noticed during reclaim activity. The first two patches are shrinker patches that were posted ages ago but never merged for reasons that are unclear to me. I'm posting them again to see if there was a reason they were dropped or if they just got lost. Dave? Time? The last patch adjusts proportional reclaim. Yuanhan Liu, can you retest the vm scalability test cases on a larger machine? Hugh, does this work for you on the memcg test cases? Based on ext4, I get the following results but unfortunately my larger test machines are all unavailable so this is based on a relatively small machine. 
postmark 3.15.0-rc5 3.15.0-rc5 vanilla proportion-v1r4 Ops/sec Transactions 21.00 ( 0.00%) 25.00 ( 19.05%) Ops/sec FilesCreate 39.00 ( 0.00%) 45.00 ( 15.38%) Ops/sec CreateTransact 10.00 ( 0.00%) 12.00 ( 20.00%) Ops/sec FilesDeleted 6202.00 ( 0.00%) 6202.00 ( 0.00%) Ops/sec DeleteTransact 11.00 ( 0.00%) 12.00 ( 9.09%) Ops/sec DataRead/MB 25.97 ( 0.00%) 30.02 ( 15.59%) Ops/sec DataWrite/MB 49.99 ( 0.00%) 57.78 ( 15.58%) ffsb (mail server simulator) 3.15.0-rc5 3.15.0-rc5 vanilla proportion-v1r4 Ops/sec readall 9402.63 ( 0.00%) 9805.74 ( 4.29%) Ops/sec create 4695.45 ( 0.00%) 4781.39 ( 1.83%) Ops/sec delete 173.72 ( 0.00%) 177.23 ( 2.02%) Ops/sec Transactions 14271.80 ( 0.00%) 14764.37 ( 3.45%) Ops/sec Read 37.00 ( 0.00%) 38.50 ( 4.05%) Ops/sec Write 18.20 ( 0.00%) 18.50 ( 1.65%) dd of a large file 3.15.0-rc5 3.15.0-rc5 vanilla proportion-v1r4 WallTime DownloadTar 75.00 ( 0.00%) 61.00 ( 18.67%) WallTime DD 423.00 ( 0.00%) 401.00 ( 5.20%) WallTime Delete 2.00 ( 0.00%) 5.00 (-150.00%) stutter (times mmap latency during large amounts of IO) 3.15.0-rc5 3.15.0-rc5 vanilla proportion-v1r4 Unit >5ms Delays 80252.0000 ( 0.00%) 81523.0000 ( -1.58%) Unit Mmap min 8.2118 ( 0.00%) 8.3206 ( -1.33%) Unit Mmap mean 17.4614 ( 0.00%) 17.2868 ( 1.00%) Unit Mmap stddev 24.9059 ( 0.00%) 34.6771 (-39.23%) Unit Mmap max 2811.6433 ( 0.00%) 2645.1398 ( 5.92%) Unit Mmap 90% 20.5098 ( 0.00%) 18.3105 ( 10.72%) Unit Mmap 93% 22.9180 ( 0.00%) 20.1751 ( 11.97%) Unit Mmap 95% 25.2114 ( 0.00%) 22.4988 ( 10.76%) Unit Mmap 99% 46.1430 ( 0.00%) 43.5952 ( 5.52%) Unit Ideal Tput 85.2623 ( 0.00%) 78.8906 ( 7.47%) Unit Tput min 44.0666 ( 0.00%) 43.9609 ( 0.24%) Unit Tput mean 45.5646 ( 0.00%) 45.2009 ( 0.80%) Unit Tput stddev 0.9318 ( 0.00%) 1.1084 (-18.95%) Unit Tput max 46.7375 ( 0.00%) 46.7539 ( -0.04%) This patch (of 3): We will like to unregister the sb shrinker before ->kill_sb(). This will allow cached objects to be counted without call to grab_super_passive() to update ref count on sb. 
We want to avoid locking during memory reclamation especially when we are skipping the memory reclaim when we are out of cached objects. This is safe because grab_super_passive does a try-lock on the sb->s_umount now, and so if we are in the unmount process, it won't ever block. That means what used to be a deadlock and races we were avoiding by using grab_super_passive() is now: shrinker umount down_read(shrinker_rwsem) down_write(sb->s_umount) shrinker_unregister down_write(shrinker_rwsem) grab_super_passive(sb) down_read_trylock(sb->s_umount) .... up_read(shrinker_rwsem) up_write(shrinker_rwsem) ->kill_sb() .... So it is safe to deregister the shrinker before ->kill_sb(). Signed-off-by: Tim Chen Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Hugh Dickins Cc: Dave Chinner Tested-by: Yuanhan Liu Cc: Bob Liu Cc: Jan Kara Acked-by: Rik van Riel Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/super.c b/fs/super.c index 48377f7..a852b1a 100644 --- a/fs/super.c +++ b/fs/super.c @@ -276,10 +276,8 @@ void deactivate_locked_super(struct super_block *s) struct file_system_type *fs = s->s_type; if (atomic_dec_and_test(&s->s_active)) { cleancache_invalidate_fs(s); - fs->kill_sb(s); - - /* caches are now gone, we can safely kill the shrinker now */ unregister_shrinker(&s->s_shrink); + fs->kill_sb(s); put_filesystem(fs); put_super(s); -- cgit v0.10.2 From d23da150a37c9fe3cc83dbaf71b3e37fd434ed52 Mon Sep 17 00:00:00 2001 From: Tim Chen Date: Wed, 4 Jun 2014 16:10:47 -0700 Subject: fs/superblock: avoid locking counting inodes and dentries before reclaiming them We remove the call to grab_super_passive in call to super_cache_count. This becomes a scalability bottleneck as multiple threads are trying to do memory reclamation, e.g. when we are doing large amount of file read and page cache is under pressure. The cached objects quickly got reclaimed down to 0 and we are aborting the cache_scan() reclaim. 
But counting creates a log jam acquiring the sb_lock. We are holding the shrinker_rwsem which ensures the safety of call to list_lru_count_node() and s_op->nr_cached_objects. The shrinker is unregistered now before ->kill_sb() so the operation is safe when we are doing unmount. The impact will depend heavily on the machine and the workload but for a small machine using postmark tuned to use 4xRAM size the results were 3.15.0-rc5 3.15.0-rc5 vanilla shrinker-v1r1 Ops/sec Transactions 21.00 ( 0.00%) 24.00 ( 14.29%) Ops/sec FilesCreate 39.00 ( 0.00%) 44.00 ( 12.82%) Ops/sec CreateTransact 10.00 ( 0.00%) 12.00 ( 20.00%) Ops/sec FilesDeleted 6202.00 ( 0.00%) 6202.00 ( 0.00%) Ops/sec DeleteTransact 11.00 ( 0.00%) 12.00 ( 9.09%) Ops/sec DataRead/MB 25.97 ( 0.00%) 29.10 ( 12.05%) Ops/sec DataWrite/MB 49.99 ( 0.00%) 56.02 ( 12.06%) ffsb running in a configuration that is meant to simulate a mail server showed 3.15.0-rc5 3.15.0-rc5 vanilla shrinker-v1r1 Ops/sec readall 9402.63 ( 0.00%) 9567.97 ( 1.76%) Ops/sec create 4695.45 ( 0.00%) 4735.00 ( 0.84%) Ops/sec delete 173.72 ( 0.00%) 179.83 ( 3.52%) Ops/sec Transactions 14271.80 ( 0.00%) 14482.81 ( 1.48%) Ops/sec Read 37.00 ( 0.00%) 37.60 ( 1.62%) Ops/sec Write 18.20 ( 0.00%) 18.30 ( 0.55%) Signed-off-by: Tim Chen Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Hugh Dickins Cc: Dave Chinner Tested-by: Yuanhan Liu Cc: Bob Liu Cc: Jan Kara Acked-by: Rik van Riel Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/super.c b/fs/super.c index a852b1a..d20d5b1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -112,9 +112,14 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); - if (!grab_super_passive(sb)) - return 0; - + /* + * Don't call grab_super_passive as it is a potential + * scalability bottleneck. The counts could get updated + * between super_cache_count and super_cache_scan anyway. 
+ * Call to super_cache_count with shrinker_rwsem held + * ensures the safety of call to list_lru_count_node() and + * s_op->nr_cached_objects(). + */ if (sb->s_op && sb->s_op->nr_cached_objects) total_objects = sb->s_op->nr_cached_objects(sb, sc->nid); @@ -125,7 +130,6 @@ static unsigned long super_cache_count(struct shrinker *shrink, sc->nid); total_objects = vfs_pressure_ratio(total_objects); - drop_super(sb); return total_objects; } -- cgit v0.10.2 From 1a501907bbea8e6ebb0b16cf6db9e9cbf1d2c813 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 4 Jun 2014 16:10:49 -0700 Subject: mm: vmscan: use proportional scanning during direct reclaim and full scan at DEF_PRIORITY Commit "mm: vmscan: obey proportional scanning requirements for kswapd" ensured that file/anon lists were scanned proportionally for reclaim from kswapd but ignored it for direct reclaim. The intent was to minimise direct reclaim latency but Yuanhan Liu pointed out that it substitutes one long stall for many small stalls and distorts aging for normal workloads like streaming readers/writers. Hugh Dickins pointed out that a side-effect of the same commit was that when one LRU list dropped to zero that the entirety of the other list was shrunk leading to excessive reclaim in memcgs. This patch scans the file/anon lists proportionally for direct reclaim to similarly age pages whether reclaimed by kswapd or direct reclaim but takes care to abort reclaim if one LRU drops to zero after reclaiming the requested number of pages.
Based on ext4 and using the Intel VM scalability test 3.15.0-rc5 3.15.0-rc5 shrinker proportion Unit lru-file-readonce elapsed 5.3500 ( 0.00%) 5.4200 ( -1.31%) Unit lru-file-readonce time_range 0.2700 ( 0.00%) 0.1400 ( 48.15%) Unit lru-file-readonce time_stddv 0.1148 ( 0.00%) 0.0536 ( 53.33%) Unit lru-file-readtwice elapsed 8.1700 ( 0.00%) 8.1700 ( 0.00%) Unit lru-file-readtwice time_range 0.4300 ( 0.00%) 0.2300 ( 46.51%) Unit lru-file-readtwice time_stddv 0.1650 ( 0.00%) 0.0971 ( 41.16%) The test cases are running multiple dd instances reading sparse files. The results are within the noise for the small test machine. The impact of the patch is more noticeable from the vmstats 3.15.0-rc5 3.15.0-rc5 shrinker proportion Minor Faults 35154 36784 Major Faults 611 1305 Swap Ins 394 1651 Swap Outs 4394 5891 Allocation stalls 118616 44781 Direct pages scanned 4935171 4602313 Kswapd pages scanned 15921292 16258483 Kswapd pages reclaimed 15913301 16248305 Direct pages reclaimed 4933368 4601133 Kswapd efficiency 99% 99% Kswapd velocity 670088.047 682555.961 Direct efficiency 99% 99% Direct velocity 207709.217 193212.133 Percentage direct scans 23% 22% Page writes by reclaim 4858.000 6232.000 Page writes file 464 341 Page writes anon 4394 5891 Note that there are fewer allocation stalls even though the amount of direct reclaim scanning is very approximately the same.
Signed-off-by: Mel Gorman Cc: Johannes Weiner Cc: Hugh Dickins Cc: Tim Chen Cc: Dave Chinner Tested-by: Yuanhan Liu Cc: Bob Liu Cc: Jan Kara Cc: Rik van Riel Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmscan.c b/mm/vmscan.c index cc29fca..9149444 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2064,13 +2064,27 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; - bool scan_adjusted = false; + bool scan_adjusted; get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ memcpy(targets, nr, sizeof(nr)); + /* + * Global reclaiming within direct reclaim at DEF_PRIORITY is a normal + * event that can occur when there is little memory pressure e.g. + * multiple streaming readers/writers. Hence, we do not abort scanning + * when the requested number of pages are reclaimed when scanning at + * DEF_PRIORITY on the assumption that the fact we are direct + * reclaiming implies that kswapd is not keeping up and it is best to + * do a batch of work at once. For memcg reclaim one check is made to + * abort proportional reclaim if either the file or anon lru has already + * dropped to zero at the first pass. + */ + scan_adjusted = (global_reclaim(sc) && !current_is_kswapd() && + sc->priority == DEF_PRIORITY); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { @@ -2091,17 +2105,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) continue; /* - * For global direct reclaim, reclaim only the number of pages - * requested. Less care is taken to scan proportionally as it - * is more important to minimise direct reclaim stall latency - * than it is to properly age the LRU lists. 
- */ - if (global_reclaim(sc) && !current_is_kswapd()) - break; - - /* * For kswapd and memcg, reclaim at least the number of pages - * requested. Ensure that the anon and file LRUs shrink + * requested. Ensure that the anon and file LRUs are scanned * proportionally what was requested by get_scan_count(). We * stop reclaiming one LRU and reduce the amount scanning * proportional to the original scan target. @@ -2109,6 +2114,15 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + /* + * It's just vindictive to attack the larger once the smaller + * has gone to zero. And given the way we stop scanning the + * smaller below, this makes sure that we only make one nudge + * towards proportionality once we've got nr_to_reclaim. + */ + if (!nr_file || !nr_anon) + break; + if (nr_file > nr_anon) { unsigned long scan_target = targets[LRU_INACTIVE_ANON] + targets[LRU_ACTIVE_ANON] + 1; -- cgit v0.10.2 From 226b4ccdcb6371645c25ec99b59bfde65987318c Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 4 Jun 2014 16:10:50 -0700 Subject: mm/process_vm_access: move config option into init/Kconfig CONFIG_CROSS_MEMORY_ATTACH adds a couple of syscalls: process_vm_readv and process_vm_writev; it's a kind of IPC for copying data between processes. Currently this option is placed inside "Processor type and features". This patch moves it into "General setup" (where all other arch-independent syscalls and ipc features are placed) and changes the prompt string to something less cryptic.
Signed-off-by: Konstantin Khlebnikov Cc: Christopher Yeoh Cc: Davidlohr Bueso Cc: Hugh Dickins Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/Kconfig b/init/Kconfig index 0a2f09a..ce034ad 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -261,6 +261,16 @@ config POSIX_MQUEUE_SYSCTL depends on SYSCTL default y +config CROSS_MEMORY_ATTACH + bool "Enable process_vm_readv/writev syscalls" + depends on MMU + default y + help + Enabling this option adds the system calls process_vm_readv and + process_vm_writev which allow a process with the correct privileges + to directly read from or write to to another process's address space. + See the man page for more details. + config FHANDLE bool "open by fhandle syscalls" select EXPORTFS diff --git a/mm/Kconfig b/mm/Kconfig index 75ac479..0f00bff 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -436,16 +436,6 @@ choice benefit. endchoice -config CROSS_MEMORY_ATTACH - bool "Cross Memory Support" - depends on MMU - default y - help - Enabling this option adds the system calls process_vm_readv and - process_vm_writev which allow a process with the correct privileges - to directly read from or write to to another process's address space. - See the man page for more details. - # # UP and nommu archs use km based percpu allocator # -- cgit v0.10.2 From 3d92860f979f725a9c10c2fc26c0415a4332adbf Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 4 Jun 2014 16:10:51 -0700 Subject: mm/rmap.c: don't call mmu_notifier_invalidate_page() during munlock In its munlock mode, try_to_unmap_one() searches other mlocked vmas; it never unmaps pages. There is no reason for invalidation because ptes are left unchanged.
Signed-off-by: Konstantin Khlebnikov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/rmap.c b/mm/rmap.c index e375ce4..ab74290 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1252,7 +1252,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL) + if (ret != SWAP_FAIL && TTU_ACTION(flags) != TTU_MUNLOCK) mmu_notifier_invalidate_page(mm, address); out: return ret; -- cgit v0.10.2 From daa5ba768b9e15da8867824d2f1e8d455f1acac2 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 4 Jun 2014 16:10:52 -0700 Subject: mm/rmap.c: cleanup ttu_flags Transform the action part of ttu_flags into individual bits. These flags aren't part of any user-space visible api or even trace events. Signed-off-by: Konstantin Khlebnikov Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 9be55c7..be57450 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -72,10 +72,9 @@ struct anon_vma_chain { }; enum ttu_flags { - TTU_UNMAP = 0, /* unmap mode */ - TTU_MIGRATION = 1, /* migration mode */ - TTU_MUNLOCK = 2, /* munlock mode */ - TTU_ACTION_MASK = 0xff, + TTU_UNMAP = 1, /* unmap mode */ + TTU_MIGRATION = 2, /* migration mode */ + TTU_MUNLOCK = 4, /* munlock mode */ TTU_IGNORE_MLOCK = (1 << 8), /* ignore mlock */ TTU_IGNORE_ACCESS = (1 << 9), /* don't age */ diff --git a/mm/rmap.c b/mm/rmap.c index ab74290..ea8e20d 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1162,7 +1162,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, if (vma->vm_flags & VM_LOCKED) goto out_mlock; - if (TTU_ACTION(flags) == TTU_MUNLOCK) + if (flags & TTU_MUNLOCK) goto out_unmap; } if (!(flags & TTU_IGNORE_ACCESS)) { @@ -1230,7 +1230,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * pte.
do_swap_page() will wait until the migration * pte is removed and then restart fault handling. */ - BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); + BUG_ON(!(flags & TTU_MIGRATION)); entry = make_migration_entry(page, pte_write(pteval)); } swp_pte = swp_entry_to_pte(entry); @@ -1239,7 +1239,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && - (TTU_ACTION(flags) == TTU_MIGRATION)) { + (flags & TTU_MIGRATION)) { /* Establish migration entry for a file page */ swp_entry_t entry; entry = make_migration_entry(page, pte_write(pteval)); @@ -1252,7 +1252,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, out_unmap: pte_unmap_unlock(pte, ptl); - if (ret != SWAP_FAIL && TTU_ACTION(flags) != TTU_MUNLOCK) + if (ret != SWAP_FAIL && !(flags & TTU_MUNLOCK)) mmu_notifier_invalidate_page(mm, address); out: return ret; @@ -1539,7 +1539,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) * locking requirements of exec(), migration skips * temporary VMAs until after exec() completes. */ - if (flags & TTU_MIGRATION && !PageKsm(page) && PageAnon(page)) + if ((flags & TTU_MIGRATION) && !PageKsm(page) && PageAnon(page)) rwc.invalid_vma = invalid_migration_vma; ret = rmap_walk(page, &rwc); -- cgit v0.10.2 From 7d018176e6d50510b142bccbd60d8c6ed5e72e56 Mon Sep 17 00:00:00 2001 From: Zhang Zhen Date: Wed, 4 Jun 2014 16:10:53 -0700 Subject: mm/page_alloc.c: cleanup add_active_range() related comments add_active_range() has been replaced by memblock_set_node(). Clean up the comments to comply with that change.
Signed-off-by: Zhang Zhen Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8f785b15..a59bdb6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4387,9 +4387,6 @@ int __meminit init_currently_empty_zone(struct zone *zone, #ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID /* * Required by SPARSEMEM. Given a PFN, return what node the PFN is on. - * Architectures may implement their own version but if add_active_range() - * was used and there are no special requirements, this is a convenient - * alternative */ int __meminit __early_pfn_to_nid(unsigned long pfn) { @@ -4444,10 +4441,9 @@ bool __meminit early_pfn_in_nid(unsigned long pfn, int node) * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed. * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * this function may be used instead of calling memblock_free_early_nid() - * manually. + * If an architecture guarantees that all ranges registered contain no holes + * and may be freed, this this function may be used instead of calling + * memblock_free_early_nid() manually. */ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) { @@ -4469,9 +4465,8 @@ void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn) * sparse_memory_present_with_active_regions - Call memory_present for each active range * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used. * - * If an architecture guarantees that all ranges registered with - * add_active_ranges() contain no holes and may be freed, this - * function may be used instead of calling memory_present() manually. 
+ * If an architecture guarantees that all ranges registered contain no holes and may + * be freed, this function may be used instead of calling memory_present() manually. */ void __init sparse_memory_present_with_active_regions(int nid) { @@ -4489,7 +4484,7 @@ void __init sparse_memory_present_with_active_regions(int nid) * @end_pfn: Passed by reference. On return, it will have the node end_pfn. * * It returns the start and end page frame of a node based on information - * provided by an arch calling add_active_range(). If called for a node + * provided by memblock_set_node(). If called for a node * with no available memory, a warning is printed and the start and end * PFNs will be 0. */ @@ -5066,7 +5061,7 @@ static unsigned long __init find_min_pfn_for_node(int nid) * find_min_pfn_with_active_regions - Find the minimum PFN registered * * It returns the minimum PFN based on information provided via - * add_active_range(). + * memblock_set_node(). */ unsigned long __init find_min_pfn_with_active_regions(void) { @@ -5287,7 +5282,7 @@ static void check_for_memory(pg_data_t *pgdat, int nid) * @max_zone_pfn: an array of max PFNs for each zone * * This will call free_area_init_node() for each active node in the system. - * Using the page ranges provided by add_active_range(), the size of each + * Using the page ranges provided by memblock_set_node(), the size of each * zone in each node and their holes is calculated. If the maximum PFN * between two adjacent zones match, it is assumed that the zone is empty. * For example, if arch_max_dma_pfn == arch_max_dma32_pfn, it is assumed -- cgit v0.10.2 From a9b0f8618d46ba027243b8ecb5c2468a7112d235 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:54 -0700 Subject: mm: nominate faultaround area in bytes rather than page order There is evidence that the faultaround feature is less relevant on architectures with page size bigger than 4k.
Which makes sense since page fault overhead per byte of mapped area should be less there. Let's rework the feature to specify faultaround area in bytes instead of page order. It's 64 kilobytes for now. The patch effectively disables faultaround on architectures with page size >= 64k (like ppc64). It's possible that some other size of faultaround area is relevant for a platform. We can expose `fault_around_bytes' variable to arch-specific code once such platforms will be found. Signed-off-by: Kirill A. Shutemov Cc: Rusty Russell Cc: Hugh Dickins Cc: Madhavan Srinivasan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Peter Zijlstra Cc: Ingo Molnar Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index e7ccbac..62a08a7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2758,63 +2758,47 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, update_mmu_cache(vma, address, pte); } -#define FAULT_AROUND_ORDER 4 +static unsigned long fault_around_bytes = 65536; + +static inline unsigned long fault_around_pages(void) +{ + return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; +} + +static inline unsigned long fault_around_mask(void) +{ + return ~(rounddown_pow_of_two(fault_around_bytes) - 1) & PAGE_MASK; +} -#ifdef CONFIG_DEBUG_FS -static unsigned int fault_around_order = FAULT_AROUND_ORDER; -static int fault_around_order_get(void *data, u64 *val) +#ifdef CONFIG_DEBUG_FS +static int fault_around_bytes_get(void *data, u64 *val) { - *val = fault_around_order; + *val = fault_around_bytes; return 0; } -static int fault_around_order_set(void *data, u64 val) +static int fault_around_bytes_set(void *data, u64 val) { - BUILD_BUG_ON((1UL << FAULT_AROUND_ORDER) > PTRS_PER_PTE); - if (1UL << val > PTRS_PER_PTE) + if (val / PAGE_SIZE > PTRS_PER_PTE) return -EINVAL; - fault_around_order = val; + fault_around_bytes = val; return 0; } 
-DEFINE_SIMPLE_ATTRIBUTE(fault_around_order_fops, - fault_around_order_get, fault_around_order_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fault_around_bytes_fops, + fault_around_bytes_get, fault_around_bytes_set, "%llu\n"); static int __init fault_around_debugfs(void) { void *ret; - ret = debugfs_create_file("fault_around_order", 0644, NULL, NULL, - &fault_around_order_fops); + ret = debugfs_create_file("fault_around_bytes", 0644, NULL, NULL, + &fault_around_bytes_fops); if (!ret) - pr_warn("Failed to create fault_around_order in debugfs"); + pr_warn("Failed to create fault_around_bytes in debugfs"); return 0; } late_initcall(fault_around_debugfs); - -static inline unsigned long fault_around_pages(void) -{ - return 1UL << fault_around_order; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~((1UL << (PAGE_SHIFT + fault_around_order)) - 1); -} -#else -static inline unsigned long fault_around_pages(void) -{ - unsigned long nr_pages; - - nr_pages = 1UL << FAULT_AROUND_ORDER; - BUILD_BUG_ON(nr_pages > PTRS_PER_PTE); - return nr_pages; -} - -static inline unsigned long fault_around_mask(void) -{ - return ~((1UL << (PAGE_SHIFT + FAULT_AROUND_ORDER)) - 1); -} #endif static void do_fault_around(struct vm_area_struct *vma, unsigned long address, @@ -2871,7 +2855,7 @@ static int do_read_fault(struct mm_struct *mm, struct vm_area_struct *vma, * if page by the offset is not ready to be mapped (cold cache or * something). */ - if (vma->vm_ops->map_pages) { + if (vma->vm_ops->map_pages && fault_around_pages() > 1) { pte = pte_offset_map_lock(mm, pmd, address, &ptl); do_fault_around(vma, address, pte, pgoff, flags); if (!pte_same(*pte, orig_pte)) -- cgit v0.10.2 From 1fdb412bd825998efbced3a16f6ce7e0329728cf Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:10:55 -0700 Subject: mm: document do_fault_around() feature Some clarification on how faultaround works. [akpm@linux-foundation.org: tweak comment text] Signed-off-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory.c b/mm/memory.c index 62a08a7..d67fd9f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2760,6 +2760,10 @@ void do_set_pte(struct vm_area_struct *vma, unsigned long address, static unsigned long fault_around_bytes = 65536; +/* + * fault_around_pages() and fault_around_mask() round down fault_around_bytes + * to nearest page order. It's what do_fault_around() expects to see. + */ static inline unsigned long fault_around_pages(void) { return rounddown_pow_of_two(fault_around_bytes) / PAGE_SIZE; @@ -2801,6 +2805,29 @@ static int __init fault_around_debugfs(void) late_initcall(fault_around_debugfs); #endif +/* + * do_fault_around() tries to map few pages around the fault address. The hope + * is that the pages will be needed soon and this will lower the number of + * faults to handle. + * + * It uses vm_ops->map_pages() to map the pages, which skips the page if it's + * not ready to be mapped: not up-to-date, locked, etc. + * + * This function is called with the page table lock taken. In the split ptlock + * case the page table lock only protects only those entries which belong to + * the page table corresponding to the fault address. + * + * This function doesn't cross the VMA boundaries, in order to call map_pages() + * only once. + * + * fault_around_pages() defines how many pages we'll try to map. + * do_fault_around() expects it to return a power of two less than or equal to + * PTRS_PER_PTE. + * + * The virtual address of the area that we map is naturally aligned to the + * fault_around_pages() value (and therefore to page order). This way it's + * easier to guarantee that we don't cross page table boundaries. 
+ */ static void do_fault_around(struct vm_area_struct *vma, unsigned long address, pte_t *pte, pgoff_t pgoff, unsigned int flags) { -- cgit v0.10.2 From 100873d7a777b67ad35197c5a998b5e778f8bf3f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:10:56 -0700 Subject: hugetlb: rename hugepage_migration_support() to ..._supported() We already have a function named hugepages_supported(), and the similar name hugepage_migration_support() is a bit uncomfortable, so let's rename it hugepage_migration_supported(). Signed-off-by: Naoya Horiguchi Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 35786ee..255cd5c 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -397,7 +397,7 @@ static inline pgoff_t basepage_index(struct page *page) extern void dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn); -static inline int hugepage_migration_support(struct hstate *h) +static inline int hugepage_migration_supported(struct hstate *h) { #ifdef CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION return huge_page_shift(h) == PMD_SHIFT; @@ -453,7 +453,7 @@ static inline pgoff_t basepage_index(struct page *page) return page->index; } #define dissolve_free_huge_pages(s, e) do {} while (0) -#define hugepage_migration_support(h) 0 +#define hugepage_migration_supported(h) 0 static inline spinlock_t *huge_pte_lockptr(struct hstate *h, struct mm_struct *mm, pte_t *pte) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2441942..226910c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -544,7 +544,7 @@ static struct page *dequeue_huge_page_node(struct hstate *h, int nid) /* Movability of hugepages depends on migration support.
*/ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_support(h)) + if (hugepages_treat_as_movable || hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; diff --git a/mm/migrate.c b/mm/migrate.c index 2a45967..63f0cd5 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1039,7 +1039,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, * tables or check whether the hugepage is pmd-based or not before * kicking migration. */ - if (!hugepage_migration_support(page_hstate(hpage))) { + if (!hugepage_migration_supported(page_hstate(hpage))) { putback_active_hugepage(hpage); return -ENOSYS; } -- cgit v0.10.2 From 50088c440910730baf3248acfad2c846fb3eea77 Mon Sep 17 00:00:00 2001 From: Chen Yucong Date: Wed, 4 Jun 2014 16:10:57 -0700 Subject: mm/swapfile.c: delete the "last_in_cluster < scan_base" loop in the body of scan_swap_map() Via commit ebc2a1a69111 ("swap: make cluster allocation per-cpu"), we can find that all SWP_SOLIDSTATE "seek is cheap"(SSD case) has already gone to si->cluster_info scan_swap_map_try_ssd_cluster() route. So that the "last_in_cluster < scan_base" loop in the body of scan_swap_map() has already become a dead code snippet, and it should have been deleted. This patch is to delete the redundant loop as Hugh and Shaohua suggested. [hughd@google.com: fix comment, simplify code] Signed-off-by: Chen Yucong Cc: Shaohua Li Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/swapfile.c b/mm/swapfile.c index beeeef8..4c524f7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -523,13 +523,10 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, /* * If seek is expensive, start searching for new cluster from * start of partition, to minimize the span of allocated swap. 
- * But if seek is cheap, search from our current position, so - * that swap is allocated from all over the partition: if the - * Flash Translation Layer only remaps within limited zones, - * we don't want to wear out the first zone too quickly. + * If seek is cheap, that is the SWP_SOLIDSTATE si->cluster_info + * case, just handled by scan_swap_map_try_ssd_cluster() above. */ - if (!(si->flags & SWP_SOLIDSTATE)) - scan_base = offset = si->lowest_bit; + scan_base = offset = si->lowest_bit; last_in_cluster = offset + SWAPFILE_CLUSTER - 1; /* Locate the first empty (unaligned) cluster */ @@ -549,26 +546,6 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } } - offset = si->lowest_bit; - last_in_cluster = offset + SWAPFILE_CLUSTER - 1; - - /* Locate the first empty (unaligned) cluster */ - for (; last_in_cluster < scan_base; offset++) { - if (si->swap_map[offset]) - last_in_cluster = offset + SWAPFILE_CLUSTER; - else if (offset == last_in_cluster) { - spin_lock(&si->lock); - offset -= SWAPFILE_CLUSTER - 1; - si->cluster_next = offset; - si->cluster_nr = SWAPFILE_CLUSTER - 1; - goto checks; - } - if (unlikely(--latency_ration < 0)) { - cond_resched(); - latency_ration = LATENCY_LIMIT; - } - } - offset = scan_base; spin_lock(&si->lock); si->cluster_nr = SWAPFILE_CLUSTER - 1; -- cgit v0.10.2 From d2f3102838d90ed6ed09a6154bdb2306f7cf1548 Mon Sep 17 00:00:00 2001 From: Jianyu Zhan Date: Wed, 4 Jun 2014 16:10:58 -0700 Subject: mm/page-writeback.c: remove outdated comment There is an orphaned prehistoric comment , which used to be against get_dirty_limits(), the dawn of global_dirtyable_memory(). Back then, the implementation of get_dirty_limits() is complicated and full of magic numbers, so this comment is necessary. But we now use the clear and neat global_dirtyable_memory(), which renders this comment ambiguous and useless. Remove it. 
Signed-off-by: Jianyu Zhan Acked-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b9b8e82..533fa60 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -156,24 +156,6 @@ static unsigned long writeout_period_time = 0; #define VM_COMPLETIONS_PERIOD_LEN (3*HZ) /* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -/* * In a memory zone, there is a certain amount of pages we consider * available for the page cache, which is essentially the number of * free and reclaimable pages, minus some zone reserves to protect -- cgit v0.10.2 From a70ffcac741d31a406c1d2b832ae43d658e7e1cf Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 4 Jun 2014 16:10:59 -0700 Subject: mm/memory-failure.c-failure: send right signal code to correct thread When a thread in a multi-threaded application hits a machine check because of an uncorrectable error in memory - we want to send the SIGBUS with si.si_code = BUS_MCEERR_AR to that thread. Currently we fail to do that if the active thread is not the primary thread in the process. collect_procs() just finds primary threads and this test: if ((flags & MF_ACTION_REQUIRED) && t == current) { will see that the thread we found isn't the current thread and so send a si.si_code = BUS_MCEERR_AO to the primary (and nothing to the active thread at this time). 
We can fix this by checking whether "current" shares the same mm with the process that collect_procs() said owned the page. If so, we send the SIGBUS to current (with code BUS_MCEERR_AR). Signed-off-by: Tony Luck Signed-off-by: Naoya Horiguchi Reported-by: Otto Bruggeman Cc: Andi Kleen Cc: Borislav Petkov Cc: Chen Gong Cc: [3.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory-failure.c b/mm/memory-failure.c index a047468..89ad452 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -204,9 +204,9 @@ static int kill_proc(struct task_struct *t, unsigned long addr, int trapno, #endif si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; - if ((flags & MF_ACTION_REQUIRED) && t == current) { + if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) { si.si_code = BUS_MCEERR_AR; - ret = force_sig_info(SIGBUS, &si, t); + ret = force_sig_info(SIGBUS, &si, current); } else { /* * Don't use force here, it's convenient if the signal -- cgit v0.10.2 From 74614de17db6fb472370c426d4f934d8d616edf2 Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 4 Jun 2014 16:11:01 -0700 Subject: mm/memory-failure.c: don't let collect_procs() skip over processes for MF_ACTION_REQUIRED When Linux sees an "action optional" machine check (where h/w has reported an error that is not in the current execution path) we generally do not want to signal a process, since most processes do not have a SIGBUS handler - we'd just prematurely terminate the process for a problem that they might never actually see. task_early_kill() decides whether to consider a process - and it checks whether this specific process has been marked for early signals with "prctl", or if the system administrator has requested early signals for all processes using /proc/sys/vm/memory_failure_early_kill. But for MF_ACTION_REQUIRED case we must not defer. The error is in the execution path of the current thread so we must send the SIGBUS immediately.
Fix by passing a flag argument through collect_procs*() to task_early_kill() so it knows whether we can defer or must take action. Signed-off-by: Tony Luck Signed-off-by: Naoya Horiguchi Cc: Andi Kleen Cc: Borislav Petkov Cc: Chen Gong Cc: [3.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 89ad452..ed339c5 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -380,10 +380,12 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk) +static int task_early_kill(struct task_struct *tsk, int force_early) { if (!tsk->mm) return 0; + if (force_early) + return 1; if (tsk->flags & PF_MCE_PROCESS) return !!(tsk->flags & PF_MCE_EARLY); return sysctl_memory_failure_early_kill; @@ -393,7 +395,7 @@ static int task_early_kill(struct task_struct *tsk) * Collect processes when the error hit an anonymous page. */ static void collect_procs_anon(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -409,7 +411,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, for_each_process (tsk) { struct anon_vma_chain *vmac; - if (!task_early_kill(tsk)) + if (!task_early_kill(tsk, force_early)) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { @@ -428,7 +430,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, * Collect processes when the error hit a file mapped page. 
*/ static void collect_procs_file(struct page *page, struct list_head *to_kill, - struct to_kill **tkc) + struct to_kill **tkc, int force_early) { struct vm_area_struct *vma; struct task_struct *tsk; @@ -439,7 +441,7 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - if (!task_early_kill(tsk)) + if (!task_early_kill(tsk, force_early)) continue; vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, @@ -465,7 +467,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * First preallocate one tokill structure outside the spin locks, * so that we can kill at least one process reasonably reliable. */ -static void collect_procs(struct page *page, struct list_head *tokill) +static void collect_procs(struct page *page, struct list_head *tokill, + int force_early) { struct to_kill *tk; @@ -476,9 +479,9 @@ static void collect_procs(struct page *page, struct list_head *tokill) if (!tk) return; if (PageAnon(page)) - collect_procs_anon(page, tokill, &tk); + collect_procs_anon(page, tokill, &tk, force_early); else - collect_procs_file(page, tokill, &tk); + collect_procs_file(page, tokill, &tk, force_early); kfree(tk); } @@ -963,7 +966,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, * there's nothing that can be done. */ if (kill) - collect_procs(ppage, &tokill); + collect_procs(ppage, &tokill, flags & MF_ACTION_REQUIRED); ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) -- cgit v0.10.2 From 3ba08129e38437561df44c36b7ea9081185d5333 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 4 Jun 2014 16:11:02 -0700 Subject: mm/memory-failure.c: support use of a dedicated thread to handle SIGBUS(BUS_MCEERR_AO) Currently memory error handler handles action optional errors in the deferred manner by default. 
And if a recovery aware application wants to handle it immediately, it can do it by setting PF_MCE_EARLY flag. However, such a signal can be sent only to the main thread, so it's problematic if the application wants to have a dedicated thread to handle such signals. So this patch adds dedicated thread support to memory error handler. We have PF_MCE_EARLY flags for each thread separately, so with this patch AO signal is sent to the thread with PF_MCE_EARLY flag set, not the main thread. If you want to implement a dedicated thread, you call prctl() to set PF_MCE_EARLY on the thread. Memory error handler collects processes to be killed, so this patch lets it check PF_MCE_EARLY flag on each thread in the collecting routines. No behavioral change for all non-early kill cases. Tony said: : The old behavior was crazy - someone with a multithreaded process might : well expect that if they call prctl(PF_MCE_EARLY) in just one thread, then : that thread would see the SIGBUS with si_code = BUS_MCEERR_AO - even if : that thread wasn't the main thread for the process. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Naoya Horiguchi Reviewed-by: Tony Luck Cc: Kamil Iskra Cc: Andi Kleen Cc: Borislav Petkov Cc: Chen Gong Cc: [3.2+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/vm/hwpoison.txt b/Documentation/vm/hwpoison.txt index 5500684..6ae89a9 100644 --- a/Documentation/vm/hwpoison.txt +++ b/Documentation/vm/hwpoison.txt @@ -84,6 +84,11 @@ PR_MCE_KILL PR_MCE_KILL_EARLY: Early kill PR_MCE_KILL_LATE: Late kill PR_MCE_KILL_DEFAULT: Use system global default + Note that if you want to have a dedicated thread which handles + the SIGBUS(BUS_MCEERR_AO) on behalf of the process, you should + call prctl(PR_MCE_KILL_EARLY) on the designated thread. Otherwise, + the SIGBUS is sent to the main thread.
+ PR_MCE_KILL_GET return current mode diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ed339c5..cd8989c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -380,15 +380,44 @@ static void kill_procs(struct list_head *to_kill, int forcekill, int trapno, } } -static int task_early_kill(struct task_struct *tsk, int force_early) +/* + * Find a dedicated thread which is supposed to handle SIGBUS(BUS_MCEERR_AO) + * on behalf of the thread group. Return task_struct of the (first found) + * dedicated thread if found, and return NULL otherwise. + * + * We already hold read_lock(&tasklist_lock) in the caller, so we don't + * have to call rcu_read_lock/unlock() in this function. + */ +static struct task_struct *find_early_kill_thread(struct task_struct *tsk) +{ + struct task_struct *t; + + for_each_thread(tsk, t) + if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY)) + return t; + return NULL; +} + +/* + * Determine whether a given process is "early kill" process which expects + * to be signaled when some page under the process is hwpoisoned. + * Return task_struct of the dedicated thread (main thread unless explicitly + * specified) if the process is "early kill," and otherwise returns NULL. 
+ */ +static struct task_struct *task_early_kill(struct task_struct *tsk, + int force_early) { + struct task_struct *t; if (!tsk->mm) - return 0; + return NULL; if (force_early) - return 1; - if (tsk->flags & PF_MCE_PROCESS) - return !!(tsk->flags & PF_MCE_EARLY); - return sysctl_memory_failure_early_kill; + return tsk; + t = find_early_kill_thread(tsk); + if (t) + return t; + if (sysctl_memory_failure_early_kill) + return tsk; + return NULL; } /* @@ -410,16 +439,17 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, read_lock(&tasklist_lock); for_each_process (tsk) { struct anon_vma_chain *vmac; + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk, force_early)) + if (!t) continue; anon_vma_interval_tree_foreach(vmac, &av->rb_root, pgoff, pgoff) { vma = vmac->vma; if (!page_mapped_in_vma(page, vma)) continue; - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); @@ -440,10 +470,10 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, read_lock(&tasklist_lock); for_each_process(tsk) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + struct task_struct *t = task_early_kill(tsk, force_early); - if (!task_early_kill(tsk, force_early)) + if (!t) continue; - vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { /* @@ -453,8 +483,8 @@ static void collect_procs_file(struct page *page, struct list_head *to_kill, * Assume applications who requested early kill want * to be informed of all such data corruptions. 
*/ - if (vma->vm_mm == tsk->mm) - add_to_kill(tsk, page, vma, to_kill, tkc); + if (vma->vm_mm == t->mm) + add_to_kill(t, page, vma, to_kill, tkc); } } read_unlock(&tasklist_lock); -- cgit v0.10.2 From 4a0da71b96b9d4080c0820e9e7d02470ebe62dc6 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 4 Jun 2014 16:11:03 -0700 Subject: Documentation/sysctl/vm.txt: clarify vfs_cache_pressure description Existing description is worded in a way which almost encourages setting of vfs_cache_pressure above 100, possibly way above it. Users are left in the dark about what this numeric value is - an int? a percentage? what the scale is? As a result, we are getting reports about noticeable performance degradation from users who have set vfs_cache_pressure to ridiculously high values - because they thought there is no downside to it. Via code inspection it's obvious that this value is treated as a percentage. This patch changes text to reflect this fact, and adds a cautionary paragraph advising against setting vfs_cache_pressure sky high. Signed-off-by: Denys Vlasenko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 5b6da0f..bd4b34c 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -746,8 +746,8 @@ Changing this takes effect whenever an application requests memory. vfs_cache_pressure ------------------ -Controls the tendency of the kernel to reclaim the memory which is used for -caching of directory and inode objects. +This percentage value controls the tendency of the kernel to reclaim +the memory which is used for caching of directory and inode objects. At the default value of vfs_cache_pressure=100 the kernel will attempt to reclaim dentries and inodes at a "fair" rate with respect to pagecache and @@ -757,6 +757,11 @@ never reclaim dentries and inodes due to memory pressure and this can easily lead to out-of-memory conditions.
Increasing vfs_cache_pressure beyond 100 causes the kernel to prefer to reclaim dentries and inodes. +Increasing vfs_cache_pressure significantly beyond 100 may have negative +performance impact. Reclaim code needs to take various locks to find freeable +directory and inode objects. With vfs_cache_pressure=1000, it will look for +ten times more freeable objects than there are. + ============================================================== zone_reclaim_mode: -- cgit v0.10.2 From 2a7a0e0fdc49a08740a69d51ef44ef09763072b0 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 4 Jun 2014 16:11:04 -0700 Subject: mm, memcg: periodically schedule when emptying page list mem_cgroup_force_empty_list() can iterate a large number of pages on an lru and mem_cgroup_move_parent() doesn't return an errno unless certain criteria, none of which indicate that the iteration may be taking too long, is met. We have encountered the following stack trace many times indicating "need_resched set for > 51000020 ns (51 ticks) without schedule", for example: scheduler_tick() mem_cgroup_move_account+0x4d/0x1d5 mem_cgroup_move_parent+0x8d/0x109 mem_cgroup_reparent_charges+0x149/0x2ba mem_cgroup_css_offline+0xeb/0x11b cgroup_offline_fn+0x68/0x16b process_one_work+0x129/0x350 If this iteration is taking too long, we still need to do cond_resched() even when an individual page is not busy. [rientjes@google.com: changelog] Signed-off-by: Hugh Dickins Signed-off-by: David Rientjes Acked-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d176edb..a500cb0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4675,9 +4675,9 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, if (mem_cgroup_move_parent(page, pc, memcg)) { /* found lock contention or "pc" is obsolete. 
*/ busy = page; - cond_resched(); } else busy = NULL; + cond_resched(); } while (!list_empty(list)); } -- cgit v0.10.2 From 38515c73398a4c58059ecf1087e844561b58ee0f Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 4 Jun 2014 16:11:06 -0700 Subject: zram: correct offset usage in zram_bio_discard We want to skip the physical block(PAGE_SIZE) which is partially covered by the discard bio, so we check the remaining size and subtract it if there is a need to goto the next physical block. The current offset usage in zram_bio_discard is incorrect, it will cause its upper filesystem breakdown. Consider the following scenario: On some architecture or config, PAGE_SIZE is 64K for example, filesystem is set up on zram disk without PAGE_SIZE aligned, a discard bio leads to a offset = 4K and size=72K, normally, it should not really discard any physical block as it partially cover two physical blocks. However, with the current offset usage, it will discard the second physical block and free its memory, which will cause filesystem breakdown. This patch corrects the offset usage in zram_bio_discard. Signed-off-by: Weijie Yang Cc: Minchan Kim Cc: Nitin Gupta Acked-by: Joonsoo Kim Cc: Sergey Senozhatsky Cc: Bob Liu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 9849b52..48eccb3 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -572,10 +572,10 @@ static void zram_bio_discard(struct zram *zram, u32 index, * skipping this logical block is appropriate here. 
*/ if (offset) { - if (n < offset) + if (n <= (PAGE_SIZE - offset)) return; - n -= offset; + n -= (PAGE_SIZE - offset); index++; } -- cgit v0.10.2 From 50417c55562c03e6746b13aee650c2bbb048fea3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:07 -0700 Subject: mm/zbud.c: make size unsigned like unique callsite zbud_alloc is only called by zswap_frontswap_store with unsigned int len. Change function parameter + update >= 0 check. Signed-off-by: Fabian Frederick Acked-by: Seth Jennings Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/zbud.h b/include/linux/zbud.h index 2571a5c..13af0d4 100644 --- a/include/linux/zbud.h +++ b/include/linux/zbud.h @@ -11,7 +11,7 @@ struct zbud_ops { struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops); void zbud_destroy_pool(struct zbud_pool *pool); -int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, unsigned long *handle); void zbud_free(struct zbud_pool *pool, unsigned long handle); int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries); diff --git a/mm/zbud.c b/mm/zbud.c index 9451361..01df13a 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -247,7 +247,7 @@ void zbud_destroy_pool(struct zbud_pool *pool) * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate * a new page. 
*/ -int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, +int zbud_alloc(struct zbud_pool *pool, unsigned int size, gfp_t gfp, unsigned long *handle) { int chunks, i, freechunks; @@ -255,7 +255,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, enum buddy bud; struct page *page; - if (size <= 0 || gfp & __GFP_HIGHMEM) + if (!size || (gfp & __GFP_HIGHMEM)) return -EINVAL; if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) return -ENOSPC; -- cgit v0.10.2 From 7eb52512a977854eca51d9b692c2f3be8a0e5eeb Mon Sep 17 00:00:00 2001 From: Weijie Yang Date: Wed, 4 Jun 2014 16:11:08 -0700 Subject: zsmalloc: fixup trivial zs size classes value in comments According to calculation, ZS_SIZE_CLASSES value is 255 on systems with 4K page size, not 254. The old value may forget count the ZS_MIN_ALLOC_SIZE in. This patch fixes this trivial issue in the comments. Signed-off-by: Weijie Yang Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 5ae5d85..fe78189 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -141,7 +141,7 @@ #define ZS_MAX_ALLOC_SIZE PAGE_SIZE /* - * On systems with 4K page size, this gives 254 size classes! There is a + * On systems with 4K page size, this gives 255 size classes! There is a * trader-off here: * - Large number of size classes is potentially wasteful as free page are * spread across these classes -- cgit v0.10.2 From 93ef6d6ca11382eff03812797da457bc176653a4 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 4 Jun 2014 16:11:09 -0700 Subject: mm/vmalloc.c: export unmap_kernel_range() zsmalloc needs exported unmap_kernel_range for building as a module. See https://lkml.org/lkml/2013/1/18/487 I didn't send a patch to make unmap_kernel_range exportable at that time because zram was staging stuff and I thought VM function exporting for staging stuff makes no sense. Now zsmalloc was promoted. 
If we can't build zsmalloc as module, it means we can't build zram as module, either. Additionally, buddy map_vm_area is already exported so let's export unmap_kernel_range to help his buddy. Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 2ed75fb..f64632b 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1268,6 +1268,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) vunmap_page_range(addr, end); flush_tlb_kernel_range(addr, end); } +EXPORT_SYMBOL_GPL(unmap_kernel_range); int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) { -- cgit v0.10.2 From d867f203b974e9a670028fda909ef09044b221f6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 4 Jun 2014 16:11:10 -0700 Subject: mm/zsmalloc: make zsmalloc module-buildable Now, we can build zsmalloc as module because unmap_kernel_range was exported. Signed-off-by: Minchan Kim Cc: Nitin Gupta Cc: Sergey Senozhatsky Cc: Jerome Marchand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/Kconfig b/mm/Kconfig index 0f00bff..3e9977a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -551,7 +551,7 @@ config MEM_SOFT_DIRTY See Documentation/vm/soft-dirty.txt for more details. config ZSMALLOC - bool "Memory allocator for compressed pages" + tristate "Memory allocator for compressed pages" depends on MMU default n help -- cgit v0.10.2 From 72d09633c9afa02bea317d65eb8b8a4ce7659a2a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 Jun 2014 16:11:11 -0700 Subject: mm/zswap: NUMA aware allocation for zswap_dstmem zswap_dstmem is a percpu block of memory, which should be allocated using kmalloc_node(), to get better NUMA locality. Without it, all the blocks are allocated from a single node. 
Signed-off-by: Eric Dumazet Acked-by: Seth Jennings Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/zswap.c b/mm/zswap.c index aeaef0f..008388fe 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -347,7 +347,7 @@ static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) return NOTIFY_BAD; } *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; - dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu)); if (!dst) { pr_err("can't allocate compressor buffer\n"); crypto_free_comp(tfm); -- cgit v0.10.2 From f6187769dae48234f3877df3c4d99294cc2254fa Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:12 -0700 Subject: sys_sgetmask/sys_ssetmask: add CONFIG_SGETMASK_SYSCALL sys_sgetmask and sys_ssetmask are obsolete system calls no longer supported in libc. This patch replaces the architecture-related __ARCH_WANT_SYS_SGETMASK with an expert-mode configuration option. That option is enabled by default for those architectures. Signed-off-by: Fabian Frederick Cc: Steven Miao Cc: Mikael Starvik Cc: Jesper Nilsson Cc: David Howells Cc: Geert Uytterhoeven Cc: Michal Simek Cc: Ralf Baechle Cc: Koichi Yasutake Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: "David S. Miller" Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H.
Peter Anvin" Cc: Greg Ungerer Cc: Heiko Carstens Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/blackfin/include/asm/unistd.h b/arch/blackfin/include/asm/unistd.h index c35414b..c8c8ff9 100644 --- a/arch/blackfin/include/asm/unistd.h +++ b/arch/blackfin/include/asm/unistd.h @@ -12,7 +12,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_FADVISE64 #define __ARCH_WANT_SYS_GETPGRP diff --git a/arch/cris/include/asm/unistd.h b/arch/cris/include/asm/unistd.h index 5cc7d19..0f40fed 100644 --- a/arch/cris/include/asm/unistd.h +++ b/arch/cris/include/asm/unistd.h @@ -15,7 +15,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/frv/include/asm/unistd.h b/arch/frv/include/asm/unistd.h index 70ec729..17b5df8 100644 --- a/arch/frv/include/asm/unistd.h +++ b/arch/frv/include/asm/unistd.h @@ -13,7 +13,6 @@ /* #define __ARCH_WANT_SYS_GETHOSTNAME */ #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -/* #define __ARCH_WANT_SYS_SGETMASK */ /* #define __ARCH_WANT_SYS_SIGNAL */ #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/m68k/include/asm/unistd.h b/arch/m68k/include/asm/unistd.h index 33afa56..1fcdd34 100644 --- a/arch/m68k/include/asm/unistd.h +++ b/arch/m68k/include/asm/unistd.h @@ -13,7 +13,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/microblaze/include/asm/unistd.h b/arch/microblaze/include/asm/unistd.h index b14232b..fd56a8f 100644 --- a/arch/microblaze/include/asm/unistd.h +++ 
b/arch/microblaze/include/asm/unistd.h @@ -19,7 +19,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/mips/include/asm/unistd.h b/arch/mips/include/asm/unistd.h index 413d6c6..e558130 100644 --- a/arch/mips/include/asm/unistd.h +++ b/arch/mips/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_UTIME #define __ARCH_WANT_SYS_WAITPID #define __ARCH_WANT_SYS_SOCKETCALL diff --git a/arch/mn10300/include/asm/unistd.h b/arch/mn10300/include/asm/unistd.h index 9d4e2d1..0522468 100644 --- a/arch/mn10300/include/asm/unistd.h +++ b/arch/mn10300/include/asm/unistd.h @@ -26,7 +26,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/parisc/include/asm/unistd.h b/arch/parisc/include/asm/unistd.h index 74d8358..5f4c68d 100644 --- a/arch/parisc/include/asm/unistd.h +++ b/arch/parisc/include/asm/unistd.h @@ -145,7 +145,6 @@ type name(type1 arg1, type2 arg2, type3 arg3, type4 arg4, type5 arg5) \ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_COMPAT_SYS_TIME diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index 9b892bb..5ce5552 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -29,7 +29,6 @@ #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_IPC #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define 
__ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/sh/include/asm/unistd.h b/arch/sh/include/asm/unistd.h index e77816c..126fe83 100644 --- a/arch/sh/include/asm/unistd.h +++ b/arch/sh/include/asm/unistd.h @@ -11,7 +11,6 @@ # define __ARCH_WANT_SYS_GETHOSTNAME # define __ARCH_WANT_SYS_IPC # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_TIME # define __ARCH_WANT_SYS_UTIME diff --git a/arch/sparc/include/asm/unistd.h b/arch/sparc/include/asm/unistd.h index dfa53fd..0aac1e8 100644 --- a/arch/sparc/include/asm/unistd.h +++ b/arch/sparc/include/asm/unistd.h @@ -25,7 +25,6 @@ #define __ARCH_WANT_SYS_ALARM #define __ARCH_WANT_SYS_GETHOSTNAME #define __ARCH_WANT_SYS_PAUSE -#define __ARCH_WANT_SYS_SGETMASK #define __ARCH_WANT_SYS_SIGNAL #define __ARCH_WANT_SYS_TIME #define __ARCH_WANT_SYS_UTIME diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 3f556c6..2b19caa 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -41,7 +41,6 @@ # define __ARCH_WANT_SYS_OLD_GETRLIMIT # define __ARCH_WANT_SYS_OLD_UNAME # define __ARCH_WANT_SYS_PAUSE -# define __ARCH_WANT_SYS_SGETMASK # define __ARCH_WANT_SYS_SIGNAL # define __ARCH_WANT_SYS_SIGPENDING # define __ARCH_WANT_SYS_SIGPROCMASK diff --git a/init/Kconfig b/init/Kconfig index ce034ad..9d76b99 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1313,6 +1313,16 @@ config UID16 help This enables the legacy 16-bit UID syscall wrappers. +config SGETMASK_SYSCALL + bool "sgetmask/ssetmask syscalls support" if EXPERT + def_bool PARISC || MN10300 || BLACKFIN || M68K || PPC || MIPS || X86 || SPARC || CRIS || MICROBLAZE || SUPERH + ---help--- + sys_sgetmask and sys_ssetmask are obsolete system calls + no longer supported in libc but still enabled by default in some + architectures. + + If unsure, leave the default option here. 
+ config SYSFS_SYSCALL bool "Sysfs syscall support" if EXPERT default y diff --git a/kernel/signal.c b/kernel/signal.c index 6ea13c0..6e600aaa 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3496,7 +3496,7 @@ COMPAT_SYSCALL_DEFINE3(sigaction, int, sig, } #endif -#ifdef __ARCH_WANT_SYS_SGETMASK +#ifdef CONFIG_SGETMASK_SYSCALL /* * For backwards compatibility. Functionality superseded by sigprocmask. @@ -3517,7 +3517,7 @@ SYSCALL_DEFINE1(ssetmask, int, newmask) return old; } -#endif /* __ARCH_WANT_SGETMASK */ +#endif /* CONFIG_SGETMASK_SYSCALL */ #ifdef __ARCH_WANT_SYS_SIGNAL /* diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index bc8d1b7..36441b5 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -135,6 +135,8 @@ cond_syscall(sys_setresgid16); cond_syscall(sys_setresuid16); cond_syscall(sys_setreuid16); cond_syscall(sys_setuid16); +cond_syscall(sys_sgetmask); +cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); cond_syscall(sys_ipc); -- cgit v0.10.2 From e37dcbfbb2b2365870cbdd12f9d505c52941f300 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:14 -0700 Subject: fs/efivarfs/super.c: use static const for dentry_operations ...like other filesystems. 
Signed-off-by: Fabian Frederick Cc: Matthew Garrett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index becc725..0a48886 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -83,7 +83,7 @@ static int efivarfs_d_hash(const struct dentry *dentry, struct qstr *qstr) return 0; } -static struct dentry_operations efivarfs_d_ops = { +static const struct dentry_operations efivarfs_d_ops = { .d_compare = efivarfs_d_compare, .d_hash = efivarfs_d_hash, .d_delete = always_delete_dentry, -- cgit v0.10.2 From 00f01791e159ba7fb9d397c27d34377206157c6a Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:15 -0700 Subject: fs/exportfs/expfs.c: kernel-doc warning fixes Fixing 2 typo in function comments. Signed-off-by: Fabian Frederick Cc: Al Viro Cc: "J. Bruce Fields" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 48a359d..b01fbfb 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -259,7 +259,7 @@ static int filldir_one(void * __buf, const char * name, int len, /** * get_name - default export_operations->get_name function - * @dentry: the directory in which to find a name + * @path: the directory in which to find a name * @name: a pointer to a %NAME_MAX+1 char buffer to store the name * @child: the dentry for the child directory. 
* @@ -337,7 +337,7 @@ out: /** * export_encode_fh - default export_operations->encode_fh function * @inode: the object to encode - * @fh: where to store the file handle fragment + * @fid: where to store the file handle fragment * @max_len: maximum length to store there * @parent: parent directory inode, if wanted * -- cgit v0.10.2 From 2c0d259e0e580dd95dd5d2d5aa4926169228d4a0 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Wed, 4 Jun 2014 16:11:16 -0700 Subject: compiler.h: avoid sparse errors in __compiletime_error_fallback() Usually, BUG_ON and friends aren't even evaluated in sparse, but recently compiletime_assert_atomic_type() was added, and that now results in a sparse warning every time it is used. The reason turns out to be the temporary variable, after it sparse no longer considers the value to be a constant, and results in a warning and an error. The error is the more annoying part of this as it suppresses any further warnings in the same file, hiding other problems. Unfortunately the condition cannot be simply expanded out to avoid the temporary variable since it breaks compiletime_assert on old versions of GCC such as GCC 4.2.4 which the latest metag compiler is based on. Therefore #ifndef __CHECKER__ out the __compiletime_error_fallback which uses the potentially negative size array to trigger a conditional compiler error, so that sparse doesn't see it. Signed-off-by: James Hogan Cc: Johannes Berg Cc: Daniel Santos Cc: Luciano Coelho Cc: Peter Zijlstra Cc: Paul E. 
McKenney Acked-by: Johannes Berg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/compiler.h b/include/linux/compiler.h index ee7239e..64fdfe1 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -323,9 +323,18 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #endif #ifndef __compiletime_error # define __compiletime_error(message) -# define __compiletime_error_fallback(condition) \ +/* + * Sparse complains of variable sized arrays due to the temporary variable in + * __compiletime_assert. Unfortunately we can't just expand it out to make + * sparse see a constant array size without breaking compiletime_assert on old + * versions of GCC (e.g. 4.2.4), so hide the array from sparse altogether. + */ +# ifndef __CHECKER__ +# define __compiletime_error_fallback(condition) \ do { ((void)sizeof(char[1 - 2 * condition])); } while (0) -#else +# endif +#endif +#ifndef __compiletime_error_fallback # define __compiletime_error_fallback(condition) do { } while (0) #endif -- cgit v0.10.2 From 84117da5b79ffb4077bb05d64c86dfa4d746115c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:17 -0700 Subject: kernel/cpu.c: convert printk to pr_foo() no level printk converted to pr_warn (if err) no level printk converted to pr_info (disabling non-boot cpus) Other printk converted to respective level. Signed-off-by: Fabian Frederick Cc: "Rafael J. 
Wysocki" Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/cpu.c b/kernel/cpu.c index 247979a..acf791c 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -283,8 +283,7 @@ static inline void check_for_tasks(int cpu) task_cputime(p, &utime, &stime); if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (utime || stime)) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " - "(state = %ld, flags = %x)\n", + pr_warn("Task %s (pid = %d) is on cpu %d (state = %ld, flags = %x)\n", p->comm, task_pid_nr(p), cpu, p->state, p->flags); } @@ -336,8 +335,8 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (err) { nr_calls--; __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); goto out_release; } @@ -444,8 +443,8 @@ static int _cpu_up(unsigned int cpu, int tasks_frozen) ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); if (ret) { nr_calls--; - printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", - __func__, cpu); + pr_warn("%s: attempt to bring up CPU %u failed\n", + __func__, cpu); goto out_notify; } @@ -475,11 +474,10 @@ int cpu_up(unsigned int cpu) int err = 0; if (!cpu_possible(cpu)) { - printk(KERN_ERR "can't online cpu %d because it is not " - "configured as may-hotadd at boot time\n", cpu); + pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n", + cpu); #if defined(CONFIG_IA64) - printk(KERN_ERR "please check additional_cpus= boot " - "parameter\n"); + pr_err("please check additional_cpus= boot parameter\n"); #endif return -EINVAL; } @@ -518,7 +516,7 @@ int disable_nonboot_cpus(void) */ cpumask_clear(frozen_cpus); - printk("Disabling non-boot CPUs ...\n"); + pr_info("Disabling non-boot CPUs ...\n"); for_each_online_cpu(cpu) { if (cpu == first_cpu) continue; @@ -526,8 +524,7 @@ int 
disable_nonboot_cpus(void) if (!error) cpumask_set_cpu(cpu, frozen_cpus); else { - printk(KERN_ERR "Error taking CPU%d down: %d\n", - cpu, error); + pr_err("Error taking CPU%d down: %d\n", cpu, error); break; } } @@ -537,7 +534,7 @@ int disable_nonboot_cpus(void) /* Make sure the CPUs won't be enabled by someone else */ cpu_hotplug_disabled = 1; } else { - printk(KERN_ERR "Non-boot CPUs are not disabled\n"); + pr_err("Non-boot CPUs are not disabled\n"); } cpu_maps_update_done(); return error; @@ -561,17 +558,17 @@ void __ref enable_nonboot_cpus(void) if (cpumask_empty(frozen_cpus)) goto out; - printk(KERN_INFO "Enabling non-boot CPUs ...\n"); + pr_info("Enabling non-boot CPUs ...\n"); arch_enable_nonboot_cpus_begin(); for_each_cpu(cpu, frozen_cpus) { error = _cpu_up(cpu, 1); if (!error) { - printk(KERN_INFO "CPU%d is up\n", cpu); + pr_info("CPU%d is up\n", cpu); continue; } - printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error); + pr_warn("Error taking CPU%d up: %d\n", cpu, error); } arch_enable_nonboot_cpus_end(); -- cgit v0.10.2 From 462b29b8564c489e0aa3f5a3a505fd2776af5e55 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:18 -0700 Subject: kernel/backtracetest.c: replace no level printk by pr_info() Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c index a5e026b..1323360 100644 --- a/kernel/backtracetest.c +++ b/kernel/backtracetest.c @@ -19,8 +19,8 @@ static void backtrace_test_normal(void) { - printk("Testing a backtrace from process context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from process context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); dump_stack(); } @@ -37,8 +37,8 @@ static DECLARE_TASKLET(backtrace_tasklet, &backtrace_test_irq_callback, 0); static void backtrace_test_irq(void) { - printk("Testing a backtrace 
from irq context.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a backtrace from irq context.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); init_completion(&backtrace_work); tasklet_schedule(&backtrace_tasklet); @@ -51,8 +51,8 @@ static void backtrace_test_saved(void) struct stack_trace trace; unsigned long entries[8]; - printk("Testing a saved backtrace.\n"); - printk("The following trace is a kernel self test and not a bug!\n"); + pr_info("Testing a saved backtrace.\n"); + pr_info("The following trace is a kernel self test and not a bug!\n"); trace.nr_entries = 0; trace.max_entries = ARRAY_SIZE(entries); @@ -65,19 +65,19 @@ static void backtrace_test_saved(void) #else static void backtrace_test_saved(void) { - printk("Saved backtrace test skipped.\n"); + pr_info("Saved backtrace test skipped.\n"); } #endif static int backtrace_regression_test(void) { - printk("====[ backtrace testing ]===========\n"); + pr_info("====[ backtrace testing ]===========\n"); backtrace_test_normal(); backtrace_test_irq(); backtrace_test_saved(); - printk("====[ end of backtrace testing ]====\n"); + pr_info("====[ end of backtrace testing ]====\n"); return 0; } -- cgit v0.10.2 From a6c8c6902c53e620e607e83f520e9ae424e2a424 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:19 -0700 Subject: kernel/capability.c: code clean-up - EXPORT_SYMBOL - typo: unexpectidly->unexpectedly - function prototype over 80 characters Signed-off-by: Fabian Frederick Cc: Serge Hallyn Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/capability.c b/kernel/capability.c index a8d63df..84b2bbf 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -24,7 +24,6 @@ */ const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET; - EXPORT_SYMBOL(__cap_empty_set); int file_caps_enabled = 1; @@ -189,7 +188,7 @@ SYSCALL_DEFINE2(capget, cap_user_header_t, header, cap_user_data_t, dataptr) * 
* An alternative would be to return an error here * (-ERANGE), but that causes legacy applications to - * unexpectidly fail; the capget/modify/capset aborts + * unexpectedly fail; the capget/modify/capset aborts * before modification is attempted and the application * fails. */ @@ -395,7 +394,8 @@ EXPORT_SYMBOL(ns_capable); * This does not set PF_SUPERPRIV because the caller may not * actually be privileged. */ -bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +bool file_ns_capable(const struct file *file, struct user_namespace *ns, + int cap) { if (WARN_ON_ONCE(!cap_valid(cap))) return false; -- cgit v0.10.2 From b9e5db6d2bbe4416cd1c30c2d1891ef39d6bd0b7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:20 -0700 Subject: kernel/exec_domain.c: code clean-up Fix checkpatch warnings about EXPORT_SYMBOL and return() Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index 0dbeae3..83d4382 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -37,7 +37,7 @@ static unsigned long ident_map[32] = { struct exec_domain default_exec_domain = { .name = "Linux", /* name */ .handler = default_handler, /* lcall7 causes a seg fault. */ - .pers_low = 0, /* PER_LINUX personality. */ + .pers_low = 0, /* PER_LINUX personality. */ .pers_high = 0, /* PER_LINUX personality. */ .signal_map = ident_map, /* Identity map signals. */ .signal_invmap = ident_map, /* - both ways. 
*/ @@ -83,7 +83,7 @@ lookup_exec_domain(unsigned int personality) ep = &default_exec_domain; out: read_unlock(&exec_domains_lock); - return (ep); + return ep; } int @@ -110,8 +110,9 @@ register_exec_domain(struct exec_domain *ep) out: write_unlock(&exec_domains_lock); - return (err); + return err; } +EXPORT_SYMBOL(register_exec_domain); int unregister_exec_domain(struct exec_domain *ep) @@ -133,6 +134,7 @@ unregister: write_unlock(&exec_domains_lock); return 0; } +EXPORT_SYMBOL(unregister_exec_domain); int __set_personality(unsigned int personality) { @@ -144,6 +146,7 @@ int __set_personality(unsigned int personality) return 0; } +EXPORT_SYMBOL(__set_personality); #ifdef CONFIG_PROC_FS static int execdomains_proc_show(struct seq_file *m, void *v) @@ -188,8 +191,3 @@ SYSCALL_DEFINE1(personality, unsigned int, personality) return old; } - - -EXPORT_SYMBOL(register_exec_domain); -EXPORT_SYMBOL(unregister_exec_domain); -EXPORT_SYMBOL(__set_personality); -- cgit v0.10.2 From eaa1809b900c460a020bff1f4030f4f6a237b2b2 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:21 -0700 Subject: kernel/latencytop.c: convert seq_printf to seq_puts This patch also fixes one function declaration over 80 characters. 
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a462b31..a028127 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -88,7 +88,8 @@ static void clear_global_latency_tracing(void) } static void __sched -account_global_scheduler_latency(struct task_struct *tsk, struct latency_record *lat) +account_global_scheduler_latency(struct task_struct *tsk, + struct latency_record *lat) { int firstnonnull = MAXLR + 1; int i; @@ -255,7 +256,7 @@ static int lstats_show(struct seq_file *m, void *v) break; seq_printf(m, " %ps", (void *)bt); } - seq_printf(m, "\n"); + seq_puts(m, "\n"); } } return 0; -- cgit v0.10.2 From cf25004069d3ccd6aae607d8175bdff67c1dd319 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:22 -0700 Subject: kernel/stop_machine.c: kernel-doc warning fix Signed-off-by: Fabian Frederick Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 01fbae5..695f0c6 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -307,6 +307,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * @cpu: cpu to stop * @fn: function to execute * @arg: argument to @fn + * @work_buf: pointer to cpu_stop_work structure * * Similar to stop_one_cpu() but doesn't wait for completion. 
The * caller is responsible for ensuring @work_buf is currently unused -- cgit v0.10.2 From cac92ba74f19fd58a28976f753f9327f27cf1669 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:23 -0700 Subject: kernel/tracepoint.c: kernel-doc fixes Signed-off-by: Fabian Frederick Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 6620e58..33cbd8c 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -239,6 +239,7 @@ static int tracepoint_remove_func(struct tracepoint *tp, * tracepoint_probe_register - Connect a probe to a tracepoint * @tp: tracepoint * @probe: probe handler + * @data: tracepoint data * * Returns 0 if ok, error value on error. * Note: if @tp is within a module, the caller is responsible for @@ -264,6 +265,7 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register); * tracepoint_probe_unregister - Disconnect a probe from a tracepoint * @tp: tracepoint * @probe: probe function pointer + * @data: tracepoint data * * Returns 0 if ok, error value on error. 
*/ -- cgit v0.10.2 From 6c5a53c67057bddf7f8e26c93a8e045215f61539 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:24 -0700 Subject: kernel/res_counter.c: replace simple_strtoull by kstrtoull [akpm@linux-foundation.org: don't overwrite kstrtoull()'s errno] Signed-off-by: Fabian Frederick Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/res_counter.c b/kernel/res_counter.c index 51dbac6..e791130 100644 --- a/kernel/res_counter.c +++ b/kernel/res_counter.c @@ -186,8 +186,11 @@ int res_counter_memparse_write_strategy(const char *buf, /* return RES_COUNTER_MAX(unlimited) if "-1" is specified */ if (*buf == '-') { - res = simple_strtoull(buf + 1, &end, 10); - if (res != 1 || *end != '\0') + int rc = kstrtoull(buf + 1, 10, &res); + + if (rc) + return rc; + if (res != 1) return -EINVAL; *resp = RES_COUNTER_MAX; return 0; -- cgit v0.10.2 From 616feab753972b9751308f3cd2a68fc57eae8edb Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:25 -0700 Subject: kernel/reboot.c: convert simple_strtoul to kstrtoint Replace obsolete function. kstrtoint is used as reboot_cpu is an integer. 
Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/reboot.c b/kernel/reboot.c index 662c83f..a3a9e24 100644 --- a/kernel/reboot.c +++ b/kernel/reboot.c @@ -388,15 +388,22 @@ static int __init reboot_setup(char *str) break; case 's': - if (isdigit(*(str+1))) - reboot_cpu = simple_strtoul(str+1, NULL, 0); - else if (str[1] == 'm' && str[2] == 'p' && - isdigit(*(str+3))) - reboot_cpu = simple_strtoul(str+3, NULL, 0); - else + { + int rc; + + if (isdigit(*(str+1))) { + rc = kstrtoint(str+1, 0, &reboot_cpu); + if (rc) + return rc; + } else if (str[1] == 'm' && str[2] == 'p' && + isdigit(*(str+3))) { + rc = kstrtoint(str+3, 0, &reboot_cpu); + if (rc) + return rc; + } else reboot_mode = REBOOT_SOFT; break; - + } case 'g': reboot_mode = REBOOT_GPIO; break; -- cgit v0.10.2 From 95583e4ab5745218373add88ffddb70faff2d0c8 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:26 -0700 Subject: kernel/utsname_sysctl.c: replace obsolete __initcall by device_initcall Also fixes checkpatch warnings on proc_dostring function parameters Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c index 4f69f9a..6fbe811 100644 --- a/kernel/utsname_sysctl.c +++ b/kernel/utsname_sysctl.c @@ -51,7 +51,7 @@ static int proc_do_uts_string(ctl_table *table, int write, int r; memcpy(&uts_table, table, sizeof(uts_table)); uts_table.data = get_uts(table, write); - r = proc_dostring(&uts_table,write,buffer,lenp, ppos); + r = proc_dostring(&uts_table, write, buffer, lenp, ppos); put_uts(table, write, uts_table.data); if (write) @@ -135,4 +135,4 @@ static int __init utsname_sysctl_init(void) return 0; } -__initcall(utsname_sysctl_init); +device_initcall(utsname_sysctl_init); -- cgit v0.10.2 From b51dbec68c8732caac2495f558659556523e8322 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:26 -0700 
Subject: kernel/hung_task.c: convert simple_strtoul to kstrtouint sysctl_hung_task_panic has been changed to unsigned int. use kstrtouint instead of obsolete simple_strtoul Signed-off-by: Fabian Frederick Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 06bb141..06db124 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -52,8 +52,10 @@ unsigned int __read_mostly sysctl_hung_task_panic = static int __init hung_task_panic_setup(char *str) { - sysctl_hung_task_panic = simple_strtoul(str, NULL, 0); + int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); + if (rc) + return rc; return 1; } __setup("hung_task_panic=", hung_task_panic_setup); -- cgit v0.10.2 From b300a4ea665f7fa44f015616ac1874deca891c5e Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 4 Jun 2014 16:11:27 -0700 Subject: kernel/user.c: drop unused field 'files' from user_struct Nobody seems uses it for a long time. Let's drop it. Signed-off-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f2dd7d..611676f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -745,7 +745,6 @@ static inline int signal_group_exit(const struct signal_struct *sig) struct user_struct { atomic_t __count; /* reference count */ atomic_t processes; /* How many processes does this user have? */ - atomic_t files; /* How many open files does this user have? */ atomic_t sigpending; /* How many pending signals does this user have? */ #ifdef CONFIG_INOTIFY_USER atomic_t inotify_watches; /* How many inotify watches does this user have? 
*/ diff --git a/kernel/user.c b/kernel/user.c index 294fc6a..4efa393 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -87,7 +87,6 @@ static DEFINE_SPINLOCK(uidhash_lock); struct user_struct root_user = { .__count = ATOMIC_INIT(1), .processes = ATOMIC_INIT(1), - .files = ATOMIC_INIT(0), .sigpending = ATOMIC_INIT(0), .locked_shm = 0, .uid = GLOBAL_ROOT_UID, -- cgit v0.10.2 From 0a581694ab7a5bc083d710df8a552a6a055b005f Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:28 -0700 Subject: printk: split code for making free space in the log buffer The check for free space in the log buffer always passes when "first_seq" and "next_seq" are equal. In theory, it might cause writing outside of the log buffer. Fortunately, the current usage looks safe because the used "text" and "dict" buffers are quite limited. See the second patch for more details. Anyway, it is better to be on the safe side and add a check. An easy solution is done in the 2nd patch and it is improved in the 4th patch. 5th patch fixes the computation of the printed message length. 1st and 3rd patches just do some code refactoring to make the other patches easier. This patch (of 5): There will be needed some fixes in the check for free space. They will be easier if the code is moved outside of the quite long log_store() function. This patch does not change the existing behavior. 
Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 221229c..99b7a2d 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -297,6 +297,34 @@ static u32 log_next(u32 idx) return idx + msg->len; } +/* check whether there is enough free space for the given message */ +static int logbuf_has_space(u32 msg_size) +{ + u32 free; + + if (log_next_idx > log_first_idx) + free = max(log_buf_len - log_next_idx, log_first_idx); + else + free = log_first_idx - log_next_idx; + + /* + * We need space also for an empty header that signalizes wrapping + * of the buffer. + */ + return free >= msg_size + sizeof(struct printk_log); +} + +static void log_make_free_space(u32 msg_size) +{ + while (log_first_seq < log_next_seq) { + if (logbuf_has_space(msg_size)) + return; + /* drop old messages until we have enough continuous space */ + log_first_idx = log_next(log_first_idx); + log_first_seq++; + } +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -311,21 +339,7 @@ static void log_store(int facility, int level, pad_len = (-size) & (LOG_ALIGN - 1); size += pad_len; - while (log_first_seq < log_next_seq) { - u32 free; - - if (log_next_idx > log_first_idx) - free = max(log_buf_len - log_next_idx, log_first_idx); - else - free = log_first_idx - log_next_idx; - - if (free >= size + sizeof(struct printk_log)) - break; - - /* drop old messages until we have enough contiuous space */ - log_first_idx = log_next(log_first_idx); - log_first_seq++; - } + log_make_free_space(size); if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* -- cgit v0.10.2 From f40e4b9f70d48eb08f443642283fdd9d05b27c6d Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:30 -0700 Subject: printk: ignore too long messages There 
was no check for too long messages. The check for free space always passed when first_seq and next_seq were equal. Enough free space was not guaranteed, though. log_store() might be called to store messages up to 64kB + 64kB + 16B. This is the sum of the maximal text_len and dict_len values, and the size of the structure printk_log. On the other hand, the minimal size for the main log buffer currently is 4kB and it is enforced only by Kconfig. The good news is that the usage looks safe right now. log_store() is called only from vprintk_emit() and cont_flush(). Here the "text" part is always passed via a static buffer and the length is limited to LOG_LINE_MAX which is 1024. The "dict" part is NULL in most cases. The only exceptions are when vprintk_emit() is called from printk_emit() and dev_vprintk_emit(). But printk_emit() is currently used only in devkmsg_writev() and here "dict" is NULL as well. In dev_vprintk_emit(), "dict" is limited by the static buffer "hdr" of the size 128 bytes. It means that the current maximal printed text is 1024B + 128B + 16B and it always fits the log buffer. But it is only a matter of time before someone calls printk_emit() with unsafe parameters, especially the "dict" one. This patch adds a check for the free space when the buffer is empty. It reuses the already existing logbuf_has_space() function but it has to add an extra parameter. It defines whether the buffer is empty. Note that the same values of "first_idx" and "next_idx" might also mean that the buffer is full. If the buffer is empty, we must respect the current position of the indexes. We cannot reset them to the beginning of the buffer. Otherwise, the functions reading the buffer would get crazy. The question is what to do when the message is too long. This patch uses the easiest solution and just ignores the problematic message. Let's do something better in a followup patch.
Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 99b7a2d..8fbbab1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -297,12 +297,20 @@ static u32 log_next(u32 idx) return idx + msg->len; } -/* check whether there is enough free space for the given message */ -static int logbuf_has_space(u32 msg_size) +/* + * Check whether there is enough free space for the given message. + * + * The same values of first_idx and next_idx mean that the buffer + * is either empty or full. + * + * If the buffer is empty, we must respect the position of the indexes. + * They cannot be reset to the beginning of the buffer. + */ +static int logbuf_has_space(u32 msg_size, bool empty) { u32 free; - if (log_next_idx > log_first_idx) + if (log_next_idx > log_first_idx || empty) free = max(log_buf_len - log_next_idx, log_first_idx); else free = log_first_idx - log_next_idx; @@ -314,15 +322,21 @@ static int logbuf_has_space(u32 msg_size) return free >= msg_size + sizeof(struct printk_log); } -static void log_make_free_space(u32 msg_size) +static int log_make_free_space(u32 msg_size) { while (log_first_seq < log_next_seq) { - if (logbuf_has_space(msg_size)) - return; + if (logbuf_has_space(msg_size, false)) + return 0; /* drop old messages until we have enough continuous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } + + /* sequence numbers are equal, so the log buffer is empty */ + if (logbuf_has_space(msg_size, true)) + return 0; + + return -ENOMEM; } /* insert record into the buffer, discard old ones, update heads */ @@ -339,7 +353,9 @@ static void log_store(int facility, int level, pad_len = (-size) & (LOG_ALIGN - 1); size += pad_len; - log_make_free_space(size); + /* if message does not fit empty log buffer, ignore it */ + if (log_make_free_space(size)) + return; if (log_next_idx + size + sizeof(struct 
printk_log) > log_buf_len) { /* -- cgit v0.10.2 From 85c87043023b7e5535f975bbee12a4f5399df520 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:31 -0700 Subject: printk: split message size computation We will want to recompute the message size when shrinking too long messages. Let's put the code into separate function. The side effect of setting "pad_len" is not nice but it is worth removing the code duplication. Note that I will probably have one more usage for this function when handling messages safe way in NMI context. This patch does not change the existing behavior. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 8fbbab1..9f088ed 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -339,6 +339,18 @@ static int log_make_free_space(u32 msg_size) return -ENOMEM; } +/* compute the message size including the padding bytes */ +static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) +{ + u32 size; + + size = sizeof(struct printk_log) + text_len + dict_len; + *pad_len = (-size) & (LOG_ALIGN - 1); + size += *pad_len; + + return size; +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -349,9 +361,7 @@ static void log_store(int facility, int level, u32 size, pad_len; /* number of '\0' padding bytes to next message */ - size = sizeof(struct printk_log) + text_len + dict_len; - pad_len = (-size) & (LOG_ALIGN - 1); - size += pad_len; + size = msg_used_size(text_len, dict_len, &pad_len); /* if message does not fit empty log buffer, ignore it */ if (log_make_free_space(size)) -- cgit v0.10.2 From 55bd53a4eb3dd18be8744f8b4d026068fc801a62 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:32 -0700 Subject: printk: shrink too long messages We might want to 
print at least part of too long messages and add some warning for debugging purpose. The question is how long the shrunken message should be. If we use the whole buffer, it might get rotated too soon. Let's try to use only 1/4 of the buffer for now. Also shrink the whole dictionary. We do not want to parse it or break it in the middle of some pair of values. It would not cause any real harm but still. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 9f088ed..7131dd4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -351,6 +351,32 @@ static u32 msg_used_size(u16 text_len, u16 dict_len, u32 *pad_len) return size; } +/* + * Define how much of the log buffer we could take at maximum. The value + * must be greater than two. Note that only half of the buffer is available + * when the index points to the middle. + */ +#define MAX_LOG_TAKE_PART 4 +static const char trunc_msg[] = ""; + +static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, + u16 *dict_len, u32 *pad_len) +{ + /* + * The message should not take the whole buffer. Otherwise, it might + * get removed too soon. 
+ */ + u32 max_text_len = log_buf_len / MAX_LOG_TAKE_PART; + if (*text_len > max_text_len) + *text_len = max_text_len; + /* enable the warning message */ + *trunc_msg_len = strlen(trunc_msg); + /* disable the "dict" completely */ + *dict_len = 0; + /* compute the size again, count also the warning message */ + return msg_used_size(*text_len + *trunc_msg_len, 0, pad_len); +} + /* insert record into the buffer, discard old ones, update heads */ static void log_store(int facility, int level, enum log_flags flags, u64 ts_nsec, @@ -359,13 +385,19 @@ static void log_store(int facility, int level, { struct printk_log *msg; u32 size, pad_len; + u16 trunc_msg_len = 0; /* number of '\0' padding bytes to next message */ size = msg_used_size(text_len, dict_len, &pad_len); - /* if message does not fit empty log buffer, ignore it */ - if (log_make_free_space(size)) - return; + if (log_make_free_space(size)) { + /* truncate the message if it is too long for empty buffer */ + size = truncate_msg(&text_len, &trunc_msg_len, + &dict_len, &pad_len); + /* survive when the log buffer is too small for trunc_msg */ + if (log_make_free_space(size)) + return; + } if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { /* @@ -381,6 +413,10 @@ static void log_store(int facility, int level, msg = (struct printk_log *)(log_buf + log_next_idx); memcpy(log_text(msg), text, text_len); msg->text_len = text_len; + if (trunc_msg_len) { + memcpy(log_text(msg) + text_len, trunc_msg, trunc_msg_len); + msg->text_len += trunc_msg_len; + } memcpy(log_dict(msg), dict, dict_len); msg->dict_len = dict_len; msg->facility = facility; -- cgit v0.10.2 From 034633ccb24d675850f99bf85c1c5880c831e4b6 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 4 Jun 2014 16:11:33 -0700 Subject: printk: return really stored message length I wonder if anyone uses printk return value but it is there and should be counted correctly. 
This patch modifies log_store() to return the number of really stored bytes from the 'text' part. Also it handles the return value in vprintk_emit(). Note that log_store() is used also in cont_flush() but we could ignore the return value there. The function works with characters that were already counted earlier. In addition, the store could newer fail here because the length of the printed text is limited by the "cont" buffer and "dict" is NULL. Signed-off-by: Petr Mladek Cc: Jan Kara Cc: Jiri Kosina Cc: Kay Sievers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7131dd4..7476a53 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -378,10 +378,10 @@ static u32 truncate_msg(u16 *text_len, u16 *trunc_msg_len, } /* insert record into the buffer, discard old ones, update heads */ -static void log_store(int facility, int level, - enum log_flags flags, u64 ts_nsec, - const char *dict, u16 dict_len, - const char *text, u16 text_len) +static int log_store(int facility, int level, + enum log_flags flags, u64 ts_nsec, + const char *dict, u16 dict_len, + const char *text, u16 text_len) { struct printk_log *msg; u32 size, pad_len; @@ -396,7 +396,7 @@ static void log_store(int facility, int level, &dict_len, &pad_len); /* survive when the log buffer is too small for trunc_msg */ if (log_make_free_space(size)) - return; + return 0; } if (log_next_idx + size + sizeof(struct printk_log) > log_buf_len) { @@ -432,6 +432,8 @@ static void log_store(int facility, int level, /* insert message */ log_next_idx += msg->len; log_next_seq++; + + return msg->text_len; } #ifdef CONFIG_SECURITY_DMESG_RESTRICT @@ -1606,10 +1608,10 @@ asmlinkage int vprintk_emit(int facility, int level, "BUG: recent printk recursion!"; recursion_bug = 0; - printed_len += strlen(recursion_msg); + text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ - log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, 
recursion_msg, printed_len); + printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, + NULL, 0, recursion_msg, text_len); } /* @@ -1662,9 +1664,12 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); /* buffer line if possible, otherwise store it right away */ - if (!cont_add(facility, level, text, text_len)) - log_store(facility, level, lflags | LOG_CONT, 0, - dict, dictlen, text, text_len); + if (cont_add(facility, level, text, text_len)) + printed_len += text_len; + else + printed_len += log_store(facility, level, + lflags | LOG_CONT, 0, + dict, dictlen, text, text_len); } else { bool stored = false; @@ -1683,11 +1688,12 @@ asmlinkage int vprintk_emit(int facility, int level, cont_flush(LOG_NEWLINE); } - if (!stored) - log_store(facility, level, lflags, 0, - dict, dictlen, text, text_len); + if (stored) + printed_len += text_len; + else + printed_len += log_store(facility, level, lflags, 0, + dict, dictlen, text, text_len); } - printed_len += text_len; /* * Try to acquire and then immediately release the console semaphore. -- cgit v0.10.2 From ca1d432ad8a527fabc5c7ceed8526e3a28de121c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:34 -0700 Subject: printk: remove outdated comment Comment about interesting interlocking between lockbuf_lock and console_sem is outdated. It was added in 2002 by commit a880f45a48be during conversion of console_lock to console_sem + lockbuf_lock. At that time release_console_sem() (today's equivalent is console_unlock()) was indeed using lockbuf_lock to avoid races between trylock on console_sem in printk() and unlock of console_sem. However these days the interlocking is gone and the races are avoided by rechecking logbuf state after releasing console_sem. 
Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 7476a53..5bc5447 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -206,8 +206,7 @@ struct printk_log { }; /* - * The logbuf_lock protects kmsg buffer, indices, counters. It is also - * used in interesting ways to provide interlocking in console_unlock(); + * The logbuf_lock protects kmsg buffer, indices, counters. */ static DEFINE_RAW_SPINLOCK(logbuf_lock); -- cgit v0.10.2 From 608873cacb9d0d2811586fcc79a38b64eabd6d32 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:35 -0700 Subject: printk: release lockbuf_lock before calling console_trylock_for_printk() There's no reason to hold lockbuf_lock when entering console_trylock_for_printk(). The first thing this function does is to call down_trylock(console_sem) and if that fails it immediately unlocks lockbuf_lock. So lockbuf_lock isn't needed for that branch. When down_trylock() succeeds, the rest of console_trylock() is OK without lockbuf_lock (it is called without it from other places), and the only remaining thing in console_trylock_for_printk() is can_use_console() call. For that call console_sem is enough (it iterates all consoles and checks CON_ANYTIME flag). So we drop logbuf_lock before entering console_trylock_for_printk() which simplifies the code. 
[akpm@linux-foundation.org: fix have_callable_console() comment] Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5bc5447..6e1b21a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -249,9 +249,6 @@ static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; static u32 log_buf_len = __LOG_BUF_LEN; -/* cpu currently holding logbuf_lock */ -static volatile unsigned int logbuf_cpu = UINT_MAX; - /* human readable text of the record */ static char *log_text(const struct printk_log *msg) { @@ -1380,7 +1377,10 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* Check if we have any console registered that can be called early in boot. */ +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */ static int have_callable_console(void) { struct console *con; @@ -1410,36 +1410,22 @@ static inline int can_use_console(unsigned int cpu) * messages from a 'printk'. Return true (and with the * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. - * - * This gets called with the 'logbuf_lock' spinlock held and - * interrupts disabled. It should return with 'lockbuf_lock' - * released but interrupts still disabled. */ static int console_trylock_for_printk(unsigned int cpu) - __releases(&logbuf_lock) { - int retval = 0, wake = 0; - - if (console_trylock()) { - retval = 1; - - /* - * If we can't use the console, we need to release - * the console semaphore by hand to avoid flushing - * the buffer. We need to hold the console semaphore - * in order to do this test safely. 
- */ - if (!can_use_console(cpu)) { - console_locked = 0; - wake = 1; - retval = 0; - } - } - logbuf_cpu = UINT_MAX; - raw_spin_unlock(&logbuf_lock); - if (wake) + if (!console_trylock()) + return 0; + /* + * If we can't use the console, we need to release the console + * semaphore by hand to avoid flushing the buffer. We need to hold the + * console semaphore in order to do this test safely. + */ + if (!can_use_console(cpu)) { + console_locked = 0; up(&console_sem); - return retval; + return 0; + } + return 1; } int printk_delay_msec __read_mostly; @@ -1572,6 +1558,9 @@ asmlinkage int vprintk_emit(int facility, int level, unsigned long flags; int this_cpu; int printed_len = 0; + /* cpu currently holding logbuf_lock in this function */ + static volatile unsigned int logbuf_cpu = UINT_MAX; + boot_delay_msec(level); printk_delay(); @@ -1694,13 +1683,12 @@ asmlinkage int vprintk_emit(int facility, int level, dict, dictlen, text, text_len); } + logbuf_cpu = UINT_MAX; + raw_spin_unlock(&logbuf_lock); /* * Try to acquire and then immediately release the console semaphore. * The release will print out buffers and wake up /dev/kmsg and syslog() * users. - * - * The console_trylock_for_printk() function will release 'logbuf_lock' - * regardless of whether it actually gets the console semaphore or not. */ if (console_trylock_for_printk(this_cpu)) console_unlock(); -- cgit v0.10.2 From bd8d7cf5b8410fe98eba06a9aaa90efe88815d8a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:36 -0700 Subject: printk: fix lockdep instrumentation of console_sem Printk calls mutex_acquire() / mutex_release() by hand to instrument lockdep about console_sem. However in some corner cases the instrumentation is missing. Fix the problem by creating helper functions for locking / unlocking console_sem which take care of lockdep instrumentation as well. 
Signed-off-by: Jan Kara Reported-by: Fabio Estevam Reported-by: Andy Shevchenko Tested-by: Fabio Estevam Tested-By: Valdis Kletnieks Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6e1b21a..5ba37f8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -91,6 +91,29 @@ static struct lockdep_map console_lock_dep_map = { #endif /* + * Helper macros to handle lockdep when locking/unlocking console_sem. We use + * macros instead of functions so that _RET_IP_ contains useful information. + */ +#define down_console_sem() do { \ + down(&console_sem);\ + mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_);\ +} while (0) + +static int __down_trylock_console_sem(unsigned long ip) +{ + if (down_trylock(&console_sem)) + return 1; + mutex_acquire(&console_lock_dep_map, 0, 1, ip); + return 0; +} +#define down_trylock_console_sem() __down_trylock_console_sem(_RET_IP_) + +#define up_console_sem() do { \ + mutex_release(&console_lock_dep_map, 1, _RET_IP_);\ + up(&console_sem);\ +} while (0) + +/* * This is used for debugging the mess that is the VT code by * keeping track if we have the console semaphore held. 
It's * definitely not the perfect debug tool (we don't know if _WE_ @@ -1422,7 +1445,7 @@ static int console_trylock_for_printk(unsigned int cpu) */ if (!can_use_console(cpu)) { console_locked = 0; - up(&console_sem); + up_console_sem(); return 0; } return 1; @@ -1951,16 +1974,14 @@ void suspend_console(void) printk("Suspending console(s) (use no_console_suspend to debug)\n"); console_lock(); console_suspended = 1; - up(&console_sem); - mutex_release(&console_lock_dep_map, 1, _RET_IP_); + up_console_sem(); } void resume_console(void) { if (!console_suspend_enabled) return; - down(&console_sem); - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); + down_console_sem(); console_suspended = 0; console_unlock(); } @@ -2002,12 +2023,11 @@ void console_lock(void) { might_sleep(); - down(&console_sem); + down_console_sem(); if (console_suspended) return; console_locked = 1; console_may_schedule = 1; - mutex_acquire(&console_lock_dep_map, 0, 0, _RET_IP_); } EXPORT_SYMBOL(console_lock); @@ -2021,15 +2041,14 @@ EXPORT_SYMBOL(console_lock); */ int console_trylock(void) { - if (down_trylock(&console_sem)) + if (down_trylock_console_sem()) return 0; if (console_suspended) { - up(&console_sem); + up_console_sem(); return 0; } console_locked = 1; console_may_schedule = 0; - mutex_acquire(&console_lock_dep_map, 0, 1, _RET_IP_); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2091,7 +2110,7 @@ void console_unlock(void) bool retry; if (console_suspended) { - up(&console_sem); + up_console_sem(); return; } @@ -2153,7 +2172,6 @@ skip: local_irq_restore(flags); } console_locked = 0; - mutex_release(&console_lock_dep_map, 1, _RET_IP_); /* Release the exclusive_console once it is used */ if (unlikely(exclusive_console)) @@ -2161,7 +2179,7 @@ skip: raw_spin_unlock(&logbuf_lock); - up(&console_sem); + up_console_sem(); /* * Someone could have filled up the buffer again, so re-check if there's @@ -2206,7 +2224,7 @@ void console_unblank(void) * oops_in_progress is set to 1.. 
*/ if (oops_in_progress) { - if (down_trylock(&console_sem) != 0) + if (down_trylock_console_sem() != 0) return; } else console_lock(); -- cgit v0.10.2 From 939f04bec1a4ef6ba4370b0f34b01decc844b1b1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 4 Jun 2014 16:11:37 -0700 Subject: printk: enable interrupts before calling console_trylock_for_printk() We need interrupts disabled when calling console_trylock_for_printk() only so that cpu id we pass to can_use_console() remains valid (for other things console_sem provides all the exclusion we need and deadlocks on console_sem due to interrupts are impossible because we use down_trylock()). However if we are rescheduled, we are guaranteed to run on an online cpu so we can easily just get the cpu id in can_use_console(). We can lose a bit of performance when we enable interrupts in vprintk_emit() and then disable them again in console_unlock() but OTOH it can somewhat reduce interrupt latency caused by console_unlock() especially since later in the patch series we will want to spin on console_sem in console_trylock_for_printk(). Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 5ba37f8..4e22230 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1418,10 +1418,9 @@ static int have_callable_console(void) /* * Can we actually use the console at this time on this cpu? * - * Console drivers may assume that per-cpu resources have - * been allocated. So unless they're explicitly marked as - * being able to cope (CON_ANYTIME) don't call them until - * this CPU is officially up. + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. 
*/ static inline int can_use_console(unsigned int cpu) { @@ -1434,8 +1433,10 @@ static inline int can_use_console(unsigned int cpu) * console_lock held, and 'console_locked' set) if it * is successful, false otherwise. */ -static int console_trylock_for_printk(unsigned int cpu) +static int console_trylock_for_printk(void) { + unsigned int cpu = smp_processor_id(); + if (!console_trylock()) return 0; /* @@ -1605,7 +1606,8 @@ asmlinkage int vprintk_emit(int facility, int level, */ if (!oops_in_progress && !lockdep_recursing(current)) { recursion_bug = 1; - goto out_restore_irqs; + local_irq_restore(flags); + return 0; } zap_locks(); } @@ -1708,17 +1710,22 @@ asmlinkage int vprintk_emit(int facility, int level, logbuf_cpu = UINT_MAX; raw_spin_unlock(&logbuf_lock); + lockdep_on(); + local_irq_restore(flags); + + /* + * Disable preemption to avoid being preempted while holding + * console_sem which would prevent anyone from printing to console + */ + preempt_disable(); /* * Try to acquire and then immediately release the console semaphore. * The release will print out buffers and wake up /dev/kmsg and syslog() * users. */ - if (console_trylock_for_printk(this_cpu)) + if (console_trylock_for_printk()) console_unlock(); - - lockdep_on(); -out_restore_irqs: - local_irq_restore(flags); + preempt_enable(); return printed_len; } -- cgit v0.10.2 From 458df9fd4815b47809875d57f42e16401674b621 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 4 Jun 2014 16:11:38 -0700 Subject: printk: remove separate printk_sched buffers and use printk buf instead To prevent deadlocks with doing a printk inside the scheduler, printk_sched() was created. The issue is that printk has a console_sem that it can grab and release. The release does a wake up if there's a task pending on the sem, and this wake up grabs the rq locks that is held in the scheduler. This leads to a possible deadlock if the wake up uses the same rq as the one with the rq lock held already. 
What printk_sched() does is to save the printk write in a per cpu buffer and sets the PRINTK_PENDING_SCHED flag. On a timer tick, if this flag is set, the printk() is done against the buffer. There's a couple of issues with this approach. 1) If two printk_sched()s are called before the tick, the second one will overwrite the first one. 2) The temporary buffer is 512 bytes and is per cpu. This is a quite a bit of space wasted for something that is seldom used. In order to remove this, the printk_sched() can use the printk buffer instead, and delay the console_trylock()/console_unlock() to the queued work. Because printk_sched() would then be taking the logbuf_lock, the logbuf_lock must not be held while doing anything that may call into the scheduler functions, which includes wake ups. Unfortunately, printk() also has a console_sem that it uses, and on release, the up(&console_sem) may do a wake up of any pending waiters. This must be avoided while holding the logbuf_lock. Signed-off-by: Steven Rostedt Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4e22230..247b0c1 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -68,6 +68,9 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +/* Deferred messaged from sched code are marked by this special level */ +#define SCHED_MESSAGE_LOGLEVEL -2 + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -229,7 +232,9 @@ struct printk_log { }; /* - * The logbuf_lock protects kmsg buffer, indices, counters. + * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken + * within the scheduler's rq lock. It must be released before calling + * console_unlock() or anything else that might wake up a process. 
*/ static DEFINE_RAW_SPINLOCK(logbuf_lock); @@ -1577,14 +1582,19 @@ asmlinkage int vprintk_emit(int facility, int level, static int recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; - size_t text_len; + size_t text_len = 0; enum log_flags lflags = 0; unsigned long flags; int this_cpu; int printed_len = 0; + bool in_sched = false; /* cpu currently holding logbuf_lock in this function */ static volatile unsigned int logbuf_cpu = UINT_MAX; + if (level == SCHED_MESSAGE_LOGLEVEL) { + level = -1; + in_sched = true; + } boot_delay_msec(level); printk_delay(); @@ -1631,7 +1641,12 @@ asmlinkage int vprintk_emit(int facility, int level, * The printf needs to come first; we need the syslog * prefix which might be passed-in as a parameter. */ - text_len = vscnprintf(text, sizeof(textbuf), fmt, args); + if (in_sched) + text_len = scnprintf(text, sizeof(textbuf), + KERN_WARNING "[sched_delayed] "); + + text_len += vscnprintf(text + text_len, + sizeof(textbuf) - text_len, fmt, args); /* mark and strip a trailing newline */ if (text_len && text[text_len-1] == '\n') { @@ -1713,6 +1728,10 @@ asmlinkage int vprintk_emit(int facility, int level, lockdep_on(); local_irq_restore(flags); + /* If called from the scheduler, we can not call up(). 
*/ + if (in_sched) + return printed_len; + /* * Disable preemption to avoid being preempted while holding * console_sem which would prevent anyone from printing to console @@ -2532,21 +2551,19 @@ late_initcall(printk_late_init); /* * Delayed printk version, for scheduler-internal messages: */ -#define PRINTK_BUF_SIZE 512 - #define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 +#define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - pr_warn("[sched_delayed] %s", buf); + if (pending & PRINTK_PENDING_OUTPUT) { + /* If trylock fails, someone else is doing the printing */ + if (console_trylock()) + console_unlock(); } if (pending & PRINTK_PENDING_WAKEUP) @@ -2570,21 +2587,15 @@ void wake_up_klogd(void) int printk_sched(const char *fmt, ...) { - unsigned long flags; va_list args; - char *buf; int r; - local_irq_save(flags); - buf = __get_cpu_var(printk_sched_buf); - va_start(args, fmt); - r = vsnprintf(buf, PRINTK_BUF_SIZE, fmt, args); + r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); va_end(args); - __this_cpu_or(printk_pending, PRINTK_PENDING_SCHED); + __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - local_irq_restore(flags); return r; } -- cgit v0.10.2 From 81954606265ab8f04b41154bd00576013affcf5b Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:39 -0700 Subject: printk: disable preemption for printk_sched An earlier change in -mm (printk: remove separate printk_sched buffers...), removed the printk_sched irqsave/restore lines since it was safe for current users. 
Since we may be expanding usage of printk_sched(), disable preepmtion for this function to make it more generally safe to call. Signed-off-by: John Stultz Reviewed-by: Jan Kara Cc: Peter Zijlstra Cc: Jiri Bohac Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 247b0c1..dc2b8bd 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2590,12 +2590,14 @@ int printk_sched(const char *fmt, ...) va_list args; int r; + preempt_disable(); va_start(args, fmt); r = vprintk_emit(0, SCHED_MESSAGE_LOGLEVEL, NULL, 0, fmt, args); va_end(args); __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + preempt_enable(); return r; } -- cgit v0.10.2 From aac74dc495456412c4130a1167ce4beb6c1f0b38 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:40 -0700 Subject: printk: rename printk_sched to printk_deferred After learning we'll need some sort of deferred printk functionality in the timekeeping core, Peter suggested we rename the printk_sched function so it can be reused by needed subsystems. This only changes the function name. No logic changes. Signed-off-by: John Stultz Reviewed-by: Steven Rostedt Cc: Jan Kara Cc: Peter Zijlstra Cc: Jiri Bohac Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/printk.h b/include/linux/printk.h index 8752f75..7847301 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -128,9 +128,9 @@ asmlinkage __printf(1, 2) __cold int printk(const char *fmt, ...); /* - * Special printk facility for scheduler use only, _DO_NOT_USE_ ! + * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ ! 
*/ -__printf(1, 2) __cold int printk_sched(const char *fmt, ...); +__printf(1, 2) __cold int printk_deferred(const char *fmt, ...); /* * Please don't use printk_ratelimit(), because it shares ratelimiting state @@ -165,7 +165,7 @@ int printk(const char *s, ...) return 0; } static inline __printf(1, 2) __cold -int printk_sched(const char *s, ...) +int printk_deferred(const char *s, ...) { return 0; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index dc2b8bd..35d9db2 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2585,7 +2585,7 @@ void wake_up_klogd(void) preempt_enable(); } -int printk_sched(const char *fmt, ...) +int printk_deferred(const char *fmt, ...) { va_list args; int r; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 913c6d6..caf03e8 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1367,7 +1367,7 @@ out: * leave kernel. */ if (p->mm && printk_ratelimit()) { - printk_sched("process %d (%s) no longer affine to cpu%d\n", + printk_deferred("process %d (%s) no longer affine to cpu%d\n", task_pid_nr(p), p->comm, cpu); } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index f9ca7d1..d17e1c4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -352,7 +352,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, if (!lag_once) { lag_once = true; - printk_sched("sched: DL replenish lagged to much\n"); + printk_deferred("sched: DL replenish lagged to much\n"); } dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0ebfd7a..5d7667b 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -896,7 +896,7 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) if (!once) { once = true; - printk_sched("sched: RT throttling activated\n"); + printk_deferred("sched: RT throttling activated\n"); } } else { /* -- cgit v0.10.2 From c224815dac9c739b79050d3cc67443ff500bc478 
Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:41 -0700 Subject: printk: Add printk_deferred_once Two of the three prink_deferred uses are really printk_once style uses, so add a printk_deferred_once macro to simplify those call sites. Signed-off-by: John Stultz Reviewed-by: Steven Rostedt Reviewed-by: Jan Kara Cc: Peter Zijlstra Cc: Jiri Bohac Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/printk.h b/include/linux/printk.h index 7847301..f086d6c 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -266,9 +266,20 @@ extern asmlinkage void dump_stack(void) __cold; printk(fmt, ##__VA_ARGS__); \ } \ }) +#define printk_deferred_once(fmt, ...) \ +({ \ + static bool __print_once __read_mostly; \ + \ + if (!__print_once) { \ + __print_once = true; \ + printk_deferred(fmt, ##__VA_ARGS__); \ + } \ +}) #else #define printk_once(fmt, ...) \ no_printk(fmt, ##__VA_ARGS__) +#define printk_deferred_once(fmt, ...) \ + no_printk(fmt, ##__VA_ARGS__) #endif #define pr_emerg_once(fmt, ...) \ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index d17e1c4..e1574fc 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -348,12 +348,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - static bool lag_once = false; - - if (!lag_once) { - lag_once = true; - printk_deferred("sched: DL replenish lagged to much\n"); - } + printk_deferred_once("sched: DL replenish lagged to much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 5d7667b..b3512f1 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -890,14 +890,8 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) * but accrue some time due to boosting. 
*/ if (likely(rt_b->rt_runtime)) { - static bool once = false; - rt_rq->rt_throttled = 1; - - if (!once) { - once = true; - printk_deferred("sched: RT throttling activated\n"); - } + printk_deferred_once("sched: RT throttling activated\n"); } else { /* * In case we did anyway, make it go away, -- cgit v0.10.2 From 6d9bcb621b0b0a20604cbdb298c4487e44dd0da2 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 4 Jun 2014 16:11:43 -0700 Subject: timekeeping: use printk_deferred when holding timekeeping seqlock Jiri Bohac pointed out that there are rare but potential deadlock possibilities when calling printk while holding the timekeeping seqlock. This is due to printk() triggering console sem wakeup, which can cause scheduling code to trigger hrtimers which may try to read the time. Specifically, as Jiri pointed out, that path is: printk vprintk_emit console_unlock up(&console_sem) __up wake_up_process try_to_wake_up ttwu_do_activate ttwu_activate activate_task enqueue_task enqueue_task_fair hrtick_update hrtick_start_fair hrtick_start_fair get_time ktime_get --> endless loop on read_seqcount_retry(&timekeeper_seq, ...) This patch tries to avoid this issue by using printk_deferred (previously named printk_sched) which should defer printing via a irq_work_queue. 
Signed-off-by: John Stultz Reported-by: Jiri Bohac Reviewed-by: Steven Rostedt Cc: Jan Kara Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 419a52c..5b0ac4d 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -786,8 +786,9 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) time_status |= STA_PPSERROR; pps_errcnt++; pps_dec_freq_interval(); - pr_err("hardpps: PPSERROR: interval too long - %ld s\n", - freq_norm.sec); + printk_deferred(KERN_ERR + "hardpps: PPSERROR: interval too long - %ld s\n", + freq_norm.sec); return 0; } @@ -800,7 +801,8 @@ static long hardpps_update_freq(struct pps_normtime freq_norm) delta = shift_right(ftemp - pps_freq, NTP_SCALE_SHIFT); pps_freq = ftemp; if (delta > PPS_MAXWANDER || delta < -PPS_MAXWANDER) { - pr_warning("hardpps: PPSWANDER: change=%ld\n", delta); + printk_deferred(KERN_WARNING + "hardpps: PPSWANDER: change=%ld\n", delta); time_status |= STA_PPSWANDER; pps_stbcnt++; pps_dec_freq_interval(); @@ -844,8 +846,9 @@ static void hardpps_update_phase(long error) * the time offset is updated. 
*/ if (jitter > (pps_jitter << PPS_POPCORN)) { - pr_warning("hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", - jitter, (pps_jitter << PPS_POPCORN)); + printk_deferred(KERN_WARNING + "hardpps: PPSJITTER: jitter=%ld, limit=%ld\n", + jitter, (pps_jitter << PPS_POPCORN)); time_status |= STA_PPSJITTER; pps_jitcnt++; } else if (time_status & STA_PPSTIME) { @@ -902,7 +905,7 @@ void __hardpps(const struct timespec *phase_ts, const struct timespec *raw_ts) time_status |= STA_PPSJITTER; /* restart the frequency calibration interval */ pps_fbase = *raw_ts; - pr_err("hardpps: PPSJITTER: bad pulse\n"); + printk_deferred(KERN_ERR "hardpps: PPSJITTER: bad pulse\n"); return; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f7df8ea..32d8d6a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -852,8 +852,9 @@ static void __timekeeping_inject_sleeptime(struct timekeeper *tk, struct timespec *delta) { if (!timespec_valid_strict(delta)) { - printk(KERN_WARNING "__timekeeping_inject_sleeptime: Invalid " - "sleep delta value!\n"); + printk_deferred(KERN_WARNING + "__timekeeping_inject_sleeptime: Invalid " + "sleep delta value!\n"); return; } tk_xtime_add(tk, delta); @@ -1157,7 +1158,7 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) if (unlikely(tk->clock->maxadj && (tk->mult + adj > tk->clock->mult + tk->clock->maxadj))) { - printk_once(KERN_WARNING + printk_deferred_once(KERN_WARNING "Adjusting %s more than 11%% (%ld vs %ld)\n", tk->clock->name, (long)tk->mult + adj, (long)tk->clock->mult + tk->clock->maxadj); -- cgit v0.10.2 From 6e099f557d9c6797c3ee3ee7b5c8cebe543ec1cc Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:11:44 -0700 Subject: Documentation: expand/clarify debug documentation The pr_debug() and related debug print macros all differ from the normal pr_XXX() macros, in that the normal ones print unconditionally, while the debug macros are compiled out unless DEBUG is defined or 
CONFIG_DYNAMIC_DEBUG is set. This isn't obvious, and the only way to find this out is either to review the actual printk.h code or to read CodingStyle, and the message there doesn't highlight the fact. Change Documentation/CodingStyle to clearly indicate that pr_debug() and related debug printing macros behave differently than all other pr_XXX() macros, and attempt to clarify when and where the different debug printing methods might be used. Add short comment to printk.h above the pr_XXX() macros indicating that while these macros print unconditionally, pr_debug() does not. Signed-off-by: Dan Streetman Cc: Joe Perches Cc: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/CodingStyle b/Documentation/CodingStyle index 7fe0546..6b6bef3 100644 --- a/Documentation/CodingStyle +++ b/Documentation/CodingStyle @@ -660,15 +660,23 @@ There are a number of driver model diagnostic macros in which you should use to make sure messages are matched to the right device and driver, and are tagged with the right level: dev_err(), dev_warn(), dev_info(), and so forth. For messages that aren't associated with a -particular device, defines pr_debug() and pr_info(). +particular device, defines pr_notice(), pr_info(), +pr_warn(), pr_err(), etc. Coming up with good debugging messages can be quite a challenge; and once -you have them, they can be a huge help for remote troubleshooting. Such -messages should be compiled out when the DEBUG symbol is not defined (that -is, by default they are not included). When you use dev_dbg() or pr_debug(), -that's automatic. Many subsystems have Kconfig options to turn on -DDEBUG. -A related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to the -ones already enabled by DEBUG. +you have them, they can be a huge help for remote troubleshooting. However +debug message printing is handled differently than printing other non-debug +messages. 
While the other pr_XXX() functions print unconditionally, +pr_debug() does not; it is compiled out by default, unless either DEBUG is +defined or CONFIG_DYNAMIC_DEBUG is set. That is true for dev_dbg() also, +and a related convention uses VERBOSE_DEBUG to add dev_vdbg() messages to +the ones already enabled by DEBUG. + +Many subsystems have Kconfig debug options to turn on -DDEBUG in the +corresponding Makefile; in other cases specific files #define DEBUG. And +when a debug message should be unconditionally printed, such as if it is +already inside a debug-related #ifdef secton, printk(KERN_DEBUG ...) can be +used. Chapter 14: Allocating memory diff --git a/include/linux/printk.h b/include/linux/printk.h index f086d6c..37f3a65 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -210,6 +210,12 @@ extern asmlinkage void dump_stack(void) __cold; #define pr_fmt(fmt) fmt #endif +/* + * These can be used to print at the various log levels. + * All of these will print unconditionally, although note that pr_debug() + * and other debug macros are compiled out unless either DEBUG is defined + * or CONFIG_DYNAMIC_DEBUG is set. + */ #define pr_emerg(fmt, ...) \ printk(KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__) #define pr_alert(fmt, ...) \ -- cgit v0.10.2 From 84b5ec8a9df86f3dcaaaf912715db35e4852d1da Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Wed, 4 Jun 2014 16:11:45 -0700 Subject: printk: report dropping of messages from logbuf If the log ring buffer becomes full, we silently overwrite old messages with new data. console_unlock will detect this case and fast-forward the console_* pointers to skip over the corrupted data, but nothing will be reported to the user. This patch hijacks the first valid log message after detecting that we dropped messages and prefixes it with a note detailing how many messages were dropped. 
For long (~1000 char) messages, this will result in some truncation of the real message, but given that we're dropping things anyway, that doesn't seem to be the end of the world. Signed-off-by: Will Deacon Acked-by: Peter Zijlstra Cc: Kay Sievers Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 35d9db2..923c5d4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2157,10 +2157,15 @@ again: } if (console_seq < log_first_seq) { + len = sprintf(text, "** %u printk messages dropped ** ", + (unsigned)(log_first_seq - console_seq)); + /* messages are gone, move to first one */ console_seq = log_first_seq; console_idx = log_first_idx; console_prev = 0; + } else { + len = 0; } skip: if (console_seq == log_next_seq) @@ -2185,8 +2190,8 @@ skip: } level = msg->level; - len = msg_print_text(msg, console_prev, false, - text, sizeof(text)); + len += msg_print_text(msg, console_prev, false, + text + len, sizeof(text) - len); console_idx = log_next(console_idx); console_seq++; console_prev = msg->flags; -- cgit v0.10.2 From a8fe19ebfbfd90ec17c02284717238b02efb9580 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Jun 2014 16:11:46 -0700 Subject: kernel/printk: use symbolic defines for console loglevels ... instead of naked numbers. Stuff in sysrq.c used to set it to 8 which is supposed to mean above default level so set it to DEBUG instead as we're terminating/killing all tasks and we want to be verbose there. Also, correct the check in x86_64_start_kernel which should be >= as we're clearly issuing the string there for all debug levels, not only the magical 10. 
Signed-off-by: Borislav Petkov Acked-by: Kees Cook Acked-by: Randy Dunlap Cc: Joe Perches Cc: Valdis Kletnieks Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 068054f..eda1a86 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -172,7 +172,7 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - if (console_loglevel == 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) early_printk("Kernel alive\n"); clear_page(init_level4_pgt); diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index be27da6..c89c933 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -85,7 +85,7 @@ static cpumask_var_t uv_nmi_cpu_mask; * Default is all stack dumps go to the console and buffer. * Lower level to send to log buffer only. */ -static int uv_nmi_loglevel = 7; +static int uv_nmi_loglevel = CONSOLE_LOGLEVEL_DEFAULT; module_param_named(dump_loglevel, uv_nmi_loglevel, int, 0644); /* diff --git a/drivers/nubus/nubus.c b/drivers/nubus/nubus.c index 43926cd..5066a7e 100644 --- a/drivers/nubus/nubus.c +++ b/drivers/nubus/nubus.c @@ -473,7 +473,7 @@ static struct nubus_dev* __init if (slot == 0 && (unsigned long)dir.base % 2) dir.base += 1; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_functional_resource: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -568,7 +568,7 @@ static int __init nubus_get_vidnames(struct nubus_board* board, printk(KERN_INFO " video modes supported:\n"); nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_vidnames: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -629,7 +629,7 @@ static int __init nubus_get_vendorinfo(struct nubus_board* board, printk(KERN_INFO " vendor info:\n"); 
nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_vendorinfo: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -654,7 +654,7 @@ static int __init nubus_get_board_resource(struct nubus_board* board, int slot, struct nubus_dirent ent; nubus_get_subdir(parent, &dir); - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_board_resource: parent is 0x%p, dir is 0x%p\n", parent->base, dir.base); @@ -753,19 +753,19 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_INFO "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* This one takes us to where we want to go. */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); nubus_get_subdir(&ent, &dir); /* Resource ID 01, also an "Unknown Macintosh" */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* FIXME: the first one is *not* always the right one. We @@ -780,7 +780,7 @@ static void __init nubus_find_rom_dir(struct nubus_board* board) path to that address... */ if (nubus_readdir(&dir, &ent) == -1) goto badrom; - if (console_loglevel >= 10) + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) printk(KERN_DEBUG "nubus_get_rom_dir: entry %02x %06x\n", ent.type, ent.data); /* Bwahahahaha... 
*/ @@ -816,7 +816,7 @@ static struct nubus_board* __init nubus_add_board(int slot, int bytelanes) board->fblock = rp; /* Dump the format block for debugging purposes */ - if (console_loglevel >= 10) { + if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) { int i; printk(KERN_DEBUG "Slot %X, format block at 0x%p\n", slot, rp); diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c index ce396ec..b767a64 100644 --- a/drivers/tty/sysrq.c +++ b/drivers/tty/sysrq.c @@ -88,7 +88,7 @@ static void sysrq_handle_loglevel(int key) int i; i = key - '0'; - console_loglevel = 7; + console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; printk("Loglevel set to %d\n", i); console_loglevel = i; } @@ -343,7 +343,7 @@ static void send_sig_all(int sig) static void sysrq_handle_term(int key) { send_sig_all(SIGTERM); - console_loglevel = 8; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } static struct sysrq_key_op sysrq_term_op = { .handler = sysrq_handle_term, @@ -387,7 +387,7 @@ static struct sysrq_key_op sysrq_thaw_op = { static void sysrq_handle_kill(int key) { send_sig_all(SIGKILL); - console_loglevel = 8; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; } static struct sysrq_key_op sysrq_kill_op = { .handler = sysrq_handle_kill, @@ -520,7 +520,7 @@ void __handle_sysrq(int key, bool check_mask) * routing in the consumers of /proc/kmsg. */ orig_log_level = console_loglevel; - console_loglevel = 7; + console_loglevel = CONSOLE_LOGLEVEL_DEFAULT; printk(KERN_INFO "SysRq : "); op_p = __sysrq_get_key_op(key); diff --git a/include/linux/printk.h b/include/linux/printk.h index 37f3a65..319ff7e 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -30,6 +30,17 @@ static inline const char *printk_skip_level(const char *buffer) return buffer; } +/* printk's without a loglevel use this.. */ +#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL + +/* We show everything that is MORE important than this.. 
*/ +#define CONSOLE_LOGLEVEL_SILENT 0 /* Mum's the word */ +#define CONSOLE_LOGLEVEL_MIN 1 /* Minimum loglevel we let people use */ +#define CONSOLE_LOGLEVEL_QUIET 4 /* Shhh ..., when booted with "quiet" */ +#define CONSOLE_LOGLEVEL_DEFAULT 7 /* anything MORE serious than KERN_DEBUG */ +#define CONSOLE_LOGLEVEL_DEBUG 10 /* issue debug messages */ +#define CONSOLE_LOGLEVEL_MOTORMOUTH 15 /* You can't shut this one up */ + extern int console_printk[]; #define console_loglevel (console_printk[0]) @@ -39,13 +50,13 @@ extern int console_printk[]; static inline void console_silent(void) { - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; } static inline void console_verbose(void) { if (console_loglevel) - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; } struct va_format { diff --git a/init/main.c b/init/main.c index e08c0b2..04fab8d 100644 --- a/init/main.c +++ b/init/main.c @@ -203,13 +203,13 @@ EXPORT_SYMBOL(loops_per_jiffy); static int __init debug_kernel(char *str) { - console_loglevel = 10; + console_loglevel = CONSOLE_LOGLEVEL_DEBUG; return 0; } static int __init quiet_kernel(char *str) { - console_loglevel = 4; + console_loglevel = CONSOLE_LOGLEVEL_QUIET; return 0; } diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c index b03e0e8..fe15fff 100644 --- a/kernel/debug/kdb/kdb_bt.c +++ b/kernel/debug/kdb/kdb_bt.c @@ -21,7 +21,7 @@ static void kdb_show_stack(struct task_struct *p, void *addr) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; kdb_set_current_task(p); if (addr) { diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 14ff484..7c70812 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -710,7 +710,7 @@ kdb_printit: } if (logging) { saved_loglevel = console_loglevel; - console_loglevel = 0; + console_loglevel = CONSOLE_LOGLEVEL_SILENT; printk(KERN_INFO "%s", kdb_buffer); } diff 
--git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 0b097c8..2f7c760 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1091,7 +1091,7 @@ static int kdb_reboot(int argc, const char **argv) static void kdb_dumpregs(struct pt_regs *regs) { int old_lvl = console_loglevel; - console_loglevel = 15; + console_loglevel = CONSOLE_LOGLEVEL_MOTORMOUTH; kdb_trap_printk++; show_regs(regs); kdb_trap_printk--; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 923c5d4..ea2d5f6 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -54,18 +54,11 @@ #include "console_cmdline.h" #include "braille.h" -/* printk's without a loglevel use this.. */ -#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL - -/* We show everything that is MORE important than this.. */ -#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ -#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ - int console_printk[4] = { - DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ - MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ - DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ + CONSOLE_LOGLEVEL_MIN, /* minimum_console_loglevel */ + CONSOLE_LOGLEVEL_DEFAULT, /* default_console_loglevel */ }; /* Deferred messaged from sched code are marked by this special level */ -- cgit v0.10.2 From 0046dd9fed0c9313cbb4fb860324476cd298dc9f Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 4 Jun 2014 16:11:47 -0700 Subject: lib/string.c: use the name "C-string" in comments For strncpy() and friends the source string may or may not have an actual NUL character at the end. The documentation is confusing in this because it specifically mentions that you are passing a "NUL-terminated" string. Wikipedia says that "C-string" is an alternative name we can use instead. 
http://en.wikipedia.org/wiki/Null-terminated_string Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/string.c b/lib/string.c index e0c20eb..992bf30 100644 --- a/lib/string.c +++ b/lib/string.c @@ -107,7 +107,7 @@ EXPORT_SYMBOL(strcpy); #ifndef __HAVE_ARCH_STRNCPY /** - * strncpy - Copy a length-limited, %NUL-terminated string + * strncpy - Copy a length-limited, C-string * @dest: Where to copy the string to * @src: Where to copy the string from * @count: The maximum number of bytes to copy @@ -136,7 +136,7 @@ EXPORT_SYMBOL(strncpy); #ifndef __HAVE_ARCH_STRLCPY /** - * strlcpy - Copy a %NUL terminated string into a sized buffer + * strlcpy - Copy a C-string into a sized buffer * @dest: Where to copy the string to * @src: Where to copy the string from * @size: size of destination buffer @@ -182,7 +182,7 @@ EXPORT_SYMBOL(strcat); #ifndef __HAVE_ARCH_STRNCAT /** - * strncat - Append a length-limited, %NUL-terminated string to another + * strncat - Append a length-limited, C-string to another * @dest: The string to be appended to * @src: The string to append to it * @count: The maximum numbers of bytes to copy @@ -211,7 +211,7 @@ EXPORT_SYMBOL(strncat); #ifndef __HAVE_ARCH_STRLCAT /** - * strlcat - Append a length-limited, %NUL-terminated string to another + * strlcat - Append a length-limited, C-string to another * @dest: The string to be appended to * @src: The string to append to it * @count: The size of the destination buffer. 
-- cgit v0.10.2 From 84d517f3e56f7d0d305c14a701cee8f7372ebe1e Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 4 Jun 2014 16:11:48 -0700 Subject: lib/xz: add comments for the intentionally missing break statements Signed-off-by: Lasse Collin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/xz/xz_dec_lzma2.c b/lib/xz/xz_dec_lzma2.c index a6cdc96..08c3c80 100644 --- a/lib/xz/xz_dec_lzma2.c +++ b/lib/xz/xz_dec_lzma2.c @@ -1043,6 +1043,8 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, s->lzma2.sequence = SEQ_LZMA_PREPARE; + /* Fall through */ + case SEQ_LZMA_PREPARE: if (s->lzma2.compressed < RC_INIT_BYTES) return XZ_DATA_ERROR; @@ -1053,6 +1055,8 @@ XZ_EXTERN enum xz_ret xz_dec_lzma2_run(struct xz_dec_lzma2 *s, s->lzma2.compressed -= RC_INIT_BYTES; s->lzma2.sequence = SEQ_LZMA_RUN; + /* Fall through */ + case SEQ_LZMA_RUN: /* * Set dictionary limit to indicate how much we want -- cgit v0.10.2 From 1812062790ab647e85821f21f2263f56eaeffc11 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:11:49 -0700 Subject: lib/plist.c: replace pr_debug with printk in plist_test() Replace pr_debug() in lib/plist.c test function plist_test() with printk(KERN_DEBUG ...). Without DEBUG defined, pr_debug() is compiled out, but the entire plist_test() function is already inside CONFIG_DEBUG_PI_LIST, so printk should just be used directly.
Signed-off-by: Dan Streetman Reviewed-by: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/plist.c b/lib/plist.c index 0f2084d..d408e77 100644 --- a/lib/plist.c +++ b/lib/plist.c @@ -223,7 +223,7 @@ static int __init plist_test(void) int nr_expect = 0, i, loop; unsigned int r = local_clock(); - pr_debug("start plist test\n"); + printk(KERN_DEBUG "start plist test\n"); plist_head_init(&test_head); for (i = 0; i < ARRAY_SIZE(test_node); i++) plist_node_init(test_node + i, 0); @@ -255,7 +255,7 @@ static int __init plist_test(void) plist_test_check(nr_expect); } - pr_debug("end plist test\n"); + printk(KERN_DEBUG "end plist test\n"); return 0; } -- cgit v0.10.2 From bf4d064d89aebe3cc43d875c0803478a6a1dde12 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Wed, 4 Jun 2014 16:11:50 -0700 Subject: lib/xz: enable all filters by default in Kconfig This restores the old behavior that existed before 2013-02-22, when changes were made by 64dbfb444c150 ("decompressors: drop dependency on CONFIG_EXPERT") and 5dc49c75a2 ("decompressors: make the default XZ_DEC_* config match the selected architecture"). Disabling the filters only makes sense on embedded systems. 
Signed-off-by: Lasse Collin Acked-by: Kyle McMartin Cc: Florian Fainelli Cc: Phillip Lougher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/xz/Kconfig b/lib/xz/Kconfig index 08837db..12d2d777 100644 --- a/lib/xz/Kconfig +++ b/lib/xz/Kconfig @@ -9,33 +9,33 @@ config XZ_DEC if XZ_DEC config XZ_DEC_X86 - bool "x86 BCJ filter decoder" - default y if X86 + bool "x86 BCJ filter decoder" if EXPERT + default y select XZ_DEC_BCJ config XZ_DEC_POWERPC - bool "PowerPC BCJ filter decoder" - default y if PPC + bool "PowerPC BCJ filter decoder" if EXPERT + default y select XZ_DEC_BCJ config XZ_DEC_IA64 - bool "IA-64 BCJ filter decoder" - default y if IA64 + bool "IA-64 BCJ filter decoder" if EXPERT + default y select XZ_DEC_BCJ config XZ_DEC_ARM - bool "ARM BCJ filter decoder" - default y if ARM + bool "ARM BCJ filter decoder" if EXPERT + default y select XZ_DEC_BCJ config XZ_DEC_ARMTHUMB - bool "ARM-Thumb BCJ filter decoder" - default y if (ARM && ARM_THUMB) + bool "ARM-Thumb BCJ filter decoder" if EXPERT + default y select XZ_DEC_BCJ config XZ_DEC_SPARC - bool "SPARC BCJ filter decoder" - default y if SPARC + bool "SPARC BCJ filter decoder" if EXPERT + default y select XZ_DEC_BCJ endif -- cgit v0.10.2 From f8eaf298c8dc034e88d772c7d4bef7e5f5a490e2 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:51 -0700 Subject: lib/libcrc32c.c: use PTR_ERR_OR_ZERO replace IS_ERR/PTR_ERR Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 244f548..b3131f5 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -62,10 +62,7 @@ EXPORT_SYMBOL(crc32c); static int __init libcrc32c_mod_init(void) { tfm = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - return 0; + return PTR_ERR_OR_ZERO(tfm); } static void __exit libcrc32c_mod_fini(void) -- cgit v0.10.2 From 3f623eba2a7fc01b0341f7989aa6c5ed91b9adb6 Mon Sep 17 00:00:00 2001 
From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:52 -0700 Subject: lib/vsprintf.c: fix comparison to bool Fixing 2 coccinelle warnings: lib/vsprintf.c:2350:2-9: WARNING: Assignment of bool to 0/1 lib/vsprintf.c:2389:3-10: WARNING: Assignment of bool to 0/1 Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 0648291..6fe2c84 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -2347,7 +2347,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) break; base = 10; - is_sign = 0; + is_sign = false; switch (*fmt++) { case 'c': @@ -2386,7 +2386,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) case 'i': base = 0; case 'd': - is_sign = 1; + is_sign = true; case 'u': break; case '%': -- cgit v0.10.2 From c75b53af2f0043aff500af0a6f878497bef41bca Mon Sep 17 00:00:00 2001 From: Minfei Huang Date: Wed, 4 Jun 2014 16:11:53 -0700 Subject: lib/btree.c: fix leak of whole btree nodes I use btree from 3.14-rc2 in my own module. When the btree module is removed, a warning arises: kmem_cache_destroy btree_node: Slab cache still has objects CPU: 13 PID: 9150 Comm: rmmod Tainted: GF O 3.14.0-rc2 #1 Hardware name: Inspur NF5270M3/NF5270M3, BIOS CHEETAH_2.1.3 09/10/2013 Call Trace: dump_stack+0x49/0x5d kmem_cache_destroy+0xcf/0xe0 btree_module_exit+0x10/0x12 [btree] SyS_delete_module+0x198/0x1f0 system_call_fastpath+0x16/0x1b The cause is that it doesn't release the last btree node, when height = 1 and fill = 1. 
[akpm@linux-foundation.org: remove unneeded test of NULL] Signed-off-by: Minfei Huang Cc: Joern Engel Cc: Johannes Berg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/btree.c b/lib/btree.c index f9a4846..4264871 100644 --- a/lib/btree.c +++ b/lib/btree.c @@ -198,6 +198,7 @@ EXPORT_SYMBOL_GPL(btree_init); void btree_destroy(struct btree_head *head) { + mempool_free(head->node, head->mempool); mempool_destroy(head->mempool); head->mempool = NULL; } -- cgit v0.10.2 From b8cfff68ea9cd7b25f07c1d5bb42567d084fcba3 Mon Sep 17 00:00:00 2001 From: Dan Streetman Date: Wed, 4 Jun 2014 16:11:54 -0700 Subject: lib/plist.c: make CONFIG_DEBUG_PI_LIST selectable Change CONFIG_DEBUG_PI_LIST to be user-selectable, and add a title and description. Remove the dependency on DEBUG_RT_MUTEXES since they were changed to use rbtrees, and there are other users of plists now. Signed-off-by: Dan Streetman Acked-by: Steven Rostedt Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index c2de650..ccca322 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -833,11 +833,6 @@ config DEBUG_RT_MUTEXES This allows rt mutex semantics violations and rt mutex related deadlocks (lockups) to be detected and reported automatically. -config DEBUG_PI_LIST - bool - default y - depends on DEBUG_RT_MUTEXES - config RT_MUTEX_TESTER bool "Built-in scriptable tester for rt-mutexes" depends on DEBUG_KERNEL && RT_MUTEXES @@ -1063,6 +1058,16 @@ config DEBUG_LIST If unsure, say N. +config DEBUG_PI_LIST + bool "Debug priority linked list manipulation" + depends on DEBUG_KERNEL + help + Enable this to turn on extended checks in the priority-ordered + linked-list (plist) walking routines. This checks the entire + list multiple times during each manipulation. + + If unsure, say N. 
+ config DEBUG_SG bool "Debug SG table operations" depends on DEBUG_KERNEL -- cgit v0.10.2 From 8e4c0b68489abd602af070367c1156f715a80339 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:55 -0700 Subject: lib/radix-tree.c: kernel-doc warning fix index has been removed from __radix_tree_delete_node in 449dd6984d0e47 ("mm: keep page cache radix tree nodes in check") Signed-off-by: Fabian Frederick Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 55f7a9c..d648156 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1296,7 +1296,6 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) /** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root - * @index: index key * @node: node containing @index * * After clearing the slot at @index in @node from radix tree -- cgit v0.10.2 From 38b4fe5fcc8690719339fb44afb330a75af08021 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:56 -0700 Subject: lib/crc32.c: remove unnecessary __constant Use cpu_to_le32 instead of __constant_cpu_to_le32. Signed-off-by: Fabian Frederick Cc: "David S. 
Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/crc32.c b/lib/crc32.c index 70f00ca..21a7b2135 100644 --- a/lib/crc32.c +++ b/lib/crc32.c @@ -33,13 +33,13 @@ #include "crc32defs.h" #if CRC_LE_BITS > 8 -# define tole(x) ((__force u32) __constant_cpu_to_le32(x)) +# define tole(x) ((__force u32) cpu_to_le32(x)) #else # define tole(x) (x) #endif #if CRC_BE_BITS > 8 -# define tobe(x) ((__force u32) __constant_cpu_to_be32(x)) +# define tobe(x) ((__force u32) cpu_to_be32(x)) #else # define tobe(x) (x) #endif -- cgit v0.10.2 From 54b14f40c5b13aeb179f68d82214e728617d5704 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:57 -0700 Subject: lib/digsig.c: kernel-doc warning fixes Small typo and @return: -> Returns ... Signed-off-by: Fabian Frederick Cc: Duan Jiong Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/digsig.c b/lib/digsig.c index 8793aed..ae05ea39 100644 --- a/lib/digsig.c +++ b/lib/digsig.c @@ -175,10 +175,11 @@ err1: * digsig_verify() - digital signature verification with public key * @keyring: keyring to search key in * @sig: digital signature - * @sigen: length of the signature + * @siglen: length of the signature * @data: data * @datalen: length of the data - * @return: 0 on success, -EINVAL otherwise + * + * Returns 0 on success, -EINVAL otherwise * * Verifies data integrity against digital signature. * Currently only RSA is supported. 
-- cgit v0.10.2 From 6d6a138f13e7cb5f20e4ee1c841b4bdaee5e0251 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:57 -0700 Subject: lib/nlattr.c: move EXPORT_SYMBOL after functions Fix some checkpatch warnings: WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable Signed-off-by: Fabian Frederick Cc: Pablo Neira Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/nlattr.c b/lib/nlattr.c index fc67547..0c57787 100644 --- a/lib/nlattr.c +++ b/lib/nlattr.c @@ -136,6 +136,7 @@ int nla_validate(const struct nlattr *head, int len, int maxtype, errout: return err; } +EXPORT_SYMBOL(nla_validate); /** * nla_policy_len - Determin the max. length of a policy @@ -162,6 +163,7 @@ nla_policy_len(const struct nla_policy *p, int n) return len; } +EXPORT_SYMBOL(nla_policy_len); /** * nla_parse - Parse a stream of attributes into a tb buffer @@ -208,6 +210,7 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head, errout: return err; } +EXPORT_SYMBOL(nla_parse); /** * nla_find - Find a specific attribute in a stream of attributes @@ -228,6 +231,7 @@ struct nlattr *nla_find(const struct nlattr *head, int len, int attrtype) return NULL; } +EXPORT_SYMBOL(nla_find); /** * nla_strlcpy - Copy string attribute payload into a sized buffer @@ -258,6 +262,7 @@ size_t nla_strlcpy(char *dst, const struct nlattr *nla, size_t dstsize) return srclen; } +EXPORT_SYMBOL(nla_strlcpy); /** * nla_memcpy - Copy a netlink attribute into another memory area @@ -278,6 +283,7 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) return minlen; } +EXPORT_SYMBOL(nla_memcpy); /** * nla_memcmp - Compare an attribute with sized memory area @@ -295,6 +301,7 @@ int nla_memcmp(const struct nlattr *nla, const void *data, return d; } +EXPORT_SYMBOL(nla_memcmp); /** * nla_strcmp - Compare a string attribute against a string @@ -317,6 +324,7 @@ int nla_strcmp(const struct nlattr *nla, const char *str) return d; } 
+EXPORT_SYMBOL(nla_strcmp); #ifdef CONFIG_NET /** @@ -502,12 +510,3 @@ int nla_append(struct sk_buff *skb, int attrlen, const void *data) } EXPORT_SYMBOL(nla_append); #endif - -EXPORT_SYMBOL(nla_validate); -EXPORT_SYMBOL(nla_policy_len); -EXPORT_SYMBOL(nla_parse); -EXPORT_SYMBOL(nla_find); -EXPORT_SYMBOL(nla_strlcpy); -EXPORT_SYMBOL(nla_memcpy); -EXPORT_SYMBOL(nla_memcmp); -EXPORT_SYMBOL(nla_strcmp); -- cgit v0.10.2 From ce643a30d1c8bd31b6310f59f6d7236c9904c3bf Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:11:58 -0700 Subject: lib/textsearch.c: move EXPORT_SYMBOL after functions Fix checkpatch warning: "WARNING: EXPORT_SYMBOL(foo); should immediately follow its function/variable" Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/textsearch.c b/lib/textsearch.c index e0cc014..0c7e9ab 100644 --- a/lib/textsearch.c +++ b/lib/textsearch.c @@ -159,6 +159,7 @@ errout: spin_unlock(&ts_mod_lock); return err; } +EXPORT_SYMBOL(textsearch_register); /** * textsearch_unregister - unregister a textsearch module @@ -190,6 +191,7 @@ out: spin_unlock(&ts_mod_lock); return err; } +EXPORT_SYMBOL(textsearch_unregister); struct ts_linear_state { @@ -236,6 +238,7 @@ unsigned int textsearch_find_continuous(struct ts_config *conf, return textsearch_find(conf, state); } +EXPORT_SYMBOL(textsearch_find_continuous); /** * textsearch_prepare - Prepare a search @@ -298,6 +301,7 @@ errout: return ERR_PTR(err); } +EXPORT_SYMBOL(textsearch_prepare); /** * textsearch_destroy - destroy a search configuration @@ -316,9 +320,4 @@ void textsearch_destroy(struct ts_config *conf) kfree(conf); } - -EXPORT_SYMBOL(textsearch_register); -EXPORT_SYMBOL(textsearch_unregister); -EXPORT_SYMBOL(textsearch_prepare); -EXPORT_SYMBOL(textsearch_find_continuous); EXPORT_SYMBOL(textsearch_destroy); -- cgit v0.10.2 From c56ba70331d9f3c1ea77f8053095fb05fe773f50 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 
16:11:59 -0700 Subject: lib/bug.c: convert printk to pr_foo() - Coalesce formats - "WARNING:" prefix unchanged to keep bug format. - printk(KERN_DEFAULT not converted. - define pr_fmt without prefix to avoid any default prefix update (suggested by Joe Perches). Signed-off-by: Fabian Frederick Cc: Jeremy Fitzhardinge Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/bug.c b/lib/bug.c index 1686034..d1d7c78 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -37,6 +37,9 @@ Jeremy Fitzhardinge 2006 */ + +#define pr_fmt(fmt) fmt + #include #include #include @@ -153,15 +156,13 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) if (warning) { /* this is a WARN_ON rather than BUG/BUG_ON */ - printk(KERN_WARNING "------------[ cut here ]------------\n"); + pr_warn("------------[ cut here ]------------\n"); if (file) - printk(KERN_WARNING "WARNING: at %s:%u\n", - file, line); + pr_warn("WARNING: at %s:%u\n", file, line); else - printk(KERN_WARNING "WARNING: at %p " - "[verbose debug info unavailable]\n", - (void *)bugaddr); + pr_warn("WARNING: at %p [verbose debug info unavailable]\n", + (void *)bugaddr); print_modules(); show_regs(regs); @@ -174,12 +175,10 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) printk(KERN_DEFAULT "------------[ cut here ]------------\n"); if (file) - printk(KERN_CRIT "kernel BUG at %s:%u!\n", - file, line); + pr_crit("kernel BUG at %s:%u!\n", file, line); else - printk(KERN_CRIT "Kernel BUG at %p " - "[verbose debug info unavailable]\n", - (void *)bugaddr); + pr_crit("Kernel BUG at %p [verbose debug info unavailable]\n", + (void *)bugaddr); return BUG_TRAP_TYPE_BUG; } -- cgit v0.10.2 From b3b16d284a4121a9eb294ec0012928591993b37c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:00 -0700 Subject: lib/atomic64_test.c: convert printk(KERN_INFO to pr_info Convert printk to current pr_foo() logging functions. 
Also add pr_fmt based on KBUILD_MODNAME to avoid repeating prefix. Prefix is now "atomic64_test: " Signed-off-by: Fabian Frederick Cc: Luca Barbieri Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/atomic64_test.c b/lib/atomic64_test.c index 00bca22..0211d30 100644 --- a/lib/atomic64_test.c +++ b/lib/atomic64_test.c @@ -8,6 +8,9 @@ * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -146,18 +149,18 @@ static __init int test_atomic64(void) BUG_ON(v.counter != r); #ifdef CONFIG_X86 - printk(KERN_INFO "atomic64 test passed for %s platform %s CX8 and %s SSE\n", + pr_info("passed for %s platform %s CX8 and %s SSE\n", #ifdef CONFIG_X86_64 - "x86-64", + "x86-64", #elif defined(CONFIG_X86_CMPXCHG64) - "i586+", + "i586+", #else - "i386+", + "i386+", #endif boot_cpu_has(X86_FEATURE_CX8) ? "with" : "without", boot_cpu_has(X86_FEATURE_XMM) ? "with" : "without"); #else - printk(KERN_INFO "atomic64 test passed\n"); + pr_info("passed\n"); #endif return 0; -- cgit v0.10.2 From 548bbff9818c7ddd325b47face1c9bf9a53ad0c7 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:01 -0700 Subject: lib/asn1_decoder.c: kernel-doc warning fix Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/lib/asn1_decoder.c b/lib/asn1_decoder.c index 11b9b01..1a000bb 100644 --- a/lib/asn1_decoder.c +++ b/lib/asn1_decoder.c @@ -140,7 +140,7 @@ error: * @decoder: The decoder definition (produced by asn1_compiler) * @context: The caller's context (to be passed to the action functions) * @data: The encoded data - * @datasize: The size of the encoded data + * @datalen: The size of the encoded data * * Decode BER/DER/CER encoded ASN.1 data according to a bytecode pattern * produced by asn1_compiler. 
Action functions are called on marked tags to -- cgit v0.10.2 From 6516a466193fe7f72644d65467fb9905139228c3 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:02 -0700 Subject: kernel/compat.c: use sizeof() instead of sizeof Fix 4 checkpatch warnings WARNING: sizeof *tv should be sizeof(*tv) Signed-off-by: Fabian Frederick Cc: "H. Peter Anvin" Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/compat.c b/kernel/compat.c index e40b043..633394f 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -157,7 +157,7 @@ static int __compat_put_timespec(const struct timespec *ts, struct compat_timesp int compat_get_timeval(struct timeval *tv, const void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(tv, utv, sizeof *tv) ? -EFAULT : 0; + return copy_from_user(tv, utv, sizeof(*tv)) ? -EFAULT : 0; else return __compat_get_timeval(tv, utv); } @@ -166,7 +166,7 @@ EXPORT_SYMBOL_GPL(compat_get_timeval); int compat_put_timeval(const struct timeval *tv, void __user *utv) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(utv, tv, sizeof *tv) ? -EFAULT : 0; + return copy_to_user(utv, tv, sizeof(*tv)) ? -EFAULT : 0; else return __compat_put_timeval(tv, utv); } @@ -175,7 +175,7 @@ EXPORT_SYMBOL_GPL(compat_put_timeval); int compat_get_timespec(struct timespec *ts, const void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_from_user(ts, uts, sizeof *ts) ? -EFAULT : 0; + return copy_from_user(ts, uts, sizeof(*ts)) ? -EFAULT : 0; else return __compat_get_timespec(ts, uts); } @@ -184,7 +184,7 @@ EXPORT_SYMBOL_GPL(compat_get_timespec); int compat_put_timespec(const struct timespec *ts, void __user *uts) { if (COMPAT_USE_64BIT_TIME) - return copy_to_user(uts, ts, sizeof *ts) ? -EFAULT : 0; + return copy_to_user(uts, ts, sizeof(*ts)) ? 
-EFAULT : 0; else return __compat_put_timespec(ts, uts); } -- cgit v0.10.2 From 185d566bcd0a8e83fe762b3bbef1d58347b9a034 Mon Sep 17 00:00:00 2001 From: Rob Herring Date: Wed, 4 Jun 2014 16:12:03 -0700 Subject: checkpatch: fix wildcard DT compatible string checking We attempt to search for compatible strings which use a variable token in the documented name such as <chip> or <soc>. While this was attempted to be handled, it's utterly broken. The desired forms of matching are: vendor,<chip>-* vendor,name<part#>-* For <chip>, lower case characters and numbers are permitted. For <part#>, only numeric values are allowed. With this change, the number of missing compatible strings reported in arch/arm/boot/dts is reduced from 1071 to 960. Reported-by: Alexandre Belloni Signed-off-by: Rob Herring Tested-by: Geert Uytterhoeven Cc: Florian Vaussard Cc: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 34eb216..62d005e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2093,8 +2093,10 @@ sub process { foreach my $compat (@compats) { my $compat2 = $compat; - $compat2 =~ s/\,[a-z]*\-/\,<\.\*>\-/; - `grep -Erq "$compat|$compat2" $dt_path`; + $compat2 =~ s/\,[a-zA-Z0-9]*\-/\,<\.\*>\-/; + my $compat3 = $compat; + $compat3 =~ s/\,([a-z]*)[0-9]*\-/\,$1<\.\*>\-/; + `grep -Erq "$compat|$compat2|$compat3" $dt_path`; if ( $? >> 8 ) { WARN("UNDOCUMENTED_DT_STRING", "DT compatible string \"$compat\" appears un-documented -- check $dt_path\n" . $herecurr); -- cgit v0.10.2 From 3f7bac031c6ba61c89b06b279f74a25309da1625 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Jun 2014 16:12:04 -0700 Subject: checkpatch: always warn on missing blank line after variable declaration block Make the test system wide, modify the message too.
Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 62d005e..f2ef63a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -397,6 +397,11 @@ foreach my $entry (@mode_permission_funcs) { $mode_perms_search .= $entry->[0]; } +our $declaration_macros = qr{(?x: + (?:$Storage\s+)?(?:DECLARE|DEFINE)_[A-Z]+\s*\(| + (?:$Storage\s+)?LIST_HEAD\s*\( +)}; + our $allowed_asm_includes = qr{(?x: irq| memory @@ -2268,18 +2273,37 @@ sub process { } # check for missing blank lines after declarations - if ($realfile =~ m@^(drivers/net/|net/)@ && - $prevline =~ /^\+\s+$Declare\s+$Ident/ && - !($prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || - $prevline =~ /(?:\{\s*|\\)$/) && #extended lines - $sline =~ /^\+\s+/ && #Not at char 1 - !($sline =~ /^\+\s+$Declare/ || - $sline =~ /^\+\s+$Ident\s+$Ident/ || #eg: typedef foo + if ($sline =~ /^\+\s+\S/ && #Not at char 1 + # actual declarations + ($prevline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || + # foo bar; where foo is some local typedef or #define + $prevline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || + # known declaration macros + $prevline =~ /^\+\s+$declaration_macros/) && + # for "else if" which can look like "$Ident $Ident" + !($prevline =~ /^\+\s+$c90_Keywords\b/ || + # other possible extensions of declaration lines + $prevline =~ /(?:$Compare|$Assignment|$Operators)\s*$/ || + # not starting a section or a macro "\" extended line + $prevline =~ /(?:\{\s*|\\)$/) && + # looks like a declaration + !($sline =~ /^\+\s+$Declare\s*$Ident\s*[=,;:\[]/ || + # foo bar; where foo is some local typedef or #define + $sline =~ /^\+\s+$Ident(?:\s+|\s*\*\s*)$Ident\s*[=,;\[]/ || + # known declaration macros + $sline =~ /^\+\s+$declaration_macros/ || + # start of struct or union or enum $sline =~ /^\+\s+(?:union|struct|enum|typedef)\b/ || - $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(])/ || - $sline =~ 
/^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/)) { + # start or end of block or continuation of declaration + $sline =~ /^\+\s+(?:$|[\{\}\.\#\"\?\:\(\[])/ || + # bitfield continuation + $sline =~ /^\+\s+$Ident\s*:\s*\d+\s*[,;]/ || + # other possible extensions of declaration lines + $sline =~ /^\+\s+\(?\s*(?:$Compare|$Assignment|$Operators)/) && + # indentation of previous and current line are the same + (($prevline =~ /\+(\s+)\S/) && $sline =~ /^\+$1\S/)) { WARN("SPACING", - "networking uses a blank line after declarations\n" . $hereprev); + "Missing a blank line after declarations\n" . $hereprev); } # check for spaces at the beginning of a line. -- cgit v0.10.2 From 2ac73b4f685e699ccdfa6855e826df846999d577 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Jun 2014 16:12:05 -0700 Subject: checkpatch: make --strict a default for files in drivers/net and net/ Networking files are generally more strictly conformant to linux-kernel style so make checkpatch more verbose by default for patches to files or when checking files in these directories. 
Signed-off-by: Joe Perches Cc: Andy Whitcroft Cc: David Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f2ef63a..bb4c842 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -24,6 +24,7 @@ my $emacs = 0; my $terse = 0; my $file = 0; my $check = 0; +my $check_orig = 0; my $summary = 1; my $mailback = 0; my $summary_file = 0; @@ -146,6 +147,7 @@ GetOptions( help(0) if ($help); $fix = 1 if ($fix_inplace); +$check_orig = $check; my $exit = 0; @@ -1813,11 +1815,13 @@ sub process { $here = "#$linenr: " if (!$file); $here = "#$realline: " if ($file); + my $found_file = 0; # extract the filename as it passes if ($line =~ /^diff --git.*?(\S+)$/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@ if (!$file); $in_commit_log = 0; + $found_file = 1; } elsif ($line =~ /^\+\+\+\s+(\S+)/) { $realfile = $1; $realfile =~ s@^([^/]*)/@@ if (!$file); @@ -1834,6 +1838,15 @@ sub process { ERROR("MODIFIED_INCLUDE_ASM", "do not modify files in include/asm, change architecture specific files in include/asm-\n" . "$here$rawline\n"); } + $found_file = 1; + } + + if ($found_file) { + if ($realfile =~ m@^(drivers/net/|net/)@) { + $check = 1; + } else { + $check = $check_orig; + } next; } -- cgit v0.10.2 From f5ef95b12eb03ae4b3994cdb035612e127b630b9 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Jun 2014 16:12:06 -0700 Subject: checkpatch: warn on #defines ending in semicolon Using a #define ending in a semicolon is poor style and can lead to unexpected code paths being executed. Warn on uses of these #define types: #define foo[(...)] bar; #define foo[(...)] \ bar; Based on a patch from Borislav Petkov. 
Signed-off-by: Joe Perches Cc: Borislav Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index bb4c842..e7ff52a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3821,6 +3821,17 @@ sub process { WARN("DO_WHILE_MACRO_WITH_TRAILING_SEMICOLON", "do {} while (0) macros should not be semicolon terminated\n" . "$herectx"); } + } elsif ($dstat =~ /^\+\s*#\s*define\s+$Ident.*;\s*$/) { + $ctx =~ s/\n*$//; + my $cnt = statement_rawlines($ctx); + my $herectx = $here . "\n"; + + for (my $n = 0; $n < $cnt; $n++) { + $herectx .= raw_line($linenr, $n) . "\n"; + } + + WARN("TRAILING_SEMICOLON", + "macros should not use a trailing semicolon\n" . "$herectx"); } } -- cgit v0.10.2 From 60a55369aad3336e604218abf2057363f69c0722 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Jun 2014 16:12:07 -0700 Subject: checkpatch: add warning for kmalloc/kzalloc with multiply Protect against sizeof overflows by preferring kmalloc_array/kcalloc over kmalloc/kzalloc with a sizeof multiply. Signed-off-by: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e7ff52a..7774025 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4378,6 +4378,30 @@ sub process { "Prefer $3(sizeof(*$1)...) over $3($4...)\n" . $herecurr); } +# check for k[mz]alloc with multiplies that could be kmalloc_array/kcalloc + if ($^V && $^V ge 5.10.0 && + $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/) { + my $oldfunc = $3; + my $a1 = $4; + my $a2 = $10; + my $newfunc = "kmalloc_array"; + $newfunc = "kcalloc" if ($oldfunc eq "kzalloc"); + if ($a1 =~ /^sizeof\s*\S/ || $a2 =~ /^sizeof\s*\S/) { + if (WARN("ALLOC_WITH_MULTIPLY", + "Prefer $newfunc over $oldfunc with multiply\n" . 
$herecurr) && + $fix) { + my $r1 = $a1; + my $r2 = $a2; + if ($a1 =~ /^sizeof\s*\S/) { + $r1 = $a2; + $r2 = $a1; + } + $fixed[$linenr - 1] =~ s/\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*(k[mz]alloc)\s*\(\s*($FuncArg)\s*\*\s*($FuncArg)/$1 . ' = ' . "$newfunc(" . trim($r1) . ', ' . trim($r2)/e; + + } + } + } + # check for krealloc arg reuse if ($^V && $^V ge 5.10.0 && $line =~ /\b($Lval)\s*\=\s*(?:$balanced_parens)?\s*krealloc\s*\(\s*\1\s*,/) { -- cgit v0.10.2 From afc819ab0293be3bd5c16d9eba26f9d57f61c42a Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Jun 2014 16:12:08 -0700 Subject: checkpatch: prefer kstrto to sscanf(buf, "%", &bar); Use the kstrto functions in preference to sscanf. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 7774025..862cc7a 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4314,6 +4314,27 @@ sub process { "unchecked sscanf return value\n" . "$here\n$stat_real\n"); } +# check for simple sscanf that should be kstrto + if ($^V && $^V ge 5.10.0 && + defined $stat && + $line =~ /\bsscanf\b/) { + my $lc = $stat =~ tr@\n@@; + $lc = $lc + $linenr; + my $stat_real = raw_line($linenr, 0); + for (my $count = $linenr + 1; $count <= $lc; $count++) { + $stat_real = $stat_real . "\n" . raw_line($count, 0); + } + if ($stat_real =~ /\bsscanf\b\s*\(\s*$FuncArg\s*,\s*("[^"]+")/) { + my $format = $6; + my $count = $format =~ tr@%@%@; + if ($count == 1 && + $format =~ /^"\%(?i:ll[udxi]|[udxi]ll|ll|[hl]h?[udxi]|[udxi][hl]h?|[hl]h?|[udxi])"$/) { + WARN("SSCANF_TO_KSTRTO", + "Prefer kstrto to single variable sscanf\n" . "$here\n$stat_real\n"); + } + } + } + # check for new externs in .h files. 
if ($realfile =~ /\.h$/ && $line =~ /^\+\s*(extern\s+)$Type\s*$Ident\s*\(/s) { -- cgit v0.10.2 From 9819cf252a0fad1bf46aac8a051cf30426e073ee Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Jun 2014 16:12:09 -0700 Subject: checkpatch: warn on unnecessary void function return statements void function lines that use a single tab then "return;" are generally unnecessary. Signed-off-by: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 862cc7a..f354ae6 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -3470,6 +3470,13 @@ sub process { } } +# unnecessary return in a void function? (a single leading tab, then return;) + if ($sline =~ /^\+\treturn\s*;\s*$/ && + $prevline =~ /^\+/) { + WARN("RETURN_VOID", + "void function return statements are not generally useful\n" . $herecurr); + } + # if statements using unnecessary parentheses - ie: if ((foo == bar)) if ($^V && $^V ge 5.10.0 && $line =~ /\bif\s*((?:\(\s*){2,})/) { -- cgit v0.10.2 From 9b3189eb424044e51c8847063b2ba314b57b6ed2 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Wed, 4 Jun 2014 16:12:10 -0700 Subject: checkpatch: check stable email address It should be stable@vger.kernel.org, not stable@kernel.org. Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index f354ae6..0ef4ae1 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1944,6 +1944,12 @@ sub process { } } +# Check for old stable address + if ($line =~ /^\s*cc:\s*.*?.*$/i) { + ERROR("STABLE_ADDRESS", + "The 'stable' address should be 'stable\@vger.kernel.org'\n" . 
$herecurr); + } + # Check for unwanted Gerrit info if ($in_commit_log && $line =~ /^\s*change-id:/i) { ERROR("GERRIT_CHANGE_ID", -- cgit v0.10.2 From ae3ccc4678fec2d270a4c54981831c7b8a2da9cd Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:10 -0700 Subject: scripts/checkpatch.pl: device_initcall is not the only __initcall substitute This patch adds a link to init.h to find appropriate initcall function to replace obsolete __initcall Signed-off-by: Fabian Frederick Cc: Andy Whitcroft Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index 0ef4ae1..010b18e 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -4551,10 +4551,10 @@ sub process { "$1 is obsolete, use k$3 instead\n" . $herecurr); } -# check for __initcall(), use device_initcall() explicitly please +# check for __initcall(), use device_initcall() explicitly or more appropriate function please if ($line =~ /^.\s*__initcall\s*\(/) { WARN("USE_DEVICE_INITCALL", - "please use device_initcall() instead of __initcall()\n" . $herecurr); + "please use device_initcall() or more appropriate function instead of __initcall() (see include/linux/init.h)\n" . $herecurr); } # check for various ops structs, ensure they are const. 
-- cgit v0.10.2 From 179b87fb186b524ec75a5d54c0d7f25e8d559415 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:11 -0700 Subject: fs/efs: convert printk to pr_foo() Convert all except KERN_DEBUG (pr_debug doesn't work the same as printk(KERN_DEBUG and requires special check) Signed-off-by: Fabian Frederick Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/efs/dir.c b/fs/efs/dir.c index b72307c..46a9a60 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -26,7 +26,7 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) int slot; if (inode->i_size & (EFS_DIRBSIZE-1)) - printk(KERN_WARNING "EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); + pr_warn("EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); /* work out where this entry can be found */ block = ctx->pos >> EFS_DIRBSIZE_BITS; @@ -43,14 +43,14 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); if (!bh) { - printk(KERN_ERR "EFS: readdir(): failed to read dir block %d\n", block); + pr_err("EFS: readdir(): failed to read dir block %d\n", block); break; } dirblock = (struct efs_dir *) bh->b_data; if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { - printk(KERN_ERR "EFS: readdir(): invalid directory block\n"); + pr_err("EFS: readdir(): invalid directory block\n"); brelse(bh); break; } @@ -80,7 +80,7 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) /* sanity check */ if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { - printk(KERN_WARNING "EFS: directory entry %d exceeds directory block\n", slot); + pr_warn("EFS: directory entry %d exceeds directory block\n", slot); continue; } diff --git a/fs/efs/file.c b/fs/efs/file.c index 1ccb364..a75c710 100644 --- a/fs/efs/file.c +++ b/fs/efs/file.c @@ -22,7 +22,7 @@ int efs_get_block(struct inode *inode, sector_t iblock, /* * i have no idea why 
this happens as often as it does */ - printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", + pr_warn("EFS: bmap(): block %d >= %ld (filesize %ld)\n", block, inode->i_blocks, inode->i_size); @@ -38,7 +38,7 @@ int efs_get_block(struct inode *inode, sector_t iblock, int efs_bmap(struct inode *inode, efs_block_t block) { if (block < 0) { - printk(KERN_WARNING "EFS: bmap(): block < 0\n"); + pr_warn("EFS: bmap(): block < 0\n"); return 0; } @@ -48,10 +48,8 @@ int efs_bmap(struct inode *inode, efs_block_t block) { /* * i have no idea why this happens as often as it does */ - printk(KERN_WARNING "EFS: bmap(): block %d >= %ld (filesize %ld)\n", - block, - inode->i_blocks, - inode->i_size); + pr_warn("EFS: bmap(): block %d >= %ld (filesize %ld)\n", + block, inode->i_blocks, inode->i_size); #endif return 0; } diff --git a/fs/efs/inode.c b/fs/efs/inode.c index d15ccf2..54f1cbb 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -89,7 +89,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) bh = sb_bread(inode->i_sb, block); if (!bh) { - printk(KERN_WARNING "EFS: bread() failed at block %d\n", block); + pr_warn("EFS: bread() failed at block %d\n", block); goto read_inode_error; } @@ -130,7 +130,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) for(i = 0; i < EFS_DIRECTEXTENTS; i++) { extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { - printk(KERN_WARNING "EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); + pr_warn("EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); brelse(bh); goto read_inode_error; } @@ -162,7 +162,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) init_special_inode(inode, inode->i_mode, device); break; default: - printk(KERN_WARNING "EFS: unsupported inode mode %o\n", inode->i_mode); + pr_warn("EFS: unsupported inode mode %o\n", inode->i_mode); goto 
read_inode_error; break; } @@ -171,7 +171,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) return inode; read_inode_error: - printk(KERN_WARNING "EFS: failed to read inode %lu\n", inode->i_ino); + pr_warn("EFS: failed to read inode %lu\n", inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); } @@ -216,7 +216,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { /* if we only have one extent then nothing can be found */ if (in->numextents == 1) { - printk(KERN_ERR "EFS: map_block() failed to map (1 extent)\n"); + pr_err("EFS: map_block() failed to map (1 extent)\n"); return 0; } @@ -234,7 +234,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } } - printk(KERN_ERR "EFS: map_block() failed to map block %u (dir)\n", block); + pr_err("EFS: map_block() failed to map block %u (dir)\n", block); return 0; } @@ -262,7 +262,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { if (dirext == direxts) { /* should never happen */ - printk(KERN_ERR "EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); + pr_err("EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); if (bh) brelse(bh); return 0; } @@ -279,7 +279,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { bh = sb_bread(inode->i_sb, iblock); if (!bh) { - printk(KERN_ERR "EFS: bread() failed at block %d\n", iblock); + pr_err("EFS: bread() failed at block %d\n", iblock); return 0; } #ifdef DEBUG @@ -294,7 +294,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { extent_copy(&(exts[ioffset]), &ext); if (ext.cooked.ex_magic != 0) { - printk(KERN_ERR "EFS: extent %d has bad magic number in block %d\n", cur, iblock); + pr_err("EFS: extent %d has bad magic number in block %d\n", cur, iblock); if (bh) brelse(bh); return 0; } @@ -306,7 +306,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } } if (bh) brelse(bh); - 
printk(KERN_ERR "EFS: map_block() failed to map block %u (indir)\n", block); + pr_err("EFS: map_block() failed to map block %u (indir)\n", block); return 0; } diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 96f66d2..527d0b9 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -23,20 +23,20 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) efs_block_t block; if (inode->i_size & (EFS_DIRBSIZE-1)) - printk(KERN_WARNING "EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); + pr_warn("EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); for(block = 0; block < inode->i_blocks; block++) { bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); if (!bh) { - printk(KERN_ERR "EFS: find_entry(): failed to read dir block %d\n", block); + pr_err("EFS: find_entry(): failed to read dir block %d\n", block); return 0; } dirblock = (struct efs_dir *) bh->b_data; if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { - printk(KERN_ERR "EFS: find_entry(): invalid directory block\n"); + pr_err("EFS: find_entry(): invalid directory block\n"); brelse(bh); return(0); } diff --git a/fs/efs/super.c b/fs/efs/super.c index 3befcc9..0590232 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -134,7 +134,7 @@ static const struct export_operations efs_export_ops = { static int __init init_efs_fs(void) { int err; - printk("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); + pr_info("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); err = init_inodecache(); if (err) goto out1; @@ -179,7 +179,7 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { csum += be32_to_cpu(cs); } if (csum) { - printk(KERN_INFO "EFS: SGI disklabel: checksum bad, label corrupted\n"); + pr_warn("EFS: SGI disklabel: checksum bad, label corrupted\n"); return 0; } @@ -226,11 +226,10 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { } if (slice == -1) { - printk(KERN_NOTICE "EFS: partition table contained no 
EFS partitions\n"); + pr_notice("EFS: partition table contained no EFS partitions\n"); #ifdef DEBUG } else { - printk(KERN_INFO "EFS: using slice %d (type %s, offset 0x%x)\n", - slice, + pr_info("EFS: using slice %d (type %s, offset 0x%x)\n", slice, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", sblock); #endif @@ -268,7 +267,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { - printk(KERN_ERR "EFS: device does not support %d byte blocks\n", + pr_err("EFS: device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } @@ -277,7 +276,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) bh = sb_bread(s, 0); if (!bh) { - printk(KERN_ERR "EFS: cannot read volume header\n"); + pr_err("EFS: cannot read volume header\n"); return -EINVAL; } @@ -295,13 +294,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { - printk(KERN_ERR "EFS: cannot read superblock\n"); + pr_err("EFS: cannot read superblock\n"); return -EINVAL; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG - printk(KERN_WARNING "EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); + pr_warn("EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); #endif brelse(bh); return -EINVAL; @@ -310,7 +309,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!(s->s_flags & MS_RDONLY)) { #ifdef DEBUG - printk(KERN_INFO "EFS: forcing read-only mode\n"); + pr_info("EFS: forcing read-only mode\n"); #endif s->s_flags |= MS_RDONLY; } @@ -318,13 +317,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) s->s_export_op = &efs_export_ops; root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { - printk(KERN_ERR "EFS: get root inode failed\n"); + pr_err("EFS: get root inode failed\n"); return PTR_ERR(root); } s->s_root = 
d_make_root(root); if (!(s->s_root)) { - printk(KERN_ERR "EFS: get root dentry failed\n"); + pr_err("EFS: get root dentry failed\n"); return -ENOMEM; } -- cgit v0.10.2 From f403d1dbac6d1ef28f553f3996d5bb5cea90ce15 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:12 -0700 Subject: fs/efs: add pr_fmt / use __func__ Also uniformize function arguments. Signed-off-by: Fabian Frederick Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 46a9a60..7f97031 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -26,7 +26,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) int slot; if (inode->i_size & (EFS_DIRBSIZE-1)) - pr_warn("EFS: WARNING: readdir(): directory size not a multiple of EFS_DIRBSIZE\n"); + pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", + __func__); /* work out where this entry can be found */ block = ctx->pos >> EFS_DIRBSIZE_BITS; @@ -43,14 +44,15 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); if (!bh) { - pr_err("EFS: readdir(): failed to read dir block %d\n", block); + pr_err("%s(): failed to read dir block %d\n", + __func__, block); break; } dirblock = (struct efs_dir *) bh->b_data; if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { - pr_err("EFS: readdir(): invalid directory block\n"); + pr_err("%s(): invalid directory block\n", __func__); brelse(bh); break; } @@ -80,7 +82,8 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) /* sanity check */ if (nameptr - (char *) dirblock + namelen > EFS_DIRBSIZE) { - pr_warn("EFS: directory entry %d exceeds directory block\n", slot); + pr_warn("directory entry %d exceeds directory block\n", + slot); continue; } diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 5528926..5bbf961 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -7,6 +7,12 @@ #ifndef _EFS_EFS_H_ #define _EFS_EFS_H_ +#ifdef pr_fmt 
+#undef pr_fmt +#endif + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include diff --git a/fs/efs/file.c b/fs/efs/file.c index a75c710..a37dcee 100644 --- a/fs/efs/file.c +++ b/fs/efs/file.c @@ -22,10 +22,8 @@ int efs_get_block(struct inode *inode, sector_t iblock, /* * i have no idea why this happens as often as it does */ - pr_warn("EFS: bmap(): block %d >= %ld (filesize %ld)\n", - block, - inode->i_blocks, - inode->i_size); + pr_warn("%s(): block %d >= %ld (filesize %ld)\n", + __func__, block, inode->i_blocks, inode->i_size); #endif return 0; } @@ -38,7 +36,7 @@ int efs_get_block(struct inode *inode, sector_t iblock, int efs_bmap(struct inode *inode, efs_block_t block) { if (block < 0) { - pr_warn("EFS: bmap(): block < 0\n"); + pr_warn("%s(): block < 0\n", __func__); return 0; } @@ -48,8 +46,8 @@ int efs_bmap(struct inode *inode, efs_block_t block) { /* * i have no idea why this happens as often as it does */ - pr_warn("EFS: bmap(): block %d >= %ld (filesize %ld)\n", - block, inode->i_blocks, inode->i_size); + pr_warn("%s(): block %d >= %ld (filesize %ld)\n", + __func__, block, inode->i_blocks, inode->i_size); #endif return 0; } diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 54f1cbb..6c9684a 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -89,7 +89,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) bh = sb_bread(inode->i_sb, block); if (!bh) { - pr_warn("EFS: bread() failed at block %d\n", block); + pr_warn("%s() failed at block %d\n", __func__, block); goto read_inode_error; } @@ -130,7 +130,8 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) for(i = 0; i < EFS_DIRECTEXTENTS; i++) { extent_copy(&(efs_inode->di_u.di_extents[i]), &(in->extents[i])); if (i < in->numextents && in->extents[i].cooked.ex_magic != 0) { - pr_warn("EFS: extent %d has bad magic number in inode %lu\n", i, inode->i_ino); + pr_warn("extent %d has bad magic number in inode %lu\n", + i, inode->i_ino); brelse(bh); goto 
read_inode_error; } @@ -162,7 +163,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) init_special_inode(inode, inode->i_mode, device); break; default: - pr_warn("EFS: unsupported inode mode %o\n", inode->i_mode); + pr_warn("unsupported inode mode %o\n", inode->i_mode); goto read_inode_error; break; } @@ -171,7 +172,7 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) return inode; read_inode_error: - pr_warn("EFS: failed to read inode %lu\n", inode->i_ino); + pr_warn("failed to read inode %lu\n", inode->i_ino); iget_failed(inode); return ERR_PTR(-EIO); } @@ -216,7 +217,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { /* if we only have one extent then nothing can be found */ if (in->numextents == 1) { - pr_err("EFS: map_block() failed to map (1 extent)\n"); + pr_err("%s() failed to map (1 extent)\n", __func__); return 0; } @@ -234,7 +235,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } } - pr_err("EFS: map_block() failed to map block %u (dir)\n", block); + pr_err("%s() failed to map block %u (dir)\n", __func__, block); return 0; } @@ -262,7 +263,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { if (dirext == direxts) { /* should never happen */ - pr_err("EFS: couldn't find direct extent for indirect extent %d (block %u)\n", cur, block); + pr_err("couldn't find direct extent for indirect extent %d (block %u)\n", + cur, block); if (bh) brelse(bh); return 0; } @@ -279,7 +281,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { bh = sb_bread(inode->i_sb, iblock); if (!bh) { - pr_err("EFS: bread() failed at block %d\n", iblock); + pr_err("%s() failed at block %d\n", + __func__, iblock); return 0; } #ifdef DEBUG @@ -294,7 +297,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { extent_copy(&(exts[ioffset]), &ext); if (ext.cooked.ex_magic != 0) { - pr_err("EFS: extent %d has bad magic number in block %d\n", cur, 
iblock); + pr_err("extent %d has bad magic number in block %d\n", + cur, iblock); if (bh) brelse(bh); return 0; } @@ -306,7 +310,7 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { } } if (bh) brelse(bh); - pr_err("EFS: map_block() failed to map block %u (indir)\n", block); + pr_err("%s() failed to map block %u (indir)\n", __func__, block); return 0; } diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 527d0b9..356c044 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -23,20 +23,22 @@ static efs_ino_t efs_find_entry(struct inode *inode, const char *name, int len) efs_block_t block; if (inode->i_size & (EFS_DIRBSIZE-1)) - pr_warn("EFS: WARNING: find_entry(): directory size not a multiple of EFS_DIRBSIZE\n"); + pr_warn("%s(): directory size not a multiple of EFS_DIRBSIZE\n", + __func__); for(block = 0; block < inode->i_blocks; block++) { bh = sb_bread(inode->i_sb, efs_bmap(inode, block)); if (!bh) { - pr_err("EFS: find_entry(): failed to read dir block %d\n", block); + pr_err("%s(): failed to read dir block %d\n", + __func__, block); return 0; } dirblock = (struct efs_dir *) bh->b_data; if (be16_to_cpu(dirblock->magic) != EFS_DIRBLK_MAGIC) { - pr_err("EFS: find_entry(): invalid directory block\n"); + pr_err("%s(): invalid directory block\n", __func__); brelse(bh); return(0); } diff --git a/fs/efs/super.c b/fs/efs/super.c index 0590232..cd1399e 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -134,7 +134,7 @@ static const struct export_operations efs_export_ops = { static int __init init_efs_fs(void) { int err; - pr_info("EFS: "EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); + pr_info(EFS_VERSION" - http://aeschi.ch.eu.org/efs/\n"); err = init_inodecache(); if (err) goto out1; @@ -179,7 +179,7 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { csum += be32_to_cpu(cs); } if (csum) { - pr_warn("EFS: SGI disklabel: checksum bad, label corrupted\n"); + pr_warn("SGI disklabel: checksum bad, label corrupted\n"); return 0; } @@ 
-226,10 +226,10 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { } if (slice == -1) { - pr_notice("EFS: partition table contained no EFS partitions\n"); + pr_notice("partition table contained no EFS partitions\n"); #ifdef DEBUG } else { - pr_info("EFS: using slice %d (type %s, offset 0x%x)\n", slice, + pr_info("using slice %d (type %s, offset 0x%x)\n", slice, (pt_entry->pt_name) ? pt_entry->pt_name : "unknown", sblock); #endif @@ -267,7 +267,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) s->s_magic = EFS_SUPER_MAGIC; if (!sb_set_blocksize(s, EFS_BLOCKSIZE)) { - pr_err("EFS: device does not support %d byte blocks\n", + pr_err("device does not support %d byte blocks\n", EFS_BLOCKSIZE); return -EINVAL; } @@ -276,7 +276,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) bh = sb_bread(s, 0); if (!bh) { - pr_err("EFS: cannot read volume header\n"); + pr_err("cannot read volume header\n"); return -EINVAL; } @@ -294,13 +294,14 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) bh = sb_bread(s, sb->fs_start + EFS_SUPER); if (!bh) { - pr_err("EFS: cannot read superblock\n"); + pr_err("cannot read superblock\n"); return -EINVAL; } if (efs_validate_super(sb, (struct efs_super *) bh->b_data)) { #ifdef DEBUG - pr_warn("EFS: invalid superblock at block %u\n", sb->fs_start + EFS_SUPER); + pr_warn("invalid superblock at block %u\n", + sb->fs_start + EFS_SUPER); #endif brelse(bh); return -EINVAL; @@ -309,7 +310,7 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) if (!(s->s_flags & MS_RDONLY)) { #ifdef DEBUG - pr_info("EFS: forcing read-only mode\n"); + pr_info("forcing read-only mode\n"); #endif s->s_flags |= MS_RDONLY; } @@ -317,13 +318,13 @@ static int efs_fill_super(struct super_block *s, void *d, int silent) s->s_export_op = &efs_export_ops; root = efs_iget(s, EFS_ROOTINODE); if (IS_ERR(root)) { - pr_err("EFS: get root inode failed\n"); + pr_err("get root inode 
failed\n"); return PTR_ERR(root); } s->s_root = d_make_root(root); if (!(s->s_root)) { - pr_err("EFS: get root dentry failed\n"); + pr_err("get root dentry failed\n"); return -ENOMEM; } -- cgit v0.10.2 From d1826f2a3d64f76b18e9ac4a6874c5db5be24750 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:13 -0700 Subject: fs/efs: convert printk(KERN_DEBUG to pr_debug All KERN_DEBUG callsites being under #ifdef DEBUG we can safely convert everything to pr_debug without changing current behaviour. Remove #ifdef DEBUG around pr_debugs only (suggested by Joe Perches) Signed-off-by: Fabian Frederick Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/efs/dir.c b/fs/efs/dir.c index 7f97031..ce63b24 100644 --- a/fs/efs/dir.c +++ b/fs/efs/dir.c @@ -71,10 +71,9 @@ static int efs_readdir(struct file *file, struct dir_context *ctx) inodenum = be32_to_cpu(dirslot->inode); namelen = dirslot->namelen; nameptr = dirslot->name; - -#ifdef DEBUG - printk(KERN_DEBUG "EFS: readdir(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", block, slot, dirblock->slots-1, inodenum, nameptr, namelen); -#endif + pr_debug("%s(): block %d slot %d/%d: inode %u, name \"%s\", namelen %u\n", + __func__, block, slot, dirblock->slots-1, + inodenum, nameptr, namelen); if (!namelen) continue; /* found the next entry */ diff --git a/fs/efs/inode.c b/fs/efs/inode.c index 6c9684a..079d203 100644 --- a/fs/efs/inode.c +++ b/fs/efs/inode.c @@ -138,12 +138,8 @@ struct inode *efs_iget(struct super_block *super, unsigned long ino) } brelse(bh); - -#ifdef DEBUG - printk(KERN_DEBUG "EFS: efs_iget(): inode %lu, extents %d, mode %o\n", - inode->i_ino, in->numextents, inode->i_mode); -#endif - + pr_debug("efs_iget(): inode %lu, extents %d, mode %o\n", + inode->i_ino, in->numextents, inode->i_mode); switch (inode->i_mode & S_IFMT) { case S_IFDIR: inode->i_op = &efs_dir_inode_operations; @@ -239,9 +235,8 @@ efs_block_t efs_map_block(struct inode *inode, 
efs_block_t block) { return 0; } -#ifdef DEBUG - printk(KERN_DEBUG "EFS: map_block(): indirect search for logical block %u\n", block); -#endif + pr_debug("%s(): indirect search for logical block %u\n", + __func__, block); direxts = in->extents[0].cooked.ex_offset; indexts = in->numextents; @@ -285,9 +280,8 @@ efs_block_t efs_map_block(struct inode *inode, efs_block_t block) { __func__, iblock); return 0; } -#ifdef DEBUG - printk(KERN_DEBUG "EFS: map_block(): read indirect extent block %d\n", iblock); -#endif + pr_debug("%s(): read indirect extent block %d\n", + __func__, iblock); first = 0; lastblock = iblock; } diff --git a/fs/efs/super.c b/fs/efs/super.c index cd1399e..7fca462 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -184,7 +184,7 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { } #ifdef DEBUG - printk(KERN_DEBUG "EFS: bf: \"%16s\"\n", vh->vh_bootfile); + pr_debug("bf: \"%16s\"\n", vh->vh_bootfile); for(i = 0; i < NVDIR; i++) { int j; @@ -196,9 +196,8 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { name[j] = (char) 0; if (name[0]) { - printk(KERN_DEBUG "EFS: vh: %8s block: 0x%08x size: 0x%08x\n", - name, - (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), + pr_debug("vh: %8s block: 0x%08x size: 0x%08x\n", + name, (int) be32_to_cpu(vh->vh_vd[i].vd_lbn), (int) be32_to_cpu(vh->vh_vd[i].vd_nbytes)); } } @@ -211,12 +210,11 @@ static efs_block_t efs_validate_vh(struct volume_header *vh) { } #ifdef DEBUG if (be32_to_cpu(vh->vh_pt[i].pt_nblks)) { - printk(KERN_DEBUG "EFS: pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", - i, - (int) be32_to_cpu(vh->vh_pt[i].pt_firstlbn), - (int) be32_to_cpu(vh->vh_pt[i].pt_nblks), - pt_type, - (pt_entry->pt_name) ? pt_entry->pt_name : "unknown"); + pr_debug("pt %2d: start: %08d size: %08d type: 0x%02x (%s)\n", + i, (int)be32_to_cpu(vh->vh_pt[i].pt_firstlbn), + (int)be32_to_cpu(vh->vh_pt[i].pt_nblks), + pt_type, (pt_entry->pt_name) ? 
+ pt_entry->pt_name : "unknown"); } #endif if (IS_EFS(pt_type)) { -- cgit v0.10.2 From b219e25f8dd9793904ca87c55c76ab8c8931abb9 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:14 -0700 Subject: fs/binfmt_elf.c: fix bool assignements Fix coccinelle warnings. Signed-off-by: Fabian Frederick Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index aa3cb62..dabc73a 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1686,7 +1686,7 @@ static size_t get_note_info_size(struct elf_note_info *info) static int write_note_info(struct elf_note_info *info, struct coredump_params *cprm) { - bool first = 1; + bool first = true; struct elf_thread_core_info *t = info->thread; do { @@ -1710,7 +1710,7 @@ static int write_note_info(struct elf_note_info *info, !writenote(&t->notes[i], cprm)) return 0; - first = 0; + first = false; t = t->next; } while (t); -- cgit v0.10.2 From 343034357202be37be335a08ef4a0f0708b6ba3d Mon Sep 17 00:00:00 2001 From: Axel Lin Date: Wed, 4 Jun 2014 16:12:15 -0700 Subject: fs/binfmt_flat.c: make old_reloc() static old_reloc() is only used in this file, make it static. Signed-off-by: Axel Lin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index d50bbe5..f723cd3 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -380,7 +380,7 @@ failed: /****************************************************************************/ -void old_reloc(unsigned long rl) +static void old_reloc(unsigned long rl) { #ifdef DEBUG char *segment[] = { "TEXT", "DATA", "BSS", "*UNKNOWN*" }; -- cgit v0.10.2 From d62cf81524304396276f6aaa5cd7ce62f6f65110 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 16:12:16 -0700 Subject: init/main.c: don't use pr_debug() Partially revert commit ea676e846a81 ("init/main.c: convert to pr_foo()").
Unbeknownst to me, pr_debug() is different from the other pr_foo() levels: pr_debug() is a no-op when DEBUG is not defined. Happily, init/main.c does have a #define DEBUG so we didn't break initcall_debug. But the functioning of initcall_debug should not be dependent upon the presence of that #define DEBUG. Reported-by: Russell King Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/init/main.c b/init/main.c index 04fab8d..9d3a7b8 100644 --- a/init/main.c +++ b/init/main.c @@ -671,13 +671,13 @@ static int __init_or_module do_one_initcall_debug(initcall_t fn) unsigned long long duration; int ret; - pr_debug("calling %pF @ %i\n", fn, task_pid_nr(current)); + printk(KERN_DEBUG "calling %pF @ %i\n", fn, task_pid_nr(current)); calltime = ktime_get(); ret = fn(); rettime = ktime_get(); delta = ktime_sub(rettime, calltime); duration = (unsigned long long) ktime_to_ns(delta) >> 10; - pr_debug("initcall %pF returned %d after %lld usecs\n", + printk(KERN_DEBUG "initcall %pF returned %d after %lld usecs\n", fn, ret, duration); return ret; -- cgit v0.10.2 From 7b0b73d76651e5f88c88b76efa18d719f832bf6f Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Wed, 4 Jun 2014 16:12:17 -0700 Subject: init/main.c: add initcall_blacklist kernel parameter When a module is built into the kernel the module_init() function becomes an initcall. Sometimes debugging through dynamic debug can help, however, debugging built in kernel modules is typically done by changing the .config, recompiling, and booting the new kernel in an effort to determine exactly which module caused a problem. This patchset can be useful stand-alone or combined with initcall_debug. There are cases where some initcalls can hang the machine before the console can be flushed, which can make initcall_debug output inaccurate. Having the ability to skip initcalls can help further debugging of these scenarios. 
Usage: initcall_blacklist=<list of comma separated initcalls> ex) added "initcall_blacklist=sgi_uv_sysfs_init" as a kernel parameter and the log contains: blacklisting initcall sgi_uv_sysfs_init ... ... initcall sgi_uv_sysfs_init blacklisted ex) added "initcall_blacklist=foo_bar,sgi_uv_sysfs_init" as a kernel parameter and the log contains: blacklisting initcall foo_bar blacklisting initcall sgi_uv_sysfs_init ... ... initcall sgi_uv_sysfs_init blacklisted [akpm@linux-foundation.org: tweak printk text] Signed-off-by: Prarit Bhargava Cc: Richard Weinberger Cc: Andi Kleen Cc: Josh Boyer Cc: Rob Landley Cc: Steven Rostedt Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index adea3a2..9973a7e 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1312,6 +1312,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. for working out where the kernel is dying during startup. + initcall_blacklist= [KNL] Do not execute a comma-separated list of + initcall functions. Useful for debugging built-in + modules and initcalls.
+ initrd= [BOOT] Specify the location of the initial ramdisk inport.irq= [HW] Inport (ATI XL and Microsoft) busmouse driver diff --git a/init/main.c b/init/main.c index 9d3a7b8..8ac3833 100644 --- a/init/main.c +++ b/init/main.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include @@ -665,6 +666,70 @@ static void __init do_ctors(void) bool initcall_debug; core_param(initcall_debug, initcall_debug, bool, 0644); +#ifdef CONFIG_KALLSYMS +struct blacklist_entry { + struct list_head next; + char *buf; +}; + +static __initdata_or_module LIST_HEAD(blacklisted_initcalls); + +static int __init initcall_blacklist(char *str) +{ + char *str_entry; + struct blacklist_entry *entry; + + /* str argument is a comma-separated list of functions */ + do { + str_entry = strsep(&str, ","); + if (str_entry) { + pr_debug("blacklisting initcall %s\n", str_entry); + entry = alloc_bootmem(sizeof(*entry)); + entry->buf = alloc_bootmem(strlen(str_entry) + 1); + strcpy(entry->buf, str_entry); + list_add(&entry->next, &blacklisted_initcalls); + } + } while (str_entry); + + return 0; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + struct list_head *tmp; + struct blacklist_entry *entry; + char *fn_name; + + fn_name = kasprintf(GFP_KERNEL, "%pf", fn); + if (!fn_name) + return false; + + list_for_each(tmp, &blacklisted_initcalls) { + entry = list_entry(tmp, struct blacklist_entry, next); + if (!strcmp(fn_name, entry->buf)) { + pr_debug("initcall %s blacklisted\n", fn_name); + kfree(fn_name); + return true; + } + } + + kfree(fn_name); + return false; +} +#else +static int __init initcall_blacklist(char *str) +{ + pr_warn("initcall_blacklist requires CONFIG_KALLSYMS\n"); + return 0; +} + +static bool __init_or_module initcall_blacklisted(initcall_t fn) +{ + return false; +} +#endif +__setup("initcall_blacklist=", initcall_blacklist); + static int __init_or_module do_one_initcall_debug(initcall_t fn) { ktime_t calltime, delta, rettime; @@ -689,6 +754,9 @@ 
int __init_or_module do_one_initcall(initcall_t fn) int ret; char msgbuf[64]; + if (initcall_blacklisted(fn)) + return -EPERM; + if (initcall_debug) ret = do_one_initcall_debug(fn); else -- cgit v0.10.2 From 34a1b7236ad6113883f6c448d1da854cad60265e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 4 Jun 2014 16:12:19 -0700 Subject: kthreads: kill CLONE_KERNEL, change kernel_thread(kernel_init) to avoid CLONE_SIGHAND 1. Remove CLONE_KERNEL, it has no users and it is dangerous. The (old) comment says "List of flags we want to share for kernel threads" but this is not true, we do not want to share ->sighand by default. This flag can only be used if the caller is sure that both parent/child will never play with signals (say, allow_signal/etc). 2. Change rest_init() to clone kernel_init() without CLONE_SIGHAND. In this case CLONE_SIGHAND does not really hurt, and it looks like optimization because copy_sighand() can avoid kmem_cache_alloc(). But in fact this only adds the minor pessimization. kernel_init() is going to exec the init process, and de_thread() will need to unshare ->sighand and do kmem_cache_alloc(sighand_cachep) anyway, but it needs to do more work and take tasklist_lock and siglock. Signed-off-by: Oleg Nesterov Acked-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Ingo Molnar Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/sched.h b/include/linux/sched.h index 611676f..8fcd0e6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,12 +137,6 @@ struct filename; #define VMACACHE_MASK (VMACACHE_SIZE - 1) /* - * List of flags we want to share for kernel threads, - * if only because they are not used by them anyway. - */ -#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND) - -/* * These are the constant used to fake the fixed-point load-average * counting. 
Some notes: * - 11 bit fractions expand to 22 bits by the multiplies: this gives diff --git a/init/main.c b/init/main.c index 8ac3833..4de815c 100644 --- a/init/main.c +++ b/init/main.c @@ -380,7 +380,7 @@ static noinline void __init_refok rest_init(void) * the init task will end up wanting to create kthreads, which, if * we schedule it before we create kthreadd, will OOPS. */ - kernel_thread(kernel_init, NULL, CLONE_FS | CLONE_SIGHAND); + kernel_thread(kernel_init, NULL, CLONE_FS); numa_default_policy(); pid = kernel_thread(kthreadd, NULL, CLONE_FS | CLONE_FILES); rcu_read_lock(); -- cgit v0.10.2 From 647f010bff6795b3e85c2b5a7768c0594a049ab0 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 4 Jun 2014 16:12:20 -0700 Subject: init/main.c: remove an ifdef Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 608e60a..9d117f6 100644 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -44,6 +44,10 @@ extern int remove_proc_subtree(const char *, struct proc_dir_entry *); #else /* CONFIG_PROC_FS */ +static inline void proc_root_init(void) +{ +} + static inline void proc_flush_task(struct task_struct *task) { } diff --git a/init/main.c b/init/main.c index 4de815c..17d47bc 100644 --- a/init/main.c +++ b/init/main.c @@ -629,9 +629,7 @@ asmlinkage __visible void __init start_kernel(void) signals_init(); /* rootfs populating might need page-writeback */ page_writeback_init(); -#ifdef CONFIG_PROC_FS proc_root_init(); -#endif cgroup_init(); cpuset_init(); taskstats_init_early(); -- cgit v0.10.2 From 8091b895b76f690aa2f6689b6ed602f07525a938 Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:21 -0700 Subject: fs/ncpfs/getopt.c: replace simple_strtoul by kstrtoul Remove obsolete simple_strtoul in ncp_getopt Signed-off-by: Fabian Frederick Cc: Petr Vandrovec Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c 
index 03ffde1..344889c 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -53,15 +53,14 @@ int ncp_getopt(const char *caller, char **options, const struct ncp_option *opts return -EINVAL; } if (opts->has_arg & OPT_INT) { - char* v; + int rc = kstrtoul(val, 0, value); - *value = simple_strtoul(val, &v, 0); - if (!*v) { - return opts->val; + if (rc) { + pr_info("%s: invalid numeric value in %s=%s\n", + caller, token, val); + return rc; } - pr_info("%s: invalid numeric value in %s=%s\n", - caller, token, val); - return -EDOM; + return opts->val; } if (opts->has_arg & OPT_STRING) { return opts->val; -- cgit v0.10.2 From 3ff6db3287e8a5e8f5bb9529b8e1259ca6b10def Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 4 Jun 2014 16:12:21 -0700 Subject: fs/autofs4/dev-ioctl.c: add __init to autofs_dev_ioctl_init autofs_dev_ioctl_init is only called by __init init_autofs4_fs Signed-off-by: Fabian Frederick Acked-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 232e03d..5b570b6 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -737,7 +737,7 @@ MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); MODULE_ALIAS("devname:autofs"); /* Register/deregister misc character device */ -int autofs_dev_ioctl_init(void) +int __init autofs_dev_ioctl_init(void) { int r; -- cgit v0.10.2