From d8adde17e5f858427504725218c56aef90e90fc7 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Wed, 11 Jul 2012 14:01:52 -0700 Subject: memory hotplug: fix invalid memory access caused by stale kswapd pointer kswapd_stop() is called to destroy the kswapd work thread when all memory of a NUMA node has been offlined. But kswapd_stop() only terminates the work thread without resetting NODE_DATA(nid)->kswapd to NULL. The stale pointer will prevent kswapd_run() from creating a new work thread when adding memory to the memory-less NUMA node again. Eventually the stale pointer may cause invalid memory access. An example stack dump as below. It's reproduced with 2.6.32, but latest kernel has the same issue. BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] exit_creds+0x12/0x78 PGD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/system/memory/memory391/state CPU 11 Modules linked in: cpufreq_conservative cpufreq_userspace cpufreq_powersave acpi_cpufreq microcode fuse loop dm_mod tpm_tis rtc_cmos i2c_i801 rtc_core tpm serio_raw pcspkr sg tpm_bios igb i2c_core iTCO_wdt rtc_lib mptctl iTCO_vendor_support button dca bnx2 usbhid hid uhci_hcd ehci_hcd usbcore sd_mod crc_t10dif edd ext3 mbcache jbd fan ide_pci_generic ide_core ata_generic ata_piix libata thermal processor thermal_sys hwmon mptsas mptscsih mptbase scsi_transport_sas scsi_mod Pid: 7949, comm: sh Not tainted 2.6.32.12-qiuxishi-5-default #92 Tecal RH2285 RIP: 0010:exit_creds+0x12/0x78 RSP: 0018:ffff8806044f1d78 EFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff880604f22140 RCX: 0000000000019502 RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000 RBP: ffff880604f22150 R08: 0000000000000000 R09: ffffffff81a4dc10 R10: 00000000000032a0 R11: ffff880006202500 R12: 0000000000000000 R13: 0000000000c40000 R14: 0000000000008000 R15: 0000000000000001 FS: 00007fbc03d066f0(0000) GS:ffff8800282e0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000060f029000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process sh (pid: 7949, threadinfo ffff8806044f0000, task ffff880603d7c600) Stack: ffff880604f22140 ffffffff8103aac5 ffff880604f22140 ffffffff8104d21e ffff880006202500 0000000000008000 0000000000c38000 ffffffff810bd5b1 0000000000000000 ffff880603d7c600 00000000ffffdd29 0000000000000003 Call Trace: __put_task_struct+0x5d/0x97 kthread_stop+0x50/0x58 offline_pages+0x324/0x3da memory_block_change_state+0x179/0x1db store_mem_state+0x9e/0xbb sysfs_write_file+0xd0/0x107 vfs_write+0xad/0x169 sys_write+0x45/0x6e system_call_fastpath+0x16/0x1b Code: ff 4d 00 0f 94 c0 84 c0 74 08 48 89 ef e8 1f fd ff ff 5b 5d 31 c0 41 5c c3 53 48 8b 87 20 06 00 00 48 89 fb 48 8b bf 18 06 00 00 <8b> 00 48 c7 83 18 06 00 00 00 00 00 00 f0 ff 0f 0f 94 c0 84 c0 RIP exit_creds+0x12/0x78 RSP CR2: 0000000000000000 [akpm@linux-foundation.org: add pglist_data.kswapd locking comments] Signed-off-by: Xishi Qiu Signed-off-by: Jiang Liu Acked-by: KAMEZAWA Hiroyuki Acked-by: KOSAKI Motohiro Acked-by: Mel Gorman Acked-by: David Rientjes Reviewed-by: Minchan Kim Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 2427706..68c569f 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -694,7 +694,7 @@ typedef struct pglist_data { range, including holes */ int node_id; wait_queue_head_t kswapd_wait; - struct task_struct *kswapd; + struct task_struct *kswapd; /* Protected by lock_memory_hotplug() */ int kswapd_max_order; enum zone_type classzone_idx; } pg_data_t; diff --git a/mm/vmscan.c b/mm/vmscan.c index eeb3bc9..6615763 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2955,14 +2955,17 @@ int kswapd_run(int nid) } /* - * Called by memory hotplug when all memory in a node is offlined. + * Called by memory hotplug when all memory in a node is offlined. Caller must + * hold lock_memory_hotplug(). */ void kswapd_stop(int nid) { struct task_struct *kswapd = NODE_DATA(nid)->kswapd; - if (kswapd) + if (kswapd) { kthread_stop(kswapd); + NODE_DATA(nid)->kswapd = NULL; + } } static int __init kswapd_init(void) -- cgit v0.10.2 From 2a643893e50fde71d7ba84b5592ec61b467b9ab6 Mon Sep 17 00:00:00 2001 From: Devendra Naga Date: Wed, 11 Jul 2012 14:01:53 -0700 Subject: drivers/rtc/rtc-spear.c: fix use-after-free in spear_rtc_remove() `config' is freed and is then used in the rtc_device_unregister() call, causing a kernel panic. Signed-off-by: Devendra Naga Reviewed-by: Viresh Kumar Cc: Alessandro Zummo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-spear.c b/drivers/rtc/rtc-spear.c index 1f76320..e278547 100644 --- a/drivers/rtc/rtc-spear.c +++ b/drivers/rtc/rtc-spear.c @@ -458,12 +458,12 @@ static int __devexit spear_rtc_remove(struct platform_device *pdev) clk_disable(config->clk); clk_put(config->clk); iounmap(config->ioaddr); - kfree(config); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); if (res) release_mem_region(res->start, resource_size(res)); platform_set_drvdata(pdev, NULL); rtc_device_unregister(config->rtc); + kfree(config); return 0; } -- cgit v0.10.2 From 325c117000c1851139981a63d331ea33f2997cd3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:01:56 -0700 Subject: mn10300: move setup_jiffies_interrupt() to cevt-mn10300.c Move the static inline function setup_jiffies_interrupt() from to arch/mn10300/kernel/cevt-mn10300.c, which is its only callsite. This allows to remove the inclusion of and from and , fixing include hell like: include/linux/jiffies.h:260:31: warning: "CLOCK_TICK_RATE" is not defined [-Wundef] include/linux/jiffies.h:260:31: warning: "CLOCK_TICK_RATE" is not defined [-Wundef] include/linux/jiffies.h:46:42: error: division by zero in #if ... make[4]: *** [arch/mn10300/kernel/asm-offsets.s] Error 1 and (after a quick hack for the above by defining CLOCK_TICK_RATE in ): In file included from include/linux/notifier.h:15:0, from include/linux/memory_hotplug.h:6, from include/linux/mmzone.h:718, from include/linux/gfp.h:4, from include/linux/irq.h:20, from arch/mn10300/unit-asb2303/include/unit/timex.h:15, from arch/mn10300/include/asm/timex.h:15, from include/linux/timex.h:174, from include/linux/jiffies.h:8, from include/linux/ktime.h:25, from include/linux/timer.h:5, from include/linux/workqueue.h:8, include/linux/srcu.h:55:22: error: field 'work' has incomplete type As a consequence, we do need a few more inclusions of , namely in arch/mn10300/unit-asb2303/smc91111.c and arch/mn10300/unit-asb2305/unit-init.c. Signed-off-by: Geert Uytterhoeven Cc: David Howells Cc: Koichi Yasutake Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mn10300/include/asm/timex.h b/arch/mn10300/include/asm/timex.h index bd4e90d..f8e6642 100644 --- a/arch/mn10300/include/asm/timex.h +++ b/arch/mn10300/include/asm/timex.h @@ -11,7 +11,6 @@ #ifndef _ASM_TIMEX_H #define _ASM_TIMEX_H -#include #include #define TICK_SIZE (tick_nsec / 1000) @@ -30,16 +29,6 @@ static inline cycles_t get_cycles(void) extern int init_clockevents(void); extern int init_clocksource(void); -static inline void setup_jiffies_interrupt(int irq, - struct irqaction *action) -{ - u16 tmp; - setup_irq(irq, action); - set_intr_level(irq, NUM2GxICR_LEVEL(CONFIG_TIMER_IRQ_LEVEL)); - GxICR(irq) |= GxICR_ENABLE | GxICR_DETECT | GxICR_REQUEST; - tmp = GxICR(irq); -} - #endif /* __KERNEL__ */ #endif /* _ASM_TIMEX_H */ diff --git a/arch/mn10300/kernel/cevt-mn10300.c b/arch/mn10300/kernel/cevt-mn10300.c index 69cae02..ccce35e 100644 --- a/arch/mn10300/kernel/cevt-mn10300.c +++ b/arch/mn10300/kernel/cevt-mn10300.c @@ -70,6 +70,16 @@ static void event_handler(struct clock_event_device *dev) { } +static inline void setup_jiffies_interrupt(int irq, + struct irqaction *action) +{ + u16 tmp; + setup_irq(irq, action); + set_intr_level(irq, NUM2GxICR_LEVEL(CONFIG_TIMER_IRQ_LEVEL)); + GxICR(irq) |= GxICR_ENABLE | GxICR_DETECT | GxICR_REQUEST; + tmp = GxICR(irq); +} + int __init init_clockevents(void) { struct clock_event_device *cd; diff --git a/arch/mn10300/unit-asb2303/include/unit/timex.h b/arch/mn10300/unit-asb2303/include/unit/timex.h index cc18fe7..c37f983 100644 --- a/arch/mn10300/unit-asb2303/include/unit/timex.h +++ b/arch/mn10300/unit-asb2303/include/unit/timex.h @@ -11,10 +11,6 @@ #ifndef _ASM_UNIT_TIMEX_H #define _ASM_UNIT_TIMEX_H -#ifndef __ASSEMBLY__ -#include -#endif /* __ASSEMBLY__ */ - #include #include #include diff --git a/arch/mn10300/unit-asb2303/smc91111.c b/arch/mn10300/unit-asb2303/smc91111.c index 43c2464..5367769 100644 --- a/arch/mn10300/unit-asb2303/smc91111.c +++ b/arch/mn10300/unit-asb2303/smc91111.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/arch/mn10300/unit-asb2305/include/unit/timex.h b/arch/mn10300/unit-asb2305/include/unit/timex.h index 758af30..4cefc22 100644 --- a/arch/mn10300/unit-asb2305/include/unit/timex.h +++ b/arch/mn10300/unit-asb2305/include/unit/timex.h @@ -11,10 +11,6 @@ #ifndef _ASM_UNIT_TIMEX_H #define _ASM_UNIT_TIMEX_H -#ifndef __ASSEMBLY__ -#include -#endif /* __ASSEMBLY__ */ - #include #include #include diff --git a/arch/mn10300/unit-asb2305/unit-init.c b/arch/mn10300/unit-asb2305/unit-init.c index e1becd6..bc4adfa 100644 --- a/arch/mn10300/unit-asb2305/unit-init.c +++ b/arch/mn10300/unit-asb2305/unit-init.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/mn10300/unit-asb2364/include/unit/timex.h b/arch/mn10300/unit-asb2364/include/unit/timex.h index ddb7ed0..42f32db 100644 --- a/arch/mn10300/unit-asb2364/include/unit/timex.h +++ b/arch/mn10300/unit-asb2364/include/unit/timex.h @@ -11,10 +11,6 @@ #ifndef _ASM_UNIT_TIMEX_H #define _ASM_UNIT_TIMEX_H -#ifndef __ASSEMBLY__ -#include -#endif /* __ASSEMBLY__ */ - #include #include #include -- cgit v0.10.2 From 1c20c3de6c35b4ccf12f81b9f98ac9771e7510d3 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:01:59 -0700 Subject: mn10300: remove duplicate definition of PTRACE_O_TRACESYSGOOD Fix the warning: include/linux/ptrace.h:66:0: warning: "PTRACE_O_TRACESYSGOOD" redefined [enabled by default] arch/mn10300/include/asm/ptrace.h:85:0: note: this is the location of the previous definition We already have it in , so remove it from Signed-off-by: Geert Uytterhoeven Cc: David Howells Cc: Koichi Yasutake Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mn10300/include/asm/ptrace.h b/arch/mn10300/include/asm/ptrace.h index 55b79ef..44251b9 100644 --- a/arch/mn10300/include/asm/ptrace.h +++ b/arch/mn10300/include/asm/ptrace.h @@ -81,9 +81,6 @@ struct pt_regs { #define PTRACE_GETFPREGS 14 #define PTRACE_SETFPREGS 15 -/* options set using PTRACE_SETOPTIONS */ -#define PTRACE_O_TRACESYSGOOD 0x00000001 - #ifdef __KERNEL__ #define user_mode(regs) (((regs)->epsw & EPSW_nSL) == EPSW_nSL) -- cgit v0.10.2 From 7a63091288f3a9063b044aaefaad5e75e4ca0f60 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:00 -0700 Subject: mn10300: kernel/internal.h needs Fix the nm10300 build failure: In file included from arch/mn10300/kernel/csrc-mn10300.c:14:0: arch/mn10300/kernel/internal.h:42:1: error: unknown type name 'irqreturn_t' Signed-off-by: Geert Uytterhoeven Cc: David Howells Cc: Koichi Yasutake Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mn10300/kernel/internal.h b/arch/mn10300/kernel/internal.h index a5ac755..2df4401 100644 --- a/arch/mn10300/kernel/internal.h +++ b/arch/mn10300/kernel/internal.h @@ -9,6 +9,8 @@ * 2 of the Licence, or (at your option) any later version. */ +#include + struct clocksource; struct clock_event_device; -- cgit v0.10.2 From cea7c5879fe6a6d98f2d604ef4f175cc25058c96 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:02 -0700 Subject: mn10300: kernel/traps.c needs Fix the warning: arch/mn10300/kernel/traps.c:304:1: warning: data definition has no type or storage class [enabled by default] arch/mn10300/kernel/traps.c:304:1: warning: type defaults to 'int' in declaration of 'EXPORT_SYMBOL' [-Wimplicit-int] arch/mn10300/kernel/traps.c:304:1: warning: parameter names (without types) in function declaration [enabled by default] Signed-off-by: Geert Uytterhoeven Cc: David Howells Cc: Koichi Yasutake Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mn10300/kernel/traps.c b/arch/mn10300/kernel/traps.c index 94a9c6d..b900e5a 100644 --- a/arch/mn10300/kernel/traps.c +++ b/arch/mn10300/kernel/traps.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include -- cgit v0.10.2 From 77cb621c87ee5c1bbf1b0c8f3e6518482d1688a9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:04 -0700 Subject: mn10300: mm/dma-alloc.c needs Fix the warnings: arch/mn10300/mm/dma-alloc.c: At top level: arch/mn10300/mm/dma-alloc.c:63:1: warning: data definition has no type or storage class [enabled by default] arch/mn10300/mm/dma-alloc.c:63:1: warning: type defaults to 'int' in declaration of 'EXPORT_SYMBOL' [-Wimplicit-int] arch/mn10300/mm/dma-alloc.c:63:1: warning: parameter names (without types) in function declaration [enabled by default] arch/mn10300/mm/dma-alloc.c:75:1: warning: data definition has no type or storage class [enabled by default] arch/mn10300/mm/dma-alloc.c:75:1: warning: type defaults to 'int' in declaration of 'EXPORT_SYMBOL' [-Wimplicit-int] arch/mn10300/mm/dma-alloc.c:75:1: warning: parameter names (without types) in function declaration [enabled by default] Signed-off-by: Geert Uytterhoeven Cc: David Howells Cc: Koichi Yasutake Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mn10300/mm/dma-alloc.c b/arch/mn10300/mm/dma-alloc.c index 159acb0..e244ebe 100644 --- a/arch/mn10300/mm/dma-alloc.c +++ b/arch/mn10300/mm/dma-alloc.c @@ -15,6 +15,7 @@ #include #include #include +#include #include static unsigned long pci_sram_allocated = 0xbc000000; -- cgit v0.10.2 From 6b4fa63a9e5696fa43f25143f104ef0bea642222 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:07 -0700 Subject: mn10300: use "#elif defined(CONFIG_*)" instead of "#elif CONFIG_*" Fix the warnings: arch/mn10300/kernel/irq.c:173:7: warning: "CONFIG_MN10300_TTYSM1_TIMER9" is not defined [-Wundef] arch/mn10300/kernel/irq.c:175:7: warning: "CONFIG_MN10300_TTYSM1_TIMER3" is not defined [-Wundef] Signed-off-by: Geert Uytterhoeven Cc: David Howells Cc: Koichi Yasutake Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/mn10300/kernel/irq.c b/arch/mn10300/kernel/irq.c index 2381df8..35932a8 100644 --- a/arch/mn10300/kernel/irq.c +++ b/arch/mn10300/kernel/irq.c @@ -170,9 +170,9 @@ mn10300_cpupic_setaffinity(struct irq_data *d, const struct cpumask *mask, case SC1TXIRQ: #ifdef CONFIG_MN10300_TTYSM1_TIMER12 case TM12IRQ: -#elif CONFIG_MN10300_TTYSM1_TIMER9 +#elif defined(CONFIG_MN10300_TTYSM1_TIMER9) case TM9IRQ: -#elif CONFIG_MN10300_TTYSM1_TIMER3 +#elif defined(CONFIG_MN10300_TTYSM1_TIMER3) case TM3IRQ: #endif /* CONFIG_MN10300_TTYSM1_TIMER12 */ #endif /* CONFIG_MN10300_TTYSM1 */ -- cgit v0.10.2 From a4e08d001f2e50bb8b3c4eebadcf08e5535f02ee Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 11 Jul 2012 14:02:10 -0700 Subject: ocfs2: fix NULL pointer dereference in __ocfs2_change_file_space() As ocfs2_fallocate() will invoke __ocfs2_change_file_space() with a NULL as the first parameter (file), it may trigger a NULL pointer dereferrence due to a missing check. Addresses http://bugs.launchpad.net/bugs/1006012 Signed-off-by: Luis Henriques Reported-by: Bret Towe Tested-by: Bret Towe Cc: Sunil Mushran Acked-by: Joel Becker Acked-by: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 98513c8..7602783 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1950,7 +1950,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ret < 0) mlog_errno(ret); - if (file->f_flags & O_SYNC) + if (file && (file->f_flags & O_SYNC)) handle->h_sync = 1; ocfs2_commit_trans(osb, handle); -- cgit v0.10.2 From 4229fb1dc6843c49a14bb098719f8a696cdc44f8 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 11 Jul 2012 14:02:11 -0700 Subject: c/r: prctl: less paranoid prctl_set_mm_exe_file() "no other files mapped" requirement from my previous patch (c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal) is too paranoid, it forbids operation even if there mapped one shared-anon vma. Let's check that current mm->exe_file already unmapped, in this case exe_file symlink already outdated and its changing is reasonable. Plus, this patch fixes exit code in case operation success. Signed-off-by: Konstantin Khlebnikov Reported-by: Cyrill Gorcunov Tested-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/kernel/sys.c b/kernel/sys.c index e0c8ffc..2d39a84 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1788,7 +1788,6 @@ SYSCALL_DEFINE1(umask, int, mask) #ifdef CONFIG_CHECKPOINT_RESTORE static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { - struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; @@ -1816,13 +1815,17 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) down_write(&mm->mmap_sem); /* - * Forbid mm->exe_file change if there are mapped other files. + * Forbid mm->exe_file change if old file still mapped. */ err = -EBUSY; - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_file && !path_equal(&vma->vm_file->f_path, - &exe_file->f_path)) - goto exit_unlock; + if (mm->exe_file) { + struct vm_area_struct *vma; + + for (vma = mm->mmap; vma; vma = vma->vm_next) + if (vma->vm_file && + path_equal(&vma->vm_file->f_path, + &mm->exe_file->f_path)) + goto exit_unlock; } /* @@ -1835,6 +1838,7 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) goto exit_unlock; + err = 0; set_mm_exe_file(mm, exe_file); exit_unlock: up_write(&mm->mmap_sem); -- cgit v0.10.2 From 4bf2bba3750f10aa9e62e6949bc7e8329990f01b Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 11 Jul 2012 14:02:13 -0700 Subject: mm, thp: abort compaction if migration page cannot be charged to memcg If page migration cannot charge the temporary page to the memcg, migrate_pages() will return -ENOMEM. This isn't considered in memory compaction however, and the loop continues to iterate over all pageblocks trying to isolate and migrate pages. If a small number of very large memcgs happen to be oom, however, these attempts will mostly be futile leading to an enormous amout of cpu consumption due to the page migration failures. This patch will short circuit and fail memory compaction if migrate_pages() returns -ENOMEM. COMPACT_PARTIAL is returned in case some migrations were successful so that the page allocator will retry. Signed-off-by: David Rientjes Acked-by: Mel Gorman Cc: Minchan Kim Cc: Kamezawa Hiroyuki Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/compaction.c b/mm/compaction.c index 7ea259d..2f42d95 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -701,8 +701,11 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) if (err) { putback_lru_pages(&cc->migratepages); cc->nr_migratepages = 0; + if (err == -ENOMEM) { + ret = COMPACT_PARTIAL; + goto out; + } } - } out: -- cgit v0.10.2 From 3cfd16a551dc0c188160e1765168a04baf2d3198 Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 11 Jul 2012 14:02:16 -0700 Subject: drivers/rtc/rtc-ab8500.c: use IRQF_ONESHOT when requesting a threaded IRQ This driver's IRQ registration is failing because the kernel now forces IRQs to be ONESHOT if no IRQ handler is passed. Signed-off-by: Lee Jones Cc: Alessandro Zummo Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index 4bcf9ca..b11a2ec 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -422,7 +422,7 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) } err = request_threaded_irq(irq, NULL, rtc_alarm_handler, - IRQF_NO_SUSPEND, "ab8500-rtc", rtc); + IRQF_NO_SUSPEND | IRQF_ONESHOT, "ab8500-rtc", rtc); if (err < 0) { rtc_device_unregister(rtc); return err; -- cgit v0.10.2 From ad49fcbe9083f42321adfdd217ed2e0037fd739f Mon Sep 17 00:00:00 2001 From: Lee Jones Date: Wed, 11 Jul 2012 14:02:17 -0700 Subject: drivers/rtc/rtc-ab8500.c: ensure correct probing of the AB8500 RTC when Device Tree is enabled Without this patch, if Device Tree is enabled the AB8500 RTC wouldn't get probed at all, as there is no reference to it from platform code. This patch ensures the driver is probed during normal DT start-up. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Lee Jones Cc: Alessandro Zummo Acked-by: Linus Walleij Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-ab8500.c b/drivers/rtc/rtc-ab8500.c index b11a2ec..370889d 100644 --- a/drivers/rtc/rtc-ab8500.c +++ b/drivers/rtc/rtc-ab8500.c @@ -17,6 +17,7 @@ #include #include #include +#include #define AB8500_RTC_SOFF_STAT_REG 0x00 #define AB8500_RTC_CC_CONF_REG 0x01 @@ -430,7 +431,6 @@ static int __devinit ab8500_rtc_probe(struct platform_device *pdev) platform_set_drvdata(pdev, rtc); - err = ab8500_sysfs_rtc_register(&pdev->dev); if (err) { dev_err(&pdev->dev, "sysfs RTC failed to register\n"); @@ -454,10 +454,16 @@ static int __devexit ab8500_rtc_remove(struct platform_device *pdev) return 0; } +static const struct of_device_id ab8500_rtc_match[] = { + { .compatible = "stericsson,ab8500-rtc", }, + {} +}; + static struct platform_driver ab8500_rtc_driver = { .driver = { .name = "ab8500-rtc", .owner = THIS_MODULE, + .of_match_table = ab8500_rtc_match, }, .probe = ab8500_rtc_probe, .remove = __devexit_p(ab8500_rtc_remove), -- cgit v0.10.2 From 9adec610b472aba9137b934954e5d5a8550452c5 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:19 -0700 Subject: h8300/pgtable: add missing #include Fix the h8300 build error: kernel/sched/core.c: In function 'context_switch': kernel/sched/core.c:2061:2: error: implicit declaration of function 'arch_start_context_switch' [-Werror=implicit-function-declaration] Signed-off-by: Geert Uytterhoeven Cc: Yoshinori Sato Cc: Tony Breeds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/h8300/include/asm/pgtable.h b/arch/h8300/include/asm/pgtable.h index a09230a..62ef176 100644 --- a/arch/h8300/include/asm/pgtable.h +++ b/arch/h8300/include/asm/pgtable.h @@ -70,4 +70,7 @@ extern int is_in_rom(unsigned long); #define VMALLOC_END 0xffffffff #define arch_enter_lazy_cpu_mode() do {} while (0) + +#include + #endif /* _H8300_PGTABLE_H */ -- cgit v0.10.2 From 8782171e5838480445ef5b3fcea1358e599fe4a2 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:22 -0700 Subject: h8300/signal: fix typo "statis" The keyword is "static", not "statis": arch/h8300/kernel/signal.c:455:8: error: expected '=', ',', ';', 'asm' or '__attribute__' before 'void' arch/h8300/kernel/signal.c: In function 'do_notify_resume': arch/h8300/kernel/signal.c:511:3: error: implicit declaration of function 'do_signal' [-Werror=implicit-function-declaration] arch/h8300/kernel/signal.c: At top level: arch/h8300/kernel/signal.c:414:1: warning: 'handle_signal' defined but not used [-Wunused-function] Introduced in commit 7ae4e32a6514 ("h8300: switch to saved_sigmask-based sigsuspend/rt_sigsuspend") Signed-off-by: Geert Uytterhoeven Cc: Al Viro Cc: Yoshinori Sato Cc: Tony Breeds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/h8300/kernel/signal.c b/arch/h8300/kernel/signal.c index fca1037..5adaada 100644 --- a/arch/h8300/kernel/signal.c +++ b/arch/h8300/kernel/signal.c @@ -447,7 +447,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -statis void do_signal(struct pt_regs *regs) +static void do_signal(struct pt_regs *regs) { siginfo_t info; int signr; -- cgit v0.10.2 From 487c719c00b6a4bdb24a77337897d46dfe5131f6 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:23 -0700 Subject: h8300/time: add missing #include Fix the build error: arch/h8300/kernel/time.c: In function 'h8300_timer_tick': arch/h8300/kernel/time.c:39:2: error: implicit declaration of function 'get_irq_regs' [-Werror=implicit-function-declaration] arch/h8300/kernel/time.c:39:42: error: invalid type argument of '->' (have 'int') Signed-off-by: Geert Uytterhoeven Cc: Yoshinori Sato Cc: Tony Breeds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/h8300/kernel/time.c b/arch/h8300/kernel/time.c index 32263a1..e0f7419 100644 --- a/arch/h8300/kernel/time.c +++ b/arch/h8300/kernel/time.c @@ -27,6 +27,7 @@ #include #include +#include #include #define TICK_SIZE (tick_nsec / 1000) -- cgit v0.10.2 From e048acebc40cf8292ed71a1012fd53068f22924b Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:26 -0700 Subject: h8300/uaccess: remove assignment to __gu_val in unhandled case of get_user() __gu_val is const if the passed ptr is const, giving: include/linux/pagemap.h: In function 'fault_in_pages_readable': include/linux/pagemap.h:442:2: error: assignment of read-only variable '__gu_val' include/linux/pagemap.h:448:4: error: assignment of read-only variable '__gu_val' include/linux/pagemap.h: In function 'fault_in_multipages_readable': include/linux/pagemap.h:499:3: error: assignment of read-only variable '__gu_val' include/linux/pagemap.h:508:3: error: assignment of read-only variable '__gu_val' make[4]: *** [init/main.o] Error 1 As we don't care about the actual value of __gu_val in the unhandled case (it will cause a link error anyway), just remove the assignment. Signed-off-by: Geert Uytterhoeven Cc: Yoshinori Sato Cc: Tony Breeds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/h8300/include/asm/uaccess.h b/arch/h8300/include/asm/uaccess.h index 356068c..534394f 100644 --- a/arch/h8300/include/asm/uaccess.h +++ b/arch/h8300/include/asm/uaccess.h @@ -100,7 +100,6 @@ extern int __put_user_bad(void); break; \ default: \ __gu_err = __get_user_bad(); \ - __gu_val = 0; \ break; \ } \ (x) = __gu_val; \ -- cgit v0.10.2 From 213ab3f9fcfc1ba471906a4f2c24d7b21370c859 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 11 Jul 2012 14:02:28 -0700 Subject: h8300/uaccess: add mising __clear_user() Fix the build error: include/linux/regset.h: In function 'user_regset_copyout_zero': include/linux/regset.h:289:3: error: implicit declaration of function '__clear_user' [-Werror=implicit-function-declaration] Signed-off-by: Geert Uytterhoeven Cc: Yoshinori Sato Cc: Tony Breeds Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/h8300/include/asm/uaccess.h b/arch/h8300/include/asm/uaccess.h index 534394f..8725d1a 100644 --- a/arch/h8300/include/asm/uaccess.h +++ b/arch/h8300/include/asm/uaccess.h @@ -158,4 +158,6 @@ clear_user(void *to, unsigned long n) return 0; } +#define __clear_user clear_user + #endif /* _H8300_UACCESS_H */ -- cgit v0.10.2 From 41b9e2d7ec3f618fd076cb3466edd0a8ebabae5a Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Wed, 11 Jul 2012 14:02:31 -0700 Subject: mm/memory_hotplug.c: release memory resources if hotadd_new_pgdat() fails We should goto error to release memory resource if hotadd_new_pgdat() failed. Signed-off-by: Wen Congyang Cc: Yasuaki ISIMATU Acked-by: David Rientjes Cc: Len Brown Cc: "Brown, Len" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0d7e3ec..427bb29 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -618,7 +618,7 @@ int __ref add_memory(int nid, u64 start, u64 size) pgdat = hotadd_new_pgdat(nid, start); ret = -ENOMEM; if (!pgdat) - goto out; + goto error; new_pgdat = 1; } -- cgit v0.10.2 From b59f6d1febd6cbe9fae4589bf72da0ed32bc69e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Beno=C3=AEt=20Th=C3=A9baudeau?= Date: Wed, 11 Jul 2012 14:02:32 -0700 Subject: drivers/rtc/rtc-mxc.c: fix irq enabled interrupts warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes WARNING: at irq/handle.c:146 handle_irq_event_percpu+0x19c/0x1b8() irq 25 handler mxc_rtc_interrupt+0x0/0xac enabled interrupts Modules linked in: (unwind_backtrace+0x0/0xf0) from (warn_slowpath_common+0x4c/0x64) (warn_slowpath_common+0x4c/0x64) from (warn_slowpath_fmt+0x30/0x40) (warn_slowpath_fmt+0x30/0x40) from (handle_irq_event_percpu+0x19c/0x1b8) (handle_irq_event_percpu+0x19c/0x1b8) from (handle_irq_event+0x28/0x38) (handle_irq_event+0x28/0x38) from (handle_level_irq+0x80/0xc4) (handle_level_irq+0x80/0xc4) from (generic_handle_irq+0x24/0x38) (generic_handle_irq+0x24/0x38) from (handle_IRQ+0x30/0x84) (handle_IRQ+0x30/0x84) from (avic_handle_irq+0x2c/0x4c) (avic_handle_irq+0x2c/0x4c) from (__irq_svc+0x40/0x60) Exception stack(0xc050bf60 to 0xc050bfa8) bf60: 00000001 00000000 003c4208 c0018e20 c050a000 c050a000 c054a4c8 c050a000 bf80: c05157a8 4117b363 80503bb4 00000000 01000000 c050bfa8 c0018e2c c000e808 bfa0: 60000013 ffffffff (__irq_svc+0x40/0x60) from (default_idle+0x1c/0x30) (default_idle+0x1c/0x30) from (cpu_idle+0x68/0xa8) (cpu_idle+0x68/0xa8) from (start_kernel+0x22c/0x26c) Signed-off-by: Benoît Thébaudeau Cc: Alessandro Zummo Cc: Sascha Hauer Acked-by: Uwe Kleine-König Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-mxc.c b/drivers/rtc/rtc-mxc.c index 5e1d64e..e3e50d6 100644 --- a/drivers/rtc/rtc-mxc.c +++ b/drivers/rtc/rtc-mxc.c @@ -202,10 +202,11 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id) struct platform_device *pdev = dev_id; struct rtc_plat_data *pdata = platform_get_drvdata(pdev); void __iomem *ioaddr = pdata->ioaddr; + unsigned long flags; u32 status; u32 events = 0; - spin_lock_irq(&pdata->rtc->irq_lock); + spin_lock_irqsave(&pdata->rtc->irq_lock, flags); status = readw(ioaddr + RTC_RTCISR) & readw(ioaddr + RTC_RTCIENR); /* clear interrupt sources */ writew(status, ioaddr + RTC_RTCISR); @@ -224,7 +225,7 @@ static irqreturn_t mxc_rtc_interrupt(int irq, void *dev_id) events |= (RTC_PF | RTC_IRQF); rtc_update_irq(pdata->rtc, 1, events); - spin_unlock_irq(&pdata->rtc->irq_lock); + spin_unlock_irqrestore(&pdata->rtc->irq_lock, flags); return IRQ_HANDLED; } -- cgit v0.10.2 From fea9f718b3d68147f162ed2d870183ce5e0ad8d8 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Wed, 11 Jul 2012 14:02:35 -0700 Subject: fs: ramfs: file-nommu: add SetPageUptodate() There is a bug in the below scenario for !CONFIG_MMU: 1. create a new file 2. mmap the file and write to it 3. read the file can't get the correct value Because sys_read() -> generic_file_aio_read() -> simple_readpage() -> clear_page() which causes the page to be zeroed. Add SetPageUptodate() to ramfs_nommu_expand_for_mapping() so that generic_file_aio_read() do not call simple_readpage(). Signed-off-by: Bob Liu Cc: Hugh Dickins Cc: David Howells Cc: Geert Uytterhoeven Cc: Greg Ungerer Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index fbb0b47..d5378d0 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -110,6 +110,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) /* prevent the page from being discarded on memory pressure */ SetPageDirty(page); + SetPageUptodate(page); unlock_page(page); put_page(page); -- cgit v0.10.2 From 8875408abd935a77b6e1cb11c21c438aa2e7ec75 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 11 Jul 2012 14:02:38 -0700 Subject: sgi-xp: nested calls to spin_lock_irqsave() The code here has a nested spin_lock_irqsave(). It's not needed since IRQs are already disabled and it causes a problem because it means that IRQs won't be enabled again at the end. The second call to spin_lock_irqsave() will overwrite the value of irq_flags and we can't restore the proper settings. Signed-off-by: Dan Carpenter Signed-off-by: Robin Holt Cc: Jack Steiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/misc/sgi-xp/xpc_uv.c b/drivers/misc/sgi-xp/xpc_uv.c index 17bbacb..87b251a 100644 --- a/drivers/misc/sgi-xp/xpc_uv.c +++ b/drivers/misc/sgi-xp/xpc_uv.c @@ -452,9 +452,9 @@ xpc_handle_activate_mq_msg_uv(struct xpc_partition *part, if (msg->activate_gru_mq_desc_gpa != part_uv->activate_gru_mq_desc_gpa) { - spin_lock_irqsave(&part_uv->flags_lock, irq_flags); + spin_lock(&part_uv->flags_lock); part_uv->flags &= ~XPC_P_CACHED_ACTIVATE_GRU_MQ_DESC_UV; - spin_unlock_irqrestore(&part_uv->flags_lock, irq_flags); + spin_unlock(&part_uv->flags_lock); part_uv->activate_gru_mq_desc_gpa = msg->activate_gru_mq_desc_gpa; } -- cgit v0.10.2 From c46938d4f3ecadd609a06dae0d5b26a30274b338 Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Wed, 11 Jul 2012 14:02:40 -0700 Subject: MAINTAINERS: add OMAP CPUfreq driver to OMAP Power Management section Add the OMAP CPUFreq driver to the list of files in the OMAP Power Management section. I've already been maintaining this driver, this just makes it official. Signed-off-by: Kevin Hilman Cc: Arnd Bergmann Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/MAINTAINERS b/MAINTAINERS index 03df1d1..d1d9ae6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4857,6 +4857,7 @@ M: Kevin Hilman L: linux-omap@vger.kernel.org S: Maintained F: arch/arm/*omap*/*pm* +F: drivers/cpufreq/omap-cpufreq.c OMAP POWERDOMAIN/CLOCKDOMAIN SOC ADAPTATION LAYER SUPPORT M: Rajendra Nayak -- cgit v0.10.2 From 5d8ecbbc284f7e7568969574a6601b05f1ed1d90 Mon Sep 17 00:00:00 2001 From: "Steven J. Magnani" Date: Wed, 11 Jul 2012 14:02:42 -0700 Subject: fat: fix non-atomic NFS i_pos read fat_encode_fh() can fetch an invalid i_pos value on systems where 64-bit accesses are not atomic. Make it use the same accessor as the rest of the FAT code. Signed-off-by: Steven J. Magnani Acked-by: OGAWA Hirofumi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/fs/fat/inode.c b/fs/fat/inode.c index a3d81eb..0038b32 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -738,22 +738,21 @@ static int fat_encode_fh(struct inode *inode, __u32 *fh, int *lenp, struct inode *parent) { int len = *lenp; - u32 ipos_h, ipos_m, ipos_l; + struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); + loff_t i_pos; if (len < 5) { *lenp = 5; return 255; /* no room */ } - ipos_h = MSDOS_I(inode)->i_pos >> 8; - ipos_m = (MSDOS_I(inode)->i_pos & 0xf0) << 24; - ipos_l = (MSDOS_I(inode)->i_pos & 0x0f) << 28; + i_pos = fat_i_pos_read(sbi, inode); *lenp = 5; fh[0] = inode->i_ino; fh[1] = inode->i_generation; - fh[2] = ipos_h; - fh[3] = ipos_m | MSDOS_I(inode)->i_logstart; - fh[4] = ipos_l; + fh[2] = i_pos >> 8; + fh[3] = ((i_pos & 0xf0) << 24) | MSDOS_I(inode)->i_logstart; + fh[4] = (i_pos & 0x0f) << 28; if (parent) fh[4] |= MSDOS_I(parent)->i_logstart; return 3; -- cgit v0.10.2 From 6b91bf1a3f52f5fdf40f5aaeb09a06b4d49556cc Mon Sep 17 00:00:00 2001 From: Kevin Hilman Date: Wed, 11 Jul 2012 14:02:44 -0700 Subject: drivers/rtc/rtc-twl.c: fix threaded IRQ to use IRQF_ONESHOT Requesting a threaded interrupt without a primary handler and without IRQF_ONESHOT is dangerous, and after commit 1c6c6952 ("genirq: Reject bogus threaded irq requests"), these requests are rejected. This causes ->probe() to fail, and the RTC driver not to be availble. To fix, add IRQF_ONESHOT to the IRQ flags. Tested on OMAP3730/OveroSTORM and OMAP4430/Panda board using rtcwake to wake from system suspend multiple times. Signed-off-by: Kevin Hilman Cc: Alessandro Zummo Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/drivers/rtc/rtc-twl.c b/drivers/rtc/rtc-twl.c index 258abea..c5d06fe 100644 --- a/drivers/rtc/rtc-twl.c +++ b/drivers/rtc/rtc-twl.c @@ -510,7 +510,7 @@ static int __devinit twl_rtc_probe(struct platform_device *pdev) } ret = request_threaded_irq(irq, NULL, twl_rtc_interrupt, - IRQF_TRIGGER_RISING, + IRQF_TRIGGER_RISING | IRQF_ONESHOT, dev_name(&rtc->dev), rtc); if (ret < 0) { dev_err(&pdev->dev, "IRQ is not free.\n"); -- cgit v0.10.2 From f21f8062201fc6361f65de92e758a76375ba8c59 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 Jul 2012 14:02:45 -0700 Subject: tmpfs: revert SEEK_DATA and SEEK_HOLE Revert 4fb5ef089b28 ("tmpfs: support SEEK_DATA and SEEK_HOLE"). I believe it's correct, and it's been nice to have from rc1 to rc6; but as the original commit said: I don't know who actually uses SEEK_DATA or SEEK_HOLE, and whether it would be of any use to them on tmpfs. This code adds 92 lines and 752 bytes on x86_64 - is that bloat or worthwhile? Nobody asked for it, so I conclude that it's bloat: let's revert tmpfs to the dumb generic support for v3.5. We can always reinstate it later if useful, and anyone needing it in a hurry can just get it out of git. Signed-off-by: Hugh Dickins Cc: Christoph Hellwig Cc: Josef Bacik Cc: Andi Kleen Cc: Andreas Dilger Cc: Dave Chinner Cc: Marco Stornelli Cc: Jeff liu Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index 4ce02e0..3f696f7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1692,98 +1692,6 @@ static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, return error; } -/* - * llseek SEEK_DATA or SEEK_HOLE through the radix_tree. - */ -static pgoff_t shmem_seek_hole_data(struct address_space *mapping, - pgoff_t index, pgoff_t end, int origin) -{ - struct page *page; - struct pagevec pvec; - pgoff_t indices[PAGEVEC_SIZE]; - bool done = false; - int i; - - pagevec_init(&pvec, 0); - pvec.nr = 1; /* start small: we may be there already */ - while (!done) { - pvec.nr = shmem_find_get_pages_and_swap(mapping, index, - pvec.nr, pvec.pages, indices); - if (!pvec.nr) { - if (origin == SEEK_DATA) - index = end; - break; - } - for (i = 0; i < pvec.nr; i++, index++) { - if (index < indices[i]) { - if (origin == SEEK_HOLE) { - done = true; - break; - } - index = indices[i]; - } - page = pvec.pages[i]; - if (page && !radix_tree_exceptional_entry(page)) { - if (!PageUptodate(page)) - page = NULL; - } - if (index >= end || - (page && origin == SEEK_DATA) || - (!page && origin == SEEK_HOLE)) { - done = true; - break; - } - } - shmem_deswap_pagevec(&pvec); - pagevec_release(&pvec); - pvec.nr = PAGEVEC_SIZE; - cond_resched(); - } - return index; -} - -static loff_t shmem_file_llseek(struct file *file, loff_t offset, int origin) -{ - struct address_space *mapping; - struct inode *inode; - pgoff_t start, end; - loff_t new_offset; - - if (origin != SEEK_DATA && origin != SEEK_HOLE) - return generic_file_llseek_size(file, offset, origin, - MAX_LFS_FILESIZE); - mapping = file->f_mapping; - inode = mapping->host; - mutex_lock(&inode->i_mutex); - /* We're holding i_mutex so we can access i_size directly */ - - if (offset < 0) - offset = -EINVAL; - else if (offset >= inode->i_size) - offset = -ENXIO; - else { - start = offset >> PAGE_CACHE_SHIFT; - end = (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; - new_offset = shmem_seek_hole_data(mapping, start, end, origin); - new_offset <<= PAGE_CACHE_SHIFT; - if (new_offset > offset) { - if (new_offset < inode->i_size) - offset = new_offset; - else if (origin == SEEK_DATA) - offset = -ENXIO; - else - offset = inode->i_size; - } - } - - if (offset >= 0 && offset != file->f_pos) { - file->f_pos = offset; - file->f_version = 0; - } - mutex_unlock(&inode->i_mutex); - return offset; -} - static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -2787,7 +2695,7 @@ static const struct address_space_operations shmem_aops = { static const struct file_operations shmem_file_operations = { .mmap = shmem_mmap, #ifdef CONFIG_TMPFS - .llseek = shmem_file_llseek, + .llseek = generic_file_llseek, .read = do_sync_read, .write = do_sync_write, .aio_read = shmem_file_aio_read, -- cgit v0.10.2 From d189922862e03ce6c7adc1e99d3b94e632dc8e89 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 Jul 2012 14:02:47 -0700 Subject: shmem: fix negative rss in memcg memory.stat When adding the page_private checks before calling shmem_replace_page(), I did realize that there is a further race, but thought it too unlikely to need a hurried fix. But independently I've been chasing why a mem cgroup's memory.stat sometimes shows negative rss after all tasks have gone: I expected it to be a stats gathering bug, but actually it's shmem swapping's fault. It's an old surprise, that when you lock_page(lookup_swap_cache(swap)), the page may have been removed from swapcache before getting the lock; or it may have been freed and reused and be back in swapcache; and it can even be using the same swap location as before (page_private same). The swapoff case is already secure against this (swap cannot be reused until the whole area has been swapped off, and a new swapped on); and shmem_getpage_gfp() is protected by shmem_add_to_page_cache()'s check for the expected radix_tree entry - but a little too late. By that time, we might have already decided to shmem_replace_page(): I don't know of a problem from that, but I'd feel more at ease not to do so spuriously. And we have already done mem_cgroup_cache_charge(), on perhaps the wrong mem cgroup: and this charge is not then undone on the error path, because PageSwapCache ends up preventing that. It's this last case which causes the occasional negative rss in memory.stat: the page is charged here as cache, but (sometimes) found to be anon when eventually it's uncharged - and in between, it's an undeserved charge on the wrong memcg. Fix this by adding an earlier check on the radix_tree entry: it's inelegant to descend the tree twice, but swapping is not the fast path, and a better solution would need a pair (try+commit) of memcg calls, and a rework of shmem_replace_page() to keep out of the swapcache. We can use the added shmem_confirm_swap() function to replace the find_get_page+page_cache_release we were already doing on the error path. And add a comment on that -EEXIST: it seems a peculiar errno to be using, but originates from its use in radix_tree_insert(). [It can be surprising to see positive rss left in a memcg's memory.stat after all tasks have gone, since it is supposed to count anonymous but not shmem. Aside from sharing anon pages via fork with a task in some other memcg, it often happens after swapping: because a swap page can't be freed while under writeback, nor while locked. So it's not an error, and these residual pages are easily freed once pressure demands.] Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index 3f696f7..294364a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -264,6 +264,24 @@ static int shmem_radix_tree_replace(struct address_space *mapping, } /* + * Sometimes, before we decide whether to proceed or to fail, we must check + * that an entry was not already brought back from swap by a racing thread. + * + * Checking page is not enough: by the time a SwapCache page is locked, it + * might be reused, and again be SwapCache, using the same swap as before. + */ +static bool shmem_confirm_swap(struct address_space *mapping, + pgoff_t index, swp_entry_t swap) +{ + void *item; + + rcu_read_lock(); + item = radix_tree_lookup(&mapping->page_tree, index); + rcu_read_unlock(); + return item == swp_to_radix_entry(swap); +} + +/* * Like add_to_page_cache_locked, but error if expected item has gone. */ static int shmem_add_to_page_cache(struct page *page, @@ -1124,9 +1142,9 @@ repeat: /* We have to do this with page locked to prevent races */ lock_page(page); if (!PageSwapCache(page) || page_private(page) != swap.val || - page->mapping) { + !shmem_confirm_swap(mapping, index, swap)) { error = -EEXIST; /* try again */ - goto failed; + goto unlock; } if (!PageUptodate(page)) { error = -EIO; @@ -1142,9 +1160,12 @@ repeat: error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) + if (!error) { error = shmem_add_to_page_cache(page, mapping, index, gfp, swp_to_radix_entry(swap)); + /* We already confirmed swap, and make no allocation */ + VM_BUG_ON(error); + } if (error) goto failed; @@ -1245,14 +1266,10 @@ decused: unacct: shmem_unacct_blocks(info->flags, 1); failed: - if (swap.val && error != -EINVAL) { - struct page *test = find_get_page(mapping, index); - if (test && !radix_tree_exceptional_entry(test)) - page_cache_release(test); - /* Have another try if the entry has changed */ - if (test != swp_to_radix_entry(swap)) - error = -EEXIST; - } + if (swap.val && error != -EINVAL && + !shmem_confirm_swap(mapping, index, swap)) + error = -EEXIST; +unlock: if (page) { unlock_page(page); page_cache_release(page); @@ -1264,7 +1281,7 @@ failed: spin_unlock(&info->lock); goto repeat; } - if (error == -EEXIST) + if (error == -EEXIST) /* from above or from radix_tree_insert */ goto repeat; return error; } -- cgit v0.10.2 From b065b4321fa78e83bf8f5b0d79d0b5424b57998b Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 11 Jul 2012 14:02:48 -0700 Subject: shmem: cleanup shmem_add_to_page_cache shmem_add_to_page_cache() has three callsites, but only one of them wants the radix_tree_preload() (an exceptional entry guarantees that the radix tree node is present in the other cases), and only that site can achieve mem_cgroup_uncharge_cache_page() (PageSwapCache makes it a no-op in the other cases). We did it this way originally to reflect add_to_page_cache_locked(); but it's confusing now, so move the radix_tree preloading and mem_cgroup uncharging to that one caller. Signed-off-by: Hugh Dickins Acked-by: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/shmem.c b/mm/shmem.c index 294364a..bd10636 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -288,40 +288,31 @@ static int shmem_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp, void *expected) { - int error = 0; + int error; VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(!PageSwapBacked(page)); + page_cache_get(page); + page->mapping = mapping; + page->index = index; + + spin_lock_irq(&mapping->tree_lock); if (!expected) - error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + error = radix_tree_insert(&mapping->page_tree, index, page); + else + error = shmem_radix_tree_replace(mapping, index, expected, + page); if (!error) { - page_cache_get(page); - page->mapping = mapping; - page->index = index; - - spin_lock_irq(&mapping->tree_lock); - if (!expected) - error = radix_tree_insert(&mapping->page_tree, - index, page); - else - error = shmem_radix_tree_replace(mapping, index, - expected, page); - if (!error) { - mapping->nrpages++; - __inc_zone_page_state(page, NR_FILE_PAGES); - __inc_zone_page_state(page, NR_SHMEM); - spin_unlock_irq(&mapping->tree_lock); - } else { - page->mapping = NULL; - spin_unlock_irq(&mapping->tree_lock); - page_cache_release(page); - } - if (!expected) - radix_tree_preload_end(); + mapping->nrpages++; + __inc_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(page, NR_SHMEM); + spin_unlock_irq(&mapping->tree_lock); + } else { + page->mapping = NULL; + spin_unlock_irq(&mapping->tree_lock); + page_cache_release(page); } - if (error) - mem_cgroup_uncharge_cache_page(page); return error; } @@ -1202,11 +1193,18 @@ repeat: __set_page_locked(page); error = mem_cgroup_cache_charge(page, current->mm, gfp & GFP_RECLAIM_MASK); - if (!error) - error = shmem_add_to_page_cache(page, mapping, index, - gfp, NULL); if (error) goto decused; + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (!error) { + error = shmem_add_to_page_cache(page, mapping, index, + gfp, NULL); + radix_tree_preload_end(); + } + if (error) { + mem_cgroup_uncharge_cache_page(page); + goto decused; + } lru_cache_add_anon(page); spin_lock(&info->lock); -- cgit v0.10.2 From 688bb4158f64f3af0fe1d13e7642f8c4c402453e Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 11 Jul 2012 14:02:50 -0700 Subject: xtensa: fix incorrect memset Addresses: https://bugzilla.kernel.org/show_bug.cgi?id=43871 Reported-by: Signed-off-by: Alan Cox Signed-off-by: Chris Zankel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 9b306e5..2c8d6a3 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -277,7 +277,7 @@ void xtensa_elf_core_copy_regs (xtensa_gregset_t *elfregs, struct pt_regs *regs) /* Don't leak any random bits. */ - memset(elfregs, 0, sizeof (elfregs)); + memset(elfregs, 0, sizeof(*elfregs)); /* Note: PS.EXCM is not set while user task is running; its * being set in regs->ps is for exception handling convenience. -- cgit v0.10.2 From 07b4e2bc9c35ea88cbd36d806fcd5e3bcbf022be Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:51 -0700 Subject: mm: sparse: fix section usemap placement calculation Commit 238305bb4d41 ("mm: remove sparsemem allocation details from the bootmem allocator") introduced a bug in the allocation goal calculation that put section usemaps not in the same section as the node descriptors, creating unnecessary hotplug dependencies between them: node 0 must be removed before remove section 16399 node 1 must be removed before remove section 16399 node 2 must be removed before remove section 16399 node 3 must be removed before remove section 16399 node 4 must be removed before remove section 16399 node 5 must be removed before remove section 16399 node 6 must be removed before remove section 16399 The reason is that it applies PAGE_SECTION_MASK to the physical address of the node descriptor when finding a suitable place to put the usemap, when this mask is actually intended to be used with PFNs. Because the PFN mask is wider, the target address will point beyond the wanted section holding the node descriptor and the node must be offlined before the section holding the usemap can go. Fix this by extending the mask to address width before use. Signed-off-by: Yinghai Lu Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/mm/sparse.c b/mm/sparse.c index 6a4bf91..e861397 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -287,7 +287,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * from the same section as the pgdat where possible to avoid * this problem. */ - goal = __pa(pgdat) & PAGE_SECTION_MASK; + goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); return __alloc_bootmem_node_nopanic(host_pgdat, size, SMP_CACHE_BYTES, goal); -- cgit v0.10.2 From 99ab7b19440a72ebdf225f99b20f8ef40decee86 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:53 -0700 Subject: mm: sparse: fix usemap allocation above node descriptor section After commit f5bf18fa22f8 ("bootmem/sparsemem: remove limit constraint in alloc_bootmem_section"), usemap allocations may easily be placed outside the optimal section that holds the node descriptor, even if there is space available in that section. This results in unnecessary hotplug dependencies that need to have the node unplugged before the section holding the usemap. The reason is that the bootmem allocator doesn't guarantee a linear search starting from the passed allocation goal but may start out at a much higher address absent an upper limit. Fix this by trying the allocation with the limit at the section end, then retry without if that fails. This keeps the fix from f5bf18fa22f8 of not panicking if the allocation does not fit in the section, but still makes sure to try to stay within the section at first. Signed-off-by: Yinghai Lu Signed-off-by: Johannes Weiner Cc: [3.3.x, 3.4.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/bootmem.h b/include/linux/bootmem.h index 324fe08..6d6795d 100644 --- a/include/linux/bootmem.h +++ b/include/linux/bootmem.h @@ -91,6 +91,11 @@ extern void *__alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal); +void *___alloc_bootmem_node_nopanic(pg_data_t *pgdat, + unsigned long size, + unsigned long align, + unsigned long goal, + unsigned long limit); extern void *__alloc_bootmem_low(unsigned long size, unsigned long align, unsigned long goal); diff --git a/mm/bootmem.c b/mm/bootmem.c index ec4fcb7..7309663 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -698,7 +698,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, unsigned long limit) { diff --git a/mm/nobootmem.c b/mm/nobootmem.c index d23415c..0900b39 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -274,7 +274,7 @@ void * __init __alloc_bootmem(unsigned long size, unsigned long align, return ___alloc_bootmem(size, align, goal, limit); } -static void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, +void * __init ___alloc_bootmem_node_nopanic(pg_data_t *pgdat, unsigned long size, unsigned long align, unsigned long goal, diff --git a/mm/sparse.c b/mm/sparse.c index e861397..c7bb952 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -275,8 +275,9 @@ static unsigned long * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, unsigned long size) { - pg_data_t *host_pgdat; - unsigned long goal; + unsigned long goal, limit; + unsigned long *p; + int nid; /* * A page may contain usemaps for other sections preventing the * page being freed and making a section unremovable while @@ -288,9 +289,16 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * this problem. */ goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); - host_pgdat = NODE_DATA(early_pfn_to_nid(goal >> PAGE_SHIFT)); - return __alloc_bootmem_node_nopanic(host_pgdat, size, - SMP_CACHE_BYTES, goal); + limit = goal + (1UL << PA_SECTION_SHIFT); + nid = early_pfn_to_nid(goal >> PAGE_SHIFT); +again: + p = ___alloc_bootmem_node_nopanic(NODE_DATA(nid), size, + SMP_CACHE_BYTES, goal, limit); + if (!p && limit) { + limit = 0; + goto again; + } + return p; } static void __init check_usemap_section_nr(int nid, unsigned long *usemap) -- cgit v0.10.2 From 29f6738609e40227dabcc63bfb3b84b3726a75bd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 11 Jul 2012 14:02:56 -0700 Subject: memblock: free allocated memblock_reserved_regions later memblock_free_reserved_regions() calls memblock_free(), but memblock_free() would double reserved.regions too, so we could free the old range for reserved.regions. Also tj said there is another bug which could be related to this. | I don't think we're saving any noticeable | amount by doing this "free - give it to page allocator - reserve | again" dancing. We should just allocate regions aligned to page | boundaries and free them later when memblock is no longer in use. in that case, when DEBUG_PAGEALLOC, will get panic: memblock_free: [0x0000102febc080-0x0000102febf080] memblock_free_reserved_regions+0x37/0x39 BUG: unable to handle kernel paging request at ffff88102febd948 IP: [] __next_free_mem_range+0x9b/0x155 PGD 4826063 PUD cf67a067 PMD cf7fa067 PTE 800000102febd160 Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC CPU 0 Pid: 0, comm: swapper Not tainted 3.5.0-rc2-next-20120614-sasha #447 RIP: 0010:[] [] __next_free_mem_range+0x9b/0x155 See the discussion at https://lkml.org/lkml/2012/6/13/469 So try to allocate with PAGE_SIZE alignment and free it later. Reported-by: Sasha Levin Acked-by: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Yinghai Lu Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds diff --git a/include/linux/memblock.h b/include/linux/memblock.h index a6bb102..19dc455 100644 --- a/include/linux/memblock.h +++ b/include/linux/memblock.h @@ -50,9 +50,7 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align, int nid); phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end, phys_addr_t size, phys_addr_t align); -int memblock_free_reserved_regions(void); -int memblock_reserve_reserved_regions(void); - +phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr); void memblock_allow_resize(void); int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid); int memblock_add(phys_addr_t base, phys_addr_t size); diff --git a/mm/memblock.c b/mm/memblock.c index d438209..5cc6731 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -143,30 +143,6 @@ phys_addr_t __init_memblock memblock_find_in_range(phys_addr_t start, MAX_NUMNODES); } -/* - * Free memblock.reserved.regions - */ -int __init_memblock memblock_free_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_free(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - -/* - * Reserve memblock.reserved.regions - */ -int __init_memblock memblock_reserve_reserved_regions(void) -{ - if (memblock.reserved.regions == memblock_reserved_init_regions) - return 0; - - return memblock_reserve(__pa(memblock.reserved.regions), - sizeof(struct memblock_region) * memblock.reserved.max); -} - static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r) { type->total_size -= type->regions[r].size; @@ -184,6 +160,18 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u } } +phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info( + phys_addr_t *addr) +{ + if (memblock.reserved.regions == memblock_reserved_init_regions) + return 0; + + *addr = __pa(memblock.reserved.regions); + + return PAGE_ALIGN(sizeof(struct memblock_region) * + memblock.reserved.max); +} + /** * memblock_double_array - double the size of the memblock regions array * @type: memblock type of the regions array being doubled @@ -204,6 +192,7 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, phys_addr_t new_area_size) { struct memblock_region *new_array, *old_array; + phys_addr_t old_alloc_size, new_alloc_size; phys_addr_t old_size, new_size, addr; int use_slab = slab_is_available(); int *in_slab; @@ -217,6 +206,12 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, /* Calculate new doubled size */ old_size = type->max * sizeof(struct memblock_region); new_size = old_size << 1; + /* + * We need to allocated new one align to PAGE_SIZE, + * so we can free them completely later. + */ + old_alloc_size = PAGE_ALIGN(old_size); + new_alloc_size = PAGE_ALIGN(new_size); /* Retrieve the slab flag */ if (type == &memblock.memory) @@ -245,11 +240,11 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, addr = memblock_find_in_range(new_area_start + new_area_size, memblock.current_limit, - new_size, sizeof(phys_addr_t)); + new_alloc_size, PAGE_SIZE); if (!addr && new_area_size) addr = memblock_find_in_range(0, min(new_area_start, memblock.current_limit), - new_size, sizeof(phys_addr_t)); + new_alloc_size, PAGE_SIZE); new_array = addr ? __va(addr) : 0; } @@ -279,13 +274,13 @@ static int __init_memblock memblock_double_array(struct memblock_type *type, kfree(old_array); else if (old_array != memblock_memory_init_regions && old_array != memblock_reserved_init_regions) - memblock_free(__pa(old_array), old_size); + memblock_free(__pa(old_array), old_alloc_size); /* Reserve the new array if that comes from the memblock. * Otherwise, we needn't do it */ if (!use_slab) - BUG_ON(memblock_reserve(addr, new_size)); + BUG_ON(memblock_reserve(addr, new_alloc_size)); /* Update slab flag */ *in_slab = use_slab; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 0900b39..4055730 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -105,27 +105,35 @@ static void __init __free_pages_memory(unsigned long start, unsigned long end) __free_pages_bootmem(pfn_to_page(i), 0); } +static unsigned long __init __free_memory_core(phys_addr_t start, + phys_addr_t end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = min_t(unsigned long, + PFN_DOWN(end), max_low_pfn); + + if (start_pfn > end_pfn) + return 0; + + __free_pages_memory(start_pfn, end_pfn); + + return end_pfn - start_pfn; +} + unsigned long __init free_low_memory_core_early(int nodeid) { unsigned long count = 0; - phys_addr_t start, end; + phys_addr_t start, end, size; u64 i; - /* free reserved array temporarily so that it's treated as free area */ - memblock_free_reserved_regions(); - - for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { - unsigned long start_pfn = PFN_UP(start); - unsigned long end_pfn = min_t(unsigned long, - PFN_DOWN(end), max_low_pfn); - if (start_pfn < end_pfn) { - __free_pages_memory(start_pfn, end_pfn); - count += end_pfn - start_pfn; - } - } + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) + count += __free_memory_core(start, end); + + /* free range that is used for reserved array if we allocate it */ + size = get_allocated_memblock_reserved_regions_info(&start); + if (size) + count += __free_memory_core(start, start + size); - /* put region array back? */ - memblock_reserve_reserved_regions(); return count; } -- cgit v0.10.2