From fb50b020c5331c8c4bee0eb875865f5f8be6c03a Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:53:09 -0800 Subject: x86: Move some contents of page_64_types.h into pgtable_64.h and page_64.h This patch is meant to clean-up the fact that we have several functions in page_64_types.h which really don't belong there. I found this issue when I had tried to replace __phys_addr with an inline function. It resulted in the realmode bits generating compile warnings about types. In order to resolve that I am relocating the address translation to page_64.h since this is in keeping with where these functions are located in 32 bit. In addtion I have relocated several functions defined in init_64.c to pgtable_64.h as this seems to be where most of the functions related to memory initialization were already located. [ hpa: added missing #include to apic_numachip.c, as reported by Yinghai Lu. ] Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215244.8521.31505.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin Cc: Yinghai Lu Cc: Daniel J Blueman diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 072694e..4150999 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -3,4 +3,23 @@ #include +#ifndef __ASSEMBLY__ + +/* duplicated to the one in bootmem.h */ +extern unsigned long max_pfn; +extern unsigned long phys_base; + +extern unsigned long __phys_addr(unsigned long); + +#define __phys_reloc_hide(x) (x) + +#ifdef CONFIG_FLATMEM +#define pfn_valid(pfn) ((pfn) < max_pfn) +#endif + +void clear_page(void *page); +void copy_page(void *to, void *from); + +#endif /* !__ASSEMBLY__ */ + #endif /* _ASM_X86_PAGE_64_H */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 320f7bb..8b491e6 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -50,26 +50,4 @@ #define KERNEL_IMAGE_SIZE (512 * 1024 * 1024) #define KERNEL_IMAGE_START _AC(0xffffffff80000000, UL) -#ifndef __ASSEMBLY__ -void clear_page(void *page); -void copy_page(void *to, void *from); - -/* duplicated to the one in bootmem.h */ -extern unsigned long max_pfn; -extern unsigned long phys_base; - -extern unsigned long __phys_addr(unsigned long); -#define __phys_reloc_hide(x) (x) - -#define vmemmap ((struct page *)VMEMMAP_START) - -extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); -extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); - -#endif /* !__ASSEMBLY__ */ - -#ifdef CONFIG_FLATMEM -#define pfn_valid(pfn) ((pfn) < max_pfn) -#endif - #endif /* _ASM_X86_PAGE_64_DEFS_H */ diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 47356f9..b5d30ad 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -183,6 +183,11 @@ extern void cleanup_highmap(void); #define __HAVE_ARCH_PTE_SAME +#define vmemmap ((struct page *)VMEMMAP_START) + +extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); +extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index a65829a..ae9196f 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -27,6 +27,7 @@ #include #include #include +#include static int numachip_system __read_mostly; -- cgit v0.10.2 From 0bdf525f04afd3a32c14e5a8778771f9c9e0f074 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:53:51 -0800 Subject: x86: Improve __phys_addr performance by making use of carry flags and inlining This patch is meant to improve overall system performance when making use of the __phys_addr call. To do this I have implemented several changes. First if CONFIG_DEBUG_VIRTUAL is not defined __phys_addr is made an inline, similar to how this is currently handled in 32 bit. However in order to do this it is required to export phys_base so that it is available if __phys_addr is used in kernel modules. The second change was to streamline the code by making use of the carry flag on an add operation instead of performing a compare on a 64 bit value. The advantage to this is that it allows us to significantly reduce the overall size of the call. On my Xeon E5 system the entire __phys_addr inline call consumes a little less than 32 bytes and 5 instructions. I also applied similar logic to the debug version of the function. My testing shows that the debug version of the function with this patch applied is slightly faster than the non-debug version without the patch. Finally I also applied the same logic changes to __virt_addr_valid since it used the same general code flow as __phys_addr and could achieve similar gains though these changes. Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215315.8521.46270.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 4150999..5138174 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -9,7 +9,21 @@ extern unsigned long max_pfn; extern unsigned long phys_base; +static inline unsigned long __phys_addr_nodebug(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + x = y + ((x > y) ? phys_base : (__START_KERNEL_map - PAGE_OFFSET)); + + return x; +} + +#ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); +#else +#define __phys_addr(x) __phys_addr_nodebug(x) +#endif #define __phys_reloc_hide(x) (x) diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 1330dd1..b014d94 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -59,6 +59,9 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(memmove); +#ifndef CONFIG_DEBUG_VIRTUAL +EXPORT_SYMBOL(phys_base); +#endif EXPORT_SYMBOL(empty_zero_page); #ifndef CONFIG_PARAVIRT EXPORT_SYMBOL(native_load_gs_index); diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index d2e2735..fd40d75 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -8,33 +8,43 @@ #ifdef CONFIG_X86_64 +#ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - VIRTUAL_BUG_ON(x >= KERNEL_IMAGE_SIZE); - x += phys_base; + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); } else { - VIRTUAL_BUG_ON(x < PAGE_OFFSET); - x -= PAGE_OFFSET; - VIRTUAL_BUG_ON(!phys_addr_valid(x)); + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + VIRTUAL_BUG_ON((x > y) || !phys_addr_valid(x)); } + return x; } EXPORT_SYMBOL(__phys_addr); +#endif bool __virt_addr_valid(unsigned long x) { - if (x >= __START_KERNEL_map) { - x -= __START_KERNEL_map; - if (x >= KERNEL_IMAGE_SIZE) + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) return false; - x += phys_base; } else { - if (x < PAGE_OFFSET) - return false; - x -= PAGE_OFFSET; - if (!phys_addr_valid(x)) + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !phys_addr_valid(x)) return false; } -- cgit v0.10.2 From 7d74275d39def4d3ccc8cf4725388bf79ef13861 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:55:46 -0800 Subject: x86: Make it so that __pa_symbol can only process kernel symbols on x86_64 I submitted an earlier patch that make __phys_addr an inline. This obviously results in an increase in the code size. One step I can take to reduce that is to make it so that the __pa_symbol call does a direct translation for kernel addresses instead of covering all of virtual memory. On my system this reduced the size for __pa_symbol from 5 instructions totalling 30 bytes to 3 instructions totalling 16 bytes. Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215356.8521.92472.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/include/asm/page.h b/arch/x86/include/asm/page.h index 8ca8283..3698a6a 100644 --- a/arch/x86/include/asm/page.h +++ b/arch/x86/include/asm/page.h @@ -44,7 +44,8 @@ static inline void copy_user_page(void *to, void *from, unsigned long vaddr, * case properly. Once all supported versions of gcc understand it, we can * remove this Voodoo magic stuff. (i.e. once gcc3.x is deprecated) */ -#define __pa_symbol(x) __pa(__phys_reloc_hide((unsigned long)(x))) +#define __pa_symbol(x) \ + __phys_addr_symbol(__phys_reloc_hide((unsigned long)(x))) #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) diff --git a/arch/x86/include/asm/page_32.h b/arch/x86/include/asm/page_32.h index da4e762..4d550d0 100644 --- a/arch/x86/include/asm/page_32.h +++ b/arch/x86/include/asm/page_32.h @@ -15,6 +15,7 @@ extern unsigned long __phys_addr(unsigned long); #else #define __phys_addr(x) __phys_addr_nodebug(x) #endif +#define __phys_addr_symbol(x) __phys_addr(x) #define __phys_reloc_hide(x) RELOC_HIDE((x), 0) #ifdef CONFIG_FLATMEM diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index 5138174..0f1ddee 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -21,8 +21,11 @@ static inline unsigned long __phys_addr_nodebug(unsigned long x) #ifdef CONFIG_DEBUG_VIRTUAL extern unsigned long __phys_addr(unsigned long); +extern unsigned long __phys_addr_symbol(unsigned long); #else #define __phys_addr(x) __phys_addr_nodebug(x) +#define __phys_addr_symbol(x) \ + ((unsigned long)(x) - __START_KERNEL_map + phys_base) #endif #define __phys_reloc_hide(x) (x) diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index fd40d75..c73fedd 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -28,6 +28,17 @@ unsigned long __phys_addr(unsigned long x) return x; } EXPORT_SYMBOL(__phys_addr); + +unsigned long __phys_addr_symbol(unsigned long x) +{ + unsigned long y = x - __START_KERNEL_map; + + /* only check upper bounds since lower bounds will trigger carry */ + VIRTUAL_BUG_ON(y >= KERNEL_IMAGE_SIZE); + + return y + phys_base; +} +EXPORT_SYMBOL(__phys_addr_symbol); #endif bool __virt_addr_valid(unsigned long x) -- cgit v0.10.2 From 05a476b6e3795f205806662bf09ab95774266292 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:56:35 -0800 Subject: x86: Drop 4 unnecessary calls to __pa_symbol While debugging the __pa_symbol inline patch I found that there were a couple spots where __pa_symbol was used as follows: __pa_symbol(x) - __pa_symbol(y) The compiler had reduced them to: x - y Since we also support a debug case where __pa_symbol is a function call it would probably be useful to just change the two cases I found so that they are always just treated as "x - y". As such I am casting the values to phys_addr_t and then doing simple subtraction so that the correct type and value is returned. Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215552.8521.68085.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index c18f59d..f15db0c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -30,8 +30,8 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + memblock_reserve(__pa_symbol(_text), + (phys_addr_t)__bss_stop - (phys_addr_t)_text); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 037df57..42f5df1 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -97,8 +97,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - memblock_reserve(__pa_symbol(&_text), - __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); + memblock_reserve(__pa_symbol(_text), + (phys_addr_t)__bss_stop - (phys_addr_t)_text); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ -- cgit v0.10.2 From fc8d782677f163dee76427fdd8a92bebd2b50b23 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:57:13 -0800 Subject: x86: Use __pa_symbol instead of __pa on C visible symbols When I made an attempt at separating __pa_symbol and __pa I found that there were a number of cases where __pa was used on an obvious symbol. I also caught one non-obvious case as _brk_start and _brk_end are based on the address of __brk_base which is a C visible symbol. In mark_rodata_ro I was able to reduce the overhead of kernel symbol to virtual memory translation by using a combination of __va(__pa_symbol()) instead of page_address(virt_to_page()). Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215640.8521.80483.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 198e019..2249e7e 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -168,7 +168,7 @@ int __cpuinit ppro_with_ram_bug(void) #ifdef CONFIG_X86_F00F_BUG static void __cpuinit trap_init_f00f_bug(void) { - __set_fixmap(FIX_F00F_IDT, __pa(&idt_table), PAGE_KERNEL_RO); + __set_fixmap(FIX_F00F_IDT, __pa_symbol(idt_table), PAGE_KERNEL_RO); /* * Update the IDT descriptor and reload the IDT so that diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index ca45696..2702c5d 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -300,8 +300,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_reserve(__pa(_brk_start), - __pa(_brk_end) - __pa(_brk_start)); + memblock_reserve(__pa_symbol(_brk_start), + _brk_end - _brk_start); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -761,12 +761,12 @@ void __init setup_arch(char **cmdline_p) init_mm.end_data = (unsigned long) _edata; init_mm.brk = _brk_end; - code_resource.start = virt_to_phys(_text); - code_resource.end = virt_to_phys(_etext)-1; - data_resource.start = virt_to_phys(_etext); - data_resource.end = virt_to_phys(_edata)-1; - bss_resource.start = virt_to_phys(&__bss_start); - bss_resource.end = virt_to_phys(&__bss_stop)-1; + code_resource.start = __pa_symbol(_text); + code_resource.end = __pa_symbol(_etext)-1; + data_resource.start = __pa_symbol(_etext); + data_resource.end = __pa_symbol(_edata)-1; + bss_resource.start = __pa_symbol(__bss_start); + bss_resource.end = __pa_symbol(__bss_stop)-1; #ifdef CONFIG_CMDLINE_BOOL #ifdef CONFIG_CMDLINE_OVERRIDE diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 3baff25..0374a10 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -770,12 +770,10 @@ void set_kernel_text_ro(void) void mark_rodata_ro(void) { unsigned long start = PFN_ALIGN(_text); - unsigned long rodata_start = - ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; + unsigned long rodata_start = PFN_ALIGN(__start_rodata); unsigned long end = (unsigned long) &__end_rodata_hpage_align; - unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table); - unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata); - unsigned long data_start = (unsigned long) &_sdata; + unsigned long text_end = PFN_ALIGN(&__stop___ex_table); + unsigned long rodata_end = PFN_ALIGN(&__end_rodata); printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -800,12 +798,12 @@ void mark_rodata_ro(void) #endif free_init_pages("unused kernel memory", - (unsigned long) page_address(virt_to_page(text_end)), - (unsigned long) - page_address(virt_to_page(rodata_start))); + (unsigned long) __va(__pa_symbol(text_end)), + (unsigned long) __va(__pa_symbol(rodata_start))); + free_init_pages("unused kernel memory", - (unsigned long) page_address(virt_to_page(rodata_end)), - (unsigned long) page_address(virt_to_page(data_start))); + (unsigned long) __va(__pa_symbol(rodata_end)), + (unsigned long) __va(__pa_symbol(_sdata))); } #endif diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a718e0d..40f92f3 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -94,12 +94,12 @@ static inline void split_page_count(int level) { } static inline unsigned long highmap_start_pfn(void) { - return __pa(_text) >> PAGE_SHIFT; + return __pa_symbol(_text) >> PAGE_SHIFT; } static inline unsigned long highmap_end_pfn(void) { - return __pa(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; + return __pa_symbol(roundup(_brk_end, PMD_SIZE)) >> PAGE_SHIFT; } #endif @@ -276,8 +276,8 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, * The .rodata section needs to be read-only. Using the pfn * catches all aliases. */ - if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, - __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) + if (within(pfn, __pa_symbol(__start_rodata) >> PAGE_SHIFT, + __pa_symbol(__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; #if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index ad443914..1b60026 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -410,8 +410,8 @@ void __init efi_reserve_boot_services(void) * - Not within any part of the kernel * - Not the bios reserved area */ - if ((start+size >= virt_to_phys(_text) - && start <= virt_to_phys(_end)) || + if ((start+size >= __pa_symbol(_text) + && start <= __pa_symbol(_end)) || !e820_all_mapped(start, start+size, E820_RAM) || memblock_is_region_reserved(start, size)) { /* Could not reserve, skip it */ diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c index cbca565..8045026 100644 --- a/arch/x86/realmode/init.c +++ b/arch/x86/realmode/init.c @@ -62,9 +62,9 @@ void __init setup_real_mode(void) __va(real_mode_header->trampoline_header); #ifdef CONFIG_X86_32 - trampoline_header->start = __pa(startup_32_smp); + trampoline_header->start = __pa_symbol(startup_32_smp); trampoline_header->gdt_limit = __BOOT_DS + 7; - trampoline_header->gdt_base = __pa(boot_gdt); + trampoline_header->gdt_base = __pa_symbol(boot_gdt); #else /* * Some AMD processors will #GP(0) if EFER.LMA is set in WRMSR @@ -78,8 +78,8 @@ void __init setup_real_mode(void) *trampoline_cr4_features = read_cr4(); trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd); - trampoline_pgd[0] = __pa(level3_ident_pgt) + _KERNPG_TABLE; - trampoline_pgd[511] = __pa(level3_kernel_pgt) + _KERNPG_TABLE; + trampoline_pgd[0] = __pa_symbol(level3_ident_pgt) + _KERNPG_TABLE; + trampoline_pgd[511] = __pa_symbol(level3_kernel_pgt) + _KERNPG_TABLE; #endif } -- cgit v0.10.2 From 217f155e9fc68bf2a6c58a7b47e0d1ce68d78818 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:57:32 -0800 Subject: x86/ftrace: Use __pa_symbol instead of __pa on C visible symbols Instead of using __pa which is meant to be a general function for converting virtual addresses to physical addresses we can use __pa_symbol which is the preferred way of decoding kernel text virtual addresses to physical addresses. In this case we are not directly converting C visible symbols however if we know that the instruction pointer is somewhere between _text and _etext we know that we are going to be translating an address form the kernel text space. Cc: Steven Rostedt Cc: Frederic Weisbecker Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215718.8521.24026.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 1d41402..42a392a 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -89,7 +89,7 @@ do_ftrace_mod_code(unsigned long ip, const void *new_code) * kernel identity mapping to modify code. */ if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa(ip)); + ip = (unsigned long)__va(__pa_symbol(ip)); return probe_kernel_write((void *)ip, new_code, MCOUNT_INSN_SIZE); } @@ -279,7 +279,7 @@ static int ftrace_write(unsigned long ip, const char *val, int size) * kernel identity mapping to modify code. */ if (within(ip, (unsigned long)_text, (unsigned long)_etext)) - ip = (unsigned long)__va(__pa(ip)); + ip = (unsigned long)__va(__pa_symbol(ip)); return probe_kernel_write((void *)ip, val, size); } -- cgit v0.10.2 From afd51a0e32cd79261f0e823400886ed322a355ac Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:57:43 -0800 Subject: x86/acpi: Use __pa_symbol instead of __pa on C visible symbols This change just updates one spot where __pa was being used when __pa_symbol should have been used. By using __pa_symbol we are able to drop a few extra lines of code as we don't have to test to see if the virtual pointer is a part of the kernel text or just standard virtual memory. Cc: Len Brown Cc: Pavel Machek Acked-by: "Rafael J. Wysocki" Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215737.8521.51167.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 11676cf..f146a3c 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -69,7 +69,7 @@ int acpi_suspend_lowlevel(void) #ifndef CONFIG_64BIT header->pmode_entry = (u32)&wakeup_pmode_return; - header->pmode_cr3 = (u32)__pa(&initial_page_table); + header->pmode_cr3 = (u32)__pa_symbol(initial_page_table); saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP -- cgit v0.10.2 From 6a3956bd242926f8956992f6ed7805b0811be003 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Fri, 16 Nov 2012 13:58:12 -0800 Subject: x86/lguest: Use __pa_symbol instead of __pa on C visible symbols The function lguest_write_cr3 is using __pa to convert swapper_pg_dir and initial_page_table from virtual addresses to physical. The correct function to use for these values is __pa_symbol since they are C visible symbols. Cc: Rusty Russell Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121116215748.8521.83556.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 642d880..139dd35 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -552,7 +552,8 @@ static void lguest_write_cr3(unsigned long cr3) current_cr3 = cr3; /* These two page tables are simple, linear, and used during boot */ - if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table)) + if (cr3 != __pa_symbol(swapper_pg_dir) && + cr3 != __pa_symbol(initial_page_table)) cr3_changed = true; } -- cgit v0.10.2 From bbee3aec3472fc2ca10b6b1020aec84567ea25ce Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 19 Nov 2012 10:31:37 -0800 Subject: x86: Fix warning about cast from pointer to integer of different size This patch fixes a warning reported by the kbuild test robot where we were casting a pointer to a physical address which represents an integer of a different size. Per the suggestion of Peter Anvin I am replacing it and one other spot where I made a similar cast with an unsigned long. Signed-off-by: Alexander Duyck Link: http://lkml.kernel.org/r/20121119182927.3655.7641.stgit@ahduyck-cp1.jf.intel.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index f15db0c..e175548 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,7 +31,7 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { memblock_reserve(__pa_symbol(_text), - (phys_addr_t)__bss_stop - (phys_addr_t)_text); + (unsigned long)__bss_stop - (unsigned long)_text); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 42f5df1..7b215a5 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,7 +98,7 @@ void __init x86_64_start_reservations(char *real_mode_data) copy_bootdata(__va(real_mode_data)); memblock_reserve(__pa_symbol(_text), - (phys_addr_t)__bss_stop - (phys_addr_t)_text); + (unsigned long)__bss_stop - (unsigned long)_text); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ -- cgit v0.10.2 From 5e4bf1a55da976a5ed60901bb8801f1024ef9774 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 20 Nov 2012 13:02:51 +0100 Subject: x86/mm: Don't flush the TLB on #WP pmd fixups If we have a write protection #PF and fix up the pmd then the hugetlb code [the only user of pmdp_set_access_flags], in its do_huge_pmd_wp_page() page fault resolution function calls pmdp_set_access_flags() to mark the pmd permissive again, and flushes the TLB. This TLB flush is unnecessary: a flush on #PF is guaranteed on most (all?) x86 CPUs, and even in the worst-case we'll generate a spurious fault. So remove it. Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Paul Turner Cc: Lee Schermerhorn Cc: Andrea Arcangeli Cc: Rik van Riel Cc: Johannes Weiner Cc: Christoph Lameter Cc: Mel Gorman Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20121120120251.GA15742@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 8573b83..8a828d7 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -328,7 +328,12 @@ int pmdp_set_access_flags(struct vm_area_struct *vma, if (changed && dirty) { *pmdp = entry; pmd_update_defer(vma->vm_mm, address, pmdp); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + /* + * We had a write-protection fault here and changed the pmd + * to to more permissive. No need to flush the TLB for that, + * #PF is architecturally guaranteed to do that and in the + * worst-case we'll generate a spurious fault. + */ } return changed; -- cgit v0.10.2 From a25b9316841c5afa226f8f70a457861b35276a92 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:30 -0800 Subject: x86, mm: Make DEBUG_VIRTUAL work earlier in boot The KVM code has some repeated bugs in it around use of __pa() on per-cpu data. Those data are not in an area on which using __pa() is valid. However, they are also called early enough in boot that __vmalloc_start_set is not set, and thus the CONFIG_DEBUG_VIRTUAL debugging does not catch them. This adds a check to also verify __pa() calls against max_low_pfn, which we can use earler in boot than is_vmalloc_addr(). However, if we are super-early in boot, max_low_pfn=0 and this will trip on every call, so also make sure that max_low_pfn is set before we try to use it. With this patch applied, CONFIG_DEBUG_VIRTUAL will actually catch the bug I was chasing (and fix later in this series). I'd love to find a generic way so that any __pa() call on percpu areas could do a BUG_ON(), but there don't appear to be any nice and easy ways to check if an address is a percpu one. Anybody have ideas on a way to do this? Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212430.F46F8159@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 2d125be..76604eb 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -219,7 +219,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) */ nd = alloc_remap(nid, nd_size); if (nd) { - nd_pa = __pa(nd); + nd_pa = __phys_addr_nodebug(nd); remapped = true; } else { nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index 0eb572e..2610bd9 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -560,10 +560,10 @@ int kernel_map_sync_memtype(u64 base, unsigned long size, unsigned long flags) { unsigned long id_sz; - if (base >= __pa(high_memory)) + if (base > __pa(high_memory-1)) return 0; - id_sz = (__pa(high_memory) < base + size) ? + id_sz = (__pa(high_memory-1) <= base + size) ? __pa(high_memory) - base : size; diff --git a/arch/x86/mm/physaddr.c b/arch/x86/mm/physaddr.c index c73fedd..e666cbb 100644 --- a/arch/x86/mm/physaddr.c +++ b/arch/x86/mm/physaddr.c @@ -1,3 +1,4 @@ +#include #include #include #include @@ -68,10 +69,16 @@ EXPORT_SYMBOL(__virt_addr_valid); #ifdef CONFIG_DEBUG_VIRTUAL unsigned long __phys_addr(unsigned long x) { + unsigned long phys_addr = x - PAGE_OFFSET; /* VMALLOC_* aren't constants */ VIRTUAL_BUG_ON(x < PAGE_OFFSET); VIRTUAL_BUG_ON(__vmalloc_start_set && is_vmalloc_addr((void *) x)); - return x - PAGE_OFFSET; + /* max_low_pfn is set early, but not _that_ early */ + if (max_low_pfn) { + VIRTUAL_BUG_ON((phys_addr >> PAGE_SHIFT) > max_low_pfn); + BUG_ON(slow_virt_to_phys((void *)x) != phys_addr); + } + return phys_addr; } EXPORT_SYMBOL(__phys_addr); #endif -- cgit v0.10.2 From 4cbeb51b860c57ba8b2ae50c4016ee7a41f5fbd5 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:31 -0800 Subject: x86, mm: Pagetable level size/shift/mask helpers I plan to use lookup_address() to walk the kernel pagetables in a later patch. It returns a "pte" and the level in the pagetables where the "pte" was found. The level is just an enum and needs to be converted to a useful value in order to do address calculations with it. These helpers will be used in at least two places. This also gives the anonymous enum a real name so that no one gets confused about what they should be passing in to these helpers. "PTE_SHIFT" was chosen for naming consistency with the other pagetable levels (PGD/PUD/PMD_SHIFT). Cc: H. Peter Anvin Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212431.405D3A8C@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 5199db2..bc28e6f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -390,6 +390,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); #ifndef __ASSEMBLY__ #include +#include static inline int pte_none(pte_t pte) { @@ -781,6 +782,19 @@ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) memcpy(dst, src, count * sizeof(pgd_t)); } +#define PTE_SHIFT ilog2(PTRS_PER_PTE) +static inline int page_level_shift(enum pg_level level) +{ + return (PAGE_SHIFT - PTE_SHIFT) + level * PTE_SHIFT; +} +static inline unsigned long page_level_size(enum pg_level level) +{ + return 1UL << page_level_shift(level); +} +static inline unsigned long page_level_mask(enum pg_level level) +{ + return ~(page_level_size(level) - 1); +} #include #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 3c32db8..6c297e7 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -331,7 +331,7 @@ extern void native_pagetable_init(void); struct seq_file; extern void arch_report_meminfo(struct seq_file *m); -enum { +enum pg_level { PG_LEVEL_NONE, PG_LEVEL_4K, PG_LEVEL_2M, -- cgit v0.10.2 From f3c4fbb68e93b10c781c0cc462a9d80770244da6 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:32 -0800 Subject: x86, mm: Use new pagetable helpers in try_preserve_large_page() try_preserve_large_page() can be slightly simplified by using the new page_level_*() helpers. This also moves the 'level' over to the new pg_level enum type. Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212432.14F3D993@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 40f92f3..2a5c9ab 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -396,7 +396,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot, req_prot; int i, do_split = 1; - unsigned int level; + enum pg_level level; if (cpa->force_split) return 1; @@ -412,15 +412,12 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, switch (level) { case PG_LEVEL_2M: - psize = PMD_PAGE_SIZE; - pmask = PMD_PAGE_MASK; - break; #ifdef CONFIG_X86_64 case PG_LEVEL_1G: - psize = PUD_PAGE_SIZE; - pmask = PUD_PAGE_MASK; - break; #endif + psize = page_level_size(level); + pmask = page_level_mask(level); + break; default: do_split = -EINVAL; goto out_unlock; -- cgit v0.10.2 From d765653445129b7c476758040e3079480775f80a Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:33 -0800 Subject: x86, mm: Create slow_virt_to_phys() This is necessary because __pa() does not work on some kinds of memory, like vmalloc() or the alloc_remap() areas on 32-bit NUMA systems. We have some functions to do conversions _like_ this in the vmalloc() code (like vmalloc_to_page()), but they do not work on sizes other than 4k pages. We would potentially need to be able to handle all the page sizes that we use for the kernel linear mapping (4k, 2M, 1G). In practice, on 32-bit NUMA systems, the percpu areas get stuck in the alloc_remap() area. Any __pa() call on them will break and basically return garbage. This patch introduces a new function slow_virt_to_phys(), which walks the kernel page tables on x86 and should do precisely the same logical thing as __pa(), but actually work on a wider range of memory. It should work on the normal linear mapping, vmalloc(), kmap(), etc... Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212433.4D1FCA62@kernel.stglabs.ibm.com Acked-by: Rik van Riel Signed-off-by: H. Peter Anvin diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 6c297e7..9f82690 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -352,6 +352,7 @@ static inline void update_page_count(int level, unsigned long pages) { } * as a pte too. */ extern pte_t *lookup_address(unsigned long address, unsigned int *level); +extern phys_addr_t slow_virt_to_phys(void *__address); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2a5c9ab..6d13d2a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -364,6 +364,37 @@ pte_t *lookup_address(unsigned long address, unsigned int *level) EXPORT_SYMBOL_GPL(lookup_address); /* + * This is necessary because __pa() does not work on some + * kinds of memory, like vmalloc() or the alloc_remap() + * areas on 32-bit NUMA systems. The percpu areas can + * end up in this kind of memory, for instance. + * + * This could be optimized, but it is only intended to be + * used at inititalization time, and keeping it + * unoptimized should increase the testing coverage for + * the more obscure platforms. + */ +phys_addr_t slow_virt_to_phys(void *__virt_addr) +{ + unsigned long virt_addr = (unsigned long)__virt_addr; + phys_addr_t phys_addr; + unsigned long offset; + enum pg_level level; + unsigned long psize; + unsigned long pmask; + pte_t *pte; + + pte = lookup_address(virt_addr, &level); + BUG_ON(!pte); + psize = page_level_size(level); + pmask = page_level_mask(level); + offset = virt_addr & ~pmask; + phys_addr = pte_pfn(*pte) << PAGE_SHIFT; + return (phys_addr | offset); +} +EXPORT_SYMBOL_GPL(slow_virt_to_phys); + +/* * Set the new pmd in all the pgds we know about: */ static void __set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte) -- cgit v0.10.2 From 5dfd486c4750c9278c63fa96e6e85bdd2fb58e9d Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 22 Jan 2013 13:24:35 -0800 Subject: x86, kvm: Fix kvm's use of __pa() on percpu areas In short, it is illegal to call __pa() on an address holding a percpu variable. This replaces those __pa() calls with slow_virt_to_phys(). All of the cases in this patch are in boot time (or CPU hotplug time at worst) code, so the slow pagetable walking in slow_virt_to_phys() is not expected to have a performance impact. The times when this actually matters are pretty obscure (certain 32-bit NUMA systems), but it _does_ happen. It is important to keep KVM guests working on these systems because the real hardware is getting harder and harder to find. This bug manifested first by me seeing a plain hang at boot after this message: CPU 0 irqstacks, hard=f3018000 soft=f301a000 or, sometimes, it would actually make it out to the console: [ 0.000000] BUG: unable to handle kernel paging request at ffffffff I eventually traced it down to the KVM async pagefault code. This can be worked around by disabling that code either at compile-time, or on the kernel command-line. The kvm async pagefault code was injecting page faults in to the guest which the guest misinterpreted because its "reason" was not being properly sent from the host. The guest passes a physical address of an per-cpu async page fault structure via an MSR to the host. Since __pa() is broken on percpu data, the physical address it sent was bascially bogus and the host went scribbling on random data. The guest never saw the real reason for the page fault (it was injected by the host), assumed that the kernel had taken a _real_ page fault, and panic()'d. The behavior varied, though, depending on what got corrupted by the bad write. Signed-off-by: Dave Hansen Link: http://lkml.kernel.org/r/20130122212435.4905663F@kernel.stglabs.ibm.com Acked-by: Rik van Riel Reviewed-by: Marcelo Tosatti Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9c2bd8b..aa7e58b 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -297,9 +297,9 @@ static void kvm_register_steal_time(void) memset(st, 0, sizeof(*st)); - wrmsrl(MSR_KVM_STEAL_TIME, (__pa(st) | KVM_MSR_ENABLED)); + wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); printk(KERN_INFO "kvm-stealtime: cpu %d, msr %lx\n", - cpu, __pa(st)); + cpu, slow_virt_to_phys(st)); } static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED; @@ -324,7 +324,7 @@ void __cpuinit kvm_guest_cpu_init(void) return; if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) { - u64 pa = __pa(&__get_cpu_var(apf_reason)); + u64 pa = slow_virt_to_phys(&__get_cpu_var(apf_reason)); #ifdef CONFIG_PREEMPT pa |= KVM_ASYNC_PF_SEND_ALWAYS; @@ -340,7 +340,8 @@ void __cpuinit kvm_guest_cpu_init(void) /* Size alignment is implied but just to make it explicit. */ BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4); __get_cpu_var(kvm_apic_eoi) = 0; - pa = __pa(&__get_cpu_var(kvm_apic_eoi)) | KVM_MSR_ENABLED; + pa = slow_virt_to_phys(&__get_cpu_var(kvm_apic_eoi)) + | KVM_MSR_ENABLED; wrmsrl(MSR_KVM_PV_EOI_EN, pa); } diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 220a360..9f966dc 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -162,8 +162,8 @@ int kvm_register_clock(char *txt) int low, high, ret; struct pvclock_vcpu_time_info *src = &hv_clock[cpu].pvti; - low = (int)__pa(src) | 1; - high = ((u64)__pa(src) >> 32); + low = (int)slow_virt_to_phys(src) | 1; + high = ((u64)slow_virt_to_phys(src) >> 32); ret = native_write_msr_safe(msr_kvm_system_time, low, high); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); -- cgit v0.10.2 From 1e9209edc71b851d81f0316ca03a0e6335c0ef9a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 27 Jan 2013 01:18:21 +0100 Subject: x86/numa: Use __pa_nodebug() instead MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... and fix the following warning: arch/x86/mm/numa.c: In function ‘setup_node_data’: arch/x86/mm/numa.c:222:3: warning: passing argument 1 of ‘__phys_addr_nodebug’ makes integer from pointer without a cast Signed-off-by: Borislav Petkov Acked-by: Dave Hansen Link: http://lkml.kernel.org/r/1359245901-8512-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 76604eb..b2313c6 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -219,7 +219,7 @@ static void __init setup_node_data(int nid, u64 start, u64 end) */ nd = alloc_remap(nid, nd_size); if (nd) { - nd_pa = __phys_addr_nodebug(nd); + nd_pa = __pa_nodebug(nd); remapped = true; } else { nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); -- cgit v0.10.2 From f03574f2d5b2d6229dcdf2d322848065f72953c7 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 30 Jan 2013 16:56:16 -0800 Subject: x86-32, mm: Rip out x86_32 NUMA remapping code This code was an optimization for 32-bit NUMA systems. It has probably been the cause of a number of subtle bugs over the years, although the conditions to excite them would have been hard to trigger. Essentially, we remap part of the kernel linear mapping area, and then sometimes part of that area gets freed back in to the bootmem allocator. If those pages get used by kernel data structures (say mem_map[] or a dentry), there's no big deal. But, if anyone ever tried to use the linear mapping for these pages _and_ cared about their physical address, bad things happen. For instance, say you passed __GFP_ZERO to the page allocator and then happened to get handed one of these pages, it zero the remapped page, but it would make a pte to the _old_ page. There are probably a hundred other ways that it could screw with things. We don't need to hang on to performance optimizations for these old boxes any more. All my 32-bit NUMA systems are long dead and buried, and I probably had access to more than most people. This code is causing real things to break today: https://lkml.org/lkml/2013/1/9/376 I looked in to actually fixing this, but it requires surgery to way too much brittle code, as well as stuff like per_cpu_ptr_to_phys(). [ hpa: Cc: this for -stable, since it is a memory corruption issue. However, an alternative is to simply mark NUMA as depends BROKEN rather than EXPERIMENTAL in the X86_32 subclause... ] Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com Signed-off-by: H. Peter Anvin Cc: diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 79795af..108efcb 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1253,10 +1253,6 @@ config NODES_SHIFT Specify the maximum number of NUMA Nodes available on the target system. Increases memory reserved to accommodate various tables. -config HAVE_ARCH_ALLOC_REMAP - def_bool y - depends on X86_32 && NUMA - config ARCH_HAVE_MEMORY_PRESENT def_bool y depends on X86_32 && DISCONTIGMEM diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index b2313c6..61c2b6f 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -205,9 +205,6 @@ static void __init setup_node_data(int nid, u64 start, u64 end) if (end && (end - start) < NODE_MIN_SIZE) return; - /* initialize remap allocator before aligning to ZONE_ALIGN */ - init_alloc_remap(nid, start, end); - start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Initmem setup node %d [mem %#010Lx-%#010Lx]\n", diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 534255a..73a6d73 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -73,167 +73,6 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn, extern unsigned long highend_pfn, highstart_pfn; -#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) - -static void *node_remap_start_vaddr[MAX_NUMNODES]; -void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -/* - * Remap memory allocator - */ -static unsigned long node_remap_start_pfn[MAX_NUMNODES]; -static void *node_remap_end_vaddr[MAX_NUMNODES]; -static void *node_remap_alloc_vaddr[MAX_NUMNODES]; - -/** - * alloc_remap - Allocate remapped memory - * @nid: NUMA node to allocate memory from - * @size: The size of allocation - * - * Allocate @size bytes from the remap area of NUMA node @nid. The - * size of the remap area is predetermined by init_alloc_remap() and - * only the callers considered there should call this function. For - * more info, please read the comment on top of init_alloc_remap(). - * - * The caller must be ready to handle allocation failure from this - * function and fall back to regular memory allocator in such cases. - * - * CONTEXT: - * Single CPU early boot context. - * - * RETURNS: - * Pointer to the allocated memory on success, %NULL on failure. - */ -void *alloc_remap(int nid, unsigned long size) -{ - void *allocation = node_remap_alloc_vaddr[nid]; - - size = ALIGN(size, L1_CACHE_BYTES); - - if (!allocation || (allocation + size) > node_remap_end_vaddr[nid]) - return NULL; - - node_remap_alloc_vaddr[nid] += size; - memset(allocation, 0, size); - - return allocation; -} - -#ifdef CONFIG_HIBERNATION -/** - * resume_map_numa_kva - add KVA mapping to the temporary page tables created - * during resume from hibernation - * @pgd_base - temporary resume page directory - */ -void resume_map_numa_kva(pgd_t *pgd_base) -{ - int node; - - for_each_online_node(node) { - unsigned long start_va, start_pfn, nr_pages, pfn; - - start_va = (unsigned long)node_remap_start_vaddr[node]; - start_pfn = node_remap_start_pfn[node]; - nr_pages = (node_remap_end_vaddr[node] - - node_remap_start_vaddr[node]) >> PAGE_SHIFT; - - printk(KERN_DEBUG "%s: node %d\n", __func__, node); - - for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) { - unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); - pgd_t *pgd = pgd_base + pgd_index(vaddr); - pud_t *pud = pud_offset(pgd, vaddr); - pmd_t *pmd = pmd_offset(pud, vaddr); - - set_pmd(pmd, pfn_pmd(start_pfn + pfn, - PAGE_KERNEL_LARGE_EXEC)); - - printk(KERN_DEBUG "%s: %08lx -> pfn %08lx\n", - __func__, vaddr, start_pfn + pfn); - } - } -} -#endif - -/** - * init_alloc_remap - Initialize remap allocator for a NUMA node - * @nid: NUMA node to initizlie remap allocator for - * - * NUMA nodes may end up without any lowmem. As allocating pgdat and - * memmap on a different node with lowmem is inefficient, a special - * remap allocator is implemented which can be used by alloc_remap(). - * - * For each node, the amount of memory which will be necessary for - * pgdat and memmap is calculated and two memory areas of the size are - * allocated - one in the node and the other in lowmem; then, the area - * in the node is remapped to the lowmem area. - * - * As pgdat and memmap must be allocated in lowmem anyway, this - * doesn't waste lowmem address space; however, the actual lowmem - * which gets remapped over is wasted. The amount shouldn't be - * problematic on machines this feature will be used. - * - * Initialization failure isn't fatal. alloc_remap() is used - * opportunistically and the callers will fall back to other memory - * allocation mechanisms on failure. - */ -void __init init_alloc_remap(int nid, u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long end_pfn = end >> PAGE_SHIFT; - unsigned long size, pfn; - u64 node_pa, remap_pa; - void *remap_va; - - /* - * The acpi/srat node info can show hot-add memroy zones where - * memory could be added but not currently present. - */ - printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", - nid, start_pfn, end_pfn); - - /* calculate the necessary space aligned to large page size */ - size = node_memmap_size_bytes(nid, start_pfn, end_pfn); - size += ALIGN(sizeof(pg_data_t), PAGE_SIZE); - size = ALIGN(size, LARGE_PAGE_BYTES); - - /* allocate node memory and the lowmem remap area */ - node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); - if (!node_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", - size, nid); - return; - } - memblock_reserve(node_pa, size); - - remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, - max_low_pfn << PAGE_SHIFT, - size, LARGE_PAGE_BYTES); - if (!remap_pa) { - pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", - size, nid); - memblock_free(node_pa, size); - return; - } - memblock_reserve(remap_pa, size); - remap_va = phys_to_virt(remap_pa); - - /* perform actual remap */ - for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE) - set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT), - (node_pa >> PAGE_SHIFT) + pfn, - PAGE_KERNEL_LARGE); - - /* initialize remap allocator parameters */ - node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT; - node_remap_start_vaddr[nid] = remap_va; - node_remap_end_vaddr[nid] = remap_va + size; - node_remap_alloc_vaddr[nid] = remap_va; - - printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n", - nid, node_pa, node_pa + size, remap_va, remap_va + size); -} - void __init initmem_init(void) { x86_numa_init(); diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h index 7178c3a..ad86ec9 100644 --- a/arch/x86/mm/numa_internal.h +++ b/arch/x86/mm/numa_internal.h @@ -21,12 +21,6 @@ void __init numa_reset_distance(void); void __init x86_numa_init(void); -#ifdef CONFIG_X86_64 -static inline void init_alloc_remap(int nid, u64 start, u64 end) { } -#else -void __init init_alloc_remap(int nid, u64 start, u64 end); -#endif - #ifdef CONFIG_NUMA_EMU void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt); -- cgit v0.10.2 From bb112aec5ee41427e9b9726e3d57b896709598ed Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 31 Jan 2013 13:53:10 -0800 Subject: x86-32, mm: Remove reference to resume_map_numa_kva() Remove reference to removed function resume_map_numa_kva(). Signed-off-by: H. Peter Anvin Cc: Dave Hansen Cc: Link: http://lkml.kernel.org/r/20130131005616.1C79F411@kernel.stglabs.ibm.com diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h index eb05fb3..8a9b3e2 100644 --- a/arch/x86/include/asm/mmzone_32.h +++ b/arch/x86/include/asm/mmzone_32.h @@ -14,12 +14,6 @@ extern struct pglist_data *node_data[]; #include -extern void resume_map_numa_kva(pgd_t *pgd); - -#else /* !CONFIG_NUMA */ - -static inline void resume_map_numa_kva(pgd_t *pgd) {} - #endif /* CONFIG_NUMA */ #ifdef CONFIG_DISCONTIGMEM diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 74202c1..7d28c88 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -129,8 +129,6 @@ static int resume_physical_mapping_init(pgd_t *pgd_base) } } - resume_map_numa_kva(pgd_base); - return 0; } -- cgit v0.10.2 From 07f4207a305c834f528d08428df4531744e25678 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 31 Jan 2013 14:00:48 -0800 Subject: x86-32, mm: Remove reference to alloc_remap() We have removed the remap allocator for x86-32, and x86-64 never had it (and doesn't need it). Remove residual reference to it. Reported-by: Yinghai Lu Signed-off-by: H. Peter Anvin Cc: Dave Hansen Cc: Link: http://lkml.kernel.org/r/CAE9FiQVn6_QZi3fNQ-JHYiR-7jeDJ5hT0SyT_%2BzVvfOj=PzF3w@mail.gmail.com diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 61c2b6f..8504f36 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -193,7 +193,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) static void __init setup_node_data(int nid, u64 start, u64 end) { const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); - bool remapped = false; u64 nd_pa; void *nd; int tnid; @@ -211,28 +210,22 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nid, start, end - 1); /* - * Allocate node data. Try remap allocator first, node-local - * memory and then any node. Never allocate in DMA zone. + * Allocate node data. Try node-local memory and then any node. + * Never allocate in DMA zone. */ - nd = alloc_remap(nid, nd_size); - if (nd) { - nd_pa = __pa_nodebug(nd); - remapped = true; - } else { - nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); - if (!nd_pa) { - pr_err("Cannot find %zu bytes in node %d\n", - nd_size, nid); - return; - } - nd = __va(nd_pa); + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { + pr_err("Cannot find %zu bytes in node %d\n", + nd_size, nid); + return; } + nd = __va(nd_pa); /* report and initialize */ - printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]%s\n", - nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : ""); + printk(KERN_INFO " NODE_DATA [mem %#010Lx-%#010Lx]\n", + nd_pa, nd_pa + nd_size - 1); tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT); - if (!remapped && tnid != nid) + if (tnid != nid) printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid); node_data[nid] = nd; -- cgit v0.10.2