From 5aac644a9944bea93b4f05ced1883a902a2535f6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 3 Jun 2015 10:39:46 +0300 Subject: x86/tsc: Let high latency PIT fail fast in quick_pit_calibrate() If it takes longer than 12us to read the PIT counter lsb/msb, then the error margin will never fall below 500ppm within 50ms, and Fast TSC calibration will always fail. This patch detects when that will happen and fails fast. Note the failure message is not printed in that case because: 1. it will always happen on that class of hardware 2. the absence of the message is more informative than its presence Signed-off-by: Adrian Hunter Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Len Brown Cc: Andi Kleen Link: http://lkml.kernel.org/r/556EB717.9070607@intel.com Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 5054497..7437b41 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -598,10 +598,19 @@ static unsigned long quick_pit_calibrate(void) if (!pit_expect_msb(0xff-i, &delta, &d2)) break; + delta -= tsc; + + /* + * Extrapolate the error and fail fast if the error will + * never be below 500 ppm. + */ + if (i == 1 && + d1 + d2 >= (delta * MAX_QUICK_PIT_ITERATIONS) >> 11) + return 0; + /* * Iterate until the error is less than 500 ppm */ - delta -= tsc; if (d1+d2 >= delta >> 11) continue; -- cgit v0.10.2 From d0f77d4d04b222a817925d33ba3589b190bfa863 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:33 +0300 Subject: x86/init: Clear 'init_level4_pgt' earlier Currently x86_64_start_kernel() has two KASAN related function calls. The first call maps shadow to early_level4_pgt, the second maps shadow to init_level4_pgt. If we move clear_page(init_level4_pgt) earlier, we could hide KASAN low level detail from generic x86_64 initialization code. The next patch will do it. Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-2-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5a46681..65c8985 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -166,6 +166,8 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); + clear_page(init_level4_pgt); + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); @@ -177,7 +179,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - clear_page(init_level4_pgt); /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; -- cgit v0.10.2 From 5d5aa3cfca5cf74cd928daf3674642e6004328d1 Mon Sep 17 00:00:00 2001 From: Alexander Popov Date: Thu, 2 Jul 2015 12:09:34 +0300 Subject: x86/kasan: Fix KASAN shadow region page tables Currently KASAN shadow region page tables created without respect of physical offset (phys_base). This causes kernel halt when phys_base is not zero. So let's initialize KASAN shadow region page tables in kasan_early_init() using __pa_nodebug() which considers phys_base. This patch also separates x86_64_start_kernel() from KASAN low level details by moving kasan_map_early_shadow(init_level4_pgt) into kasan_early_init(). Remove the comment before clear_bss() which stopped bringing much profit to the code readability. Otherwise describing all the new order dependencies would be too verbose. Signed-off-by: Alexander Popov Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-3-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/kasan.h b/arch/x86/include/asm/kasan.h index 8b22422..74a2a8d 100644 --- a/arch/x86/include/asm/kasan.h +++ b/arch/x86/include/asm/kasan.h @@ -14,15 +14,11 @@ #ifndef __ASSEMBLY__ -extern pte_t kasan_zero_pte[]; -extern pte_t kasan_zero_pmd[]; -extern pte_t kasan_zero_pud[]; - #ifdef CONFIG_KASAN -void __init kasan_map_early_shadow(pgd_t *pgd); +void __init kasan_early_init(void); void __init kasan_init(void); #else -static inline void kasan_map_early_shadow(pgd_t *pgd) { } +static inline void kasan_early_init(void) { } static inline void kasan_init(void) { } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 65c8985..f129a9a 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -161,13 +161,12 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* Kill off the identity-map trampoline */ reset_early_page_tables(); - kasan_map_early_shadow(early_level4_pgt); - - /* clear bss before set_intr_gate with early_idt_handler */ clear_bss(); clear_page(init_level4_pgt); + kasan_early_init(); + for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) set_intr_gate(i, early_idt_handler_array[i]); load_idt((const struct desc_ptr *)&idt_descr); @@ -182,8 +181,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; - kasan_map_early_shadow(init_level4_pgt); - x86_64_start_reservations(real_mode_data); } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e5c27f7..1d40ca8 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -516,38 +516,9 @@ ENTRY(phys_base) /* This must match the first entry in level2_kernel_pgt */ .quad 0x0000000000000000 -#ifdef CONFIG_KASAN -#define FILL(VAL, COUNT) \ - .rept (COUNT) ; \ - .quad (VAL) ; \ - .endr - -NEXT_PAGE(kasan_zero_pte) - FILL(kasan_zero_page - __START_KERNEL_map + _KERNPG_TABLE, 512) -NEXT_PAGE(kasan_zero_pmd) - FILL(kasan_zero_pte - __START_KERNEL_map + _KERNPG_TABLE, 512) -NEXT_PAGE(kasan_zero_pud) - FILL(kasan_zero_pmd - __START_KERNEL_map + _KERNPG_TABLE, 512) - -#undef FILL -#endif - - #include "../../x86/xen/xen-head.S" __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE -#ifdef CONFIG_KASAN -/* - * This page used as early shadow. We don't use empty_zero_page - * at early stages, stack instrumentation could write some garbage - * to this page. - * Latter we reuse it as zero shadow for large ranges of memory - * that allowed to access, but not instrumented by kasan - * (vmalloc/vmemmap ...). - */ -NEXT_PAGE(kasan_zero_page) - .skip PAGE_SIZE -#endif diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 4860906..0e4a05f 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -11,7 +11,19 @@ extern pgd_t early_level4_pgt[PTRS_PER_PGD]; extern struct range pfn_mapped[E820_X_MAX]; -extern unsigned char kasan_zero_page[PAGE_SIZE]; +static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss; +static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss; +static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss; + +/* + * This page used as early shadow. We don't use empty_zero_page + * at early stages, stack instrumentation could write some garbage + * to this page. + * Latter we reuse it as zero shadow for large ranges of memory + * that allowed to access, but not instrumented by kasan + * (vmalloc/vmemmap ...). + */ +static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss; static int __init map_range(struct range *range) { @@ -36,7 +48,7 @@ static void __init clear_pgds(unsigned long start, pgd_clear(pgd_offset_k(start)); } -void __init kasan_map_early_shadow(pgd_t *pgd) +static void __init kasan_map_early_shadow(pgd_t *pgd) { int i; unsigned long start = KASAN_SHADOW_START; @@ -166,6 +178,26 @@ static struct notifier_block kasan_die_notifier = { }; #endif +void __init kasan_early_init(void) +{ + int i; + pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL; + pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE; + pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE; + + for (i = 0; i < PTRS_PER_PTE; i++) + kasan_zero_pte[i] = __pte(pte_val); + + for (i = 0; i < PTRS_PER_PMD; i++) + kasan_zero_pmd[i] = __pmd(pmd_val); + + for (i = 0; i < PTRS_PER_PUD; i++) + kasan_zero_pud[i] = __pud(pud_val); + + kasan_map_early_shadow(early_level4_pgt); + kasan_map_early_shadow(init_level4_pgt); +} + void __init kasan_init(void) { int i; -- cgit v0.10.2 From 241d2c54c62fa0939fc9a9512b48ac3434e90a89 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:35 +0300 Subject: x86/kasan: Flush TLBs after switching CR3 load_cr3() doesn't cause tlb_flush if PGE enabled. This may cause tons of false positive reports spamming the kernel to death. To fix this __flush_tlb_all() should be called explicitly after CR3 changed. Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-4-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0e4a05f..5d26642 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -208,6 +208,7 @@ void __init kasan_init(void) memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); load_cr3(early_level4_pgt); + __flush_tlb_all(); clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); @@ -234,5 +235,6 @@ void __init kasan_init(void) memset(kasan_zero_page, 0, PAGE_SIZE); load_cr3(init_level4_pgt); + __flush_tlb_all(); init_task.kasan_depth = 0; } -- cgit v0.10.2 From d4f86beacc21d538dc41e1fc75a22e084f547edf Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:36 +0300 Subject: x86/kasan: Fix boot crash on AMD processors While populating zero shadow wrong bits in upper level page tables used. __PAGE_KERNEL_RO that was used for pgd/pud/pmd has _PAGE_BIT_GLOBAL set. Global bit is present only in the lowest level of the page translation hierarchy (ptes), and it should be zero in upper levels. This bug seems doesn't cause any troubles on Intel cpus, while on AMDs it cause kernel crash on boot. Use _KERNPG_TABLE bits for pgds/puds/pmds to fix this. Reported-by: Borislav Petkov Signed-off-by: Andrey Ryabinin Cc: # 4.0+ Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-5-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 5d26642..9a54dbe 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -85,7 +85,7 @@ static int __init zero_pmd_populate(pud_t *pud, unsigned long addr, while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) { WARN_ON(!pmd_none(*pmd)); set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PMD_SIZE; pmd = pmd_offset(pud, addr); } @@ -111,7 +111,7 @@ static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr, while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) { WARN_ON(!pud_none(*pud)); set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PUD_SIZE; pud = pud_offset(pgd, addr); } @@ -136,7 +136,7 @@ static int __init zero_pgd_populate(unsigned long addr, unsigned long end) while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) { WARN_ON(!pgd_none(*pgd)); set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud) - | __PAGE_KERNEL_RO)); + | _KERNPG_TABLE)); addr += PGDIR_SIZE; pgd = pgd_offset_k(addr); } -- cgit v0.10.2 From 8515522949951d81fe2d06c0a3292f171f2b8ec4 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:37 +0300 Subject: x86/kasan: Add message about KASAN being initialized Print informational message to tell user that kernel runs with KASAN enabled. Add a "kasan: " prefix to all messages in kasan_init_64.c. Signed-off-by: Andrey Ryabinin Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-6-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 9a54dbe..e1840f3 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -1,3 +1,4 @@ +#define pr_fmt(fmt) "kasan: " fmt #include #include #include @@ -237,4 +238,6 @@ void __init kasan_init(void) load_cr3(init_level4_pgt); __flush_tlb_all(); init_task.kasan_depth = 0; + + pr_info("Kernel address sanitizer initialized\n"); } -- cgit v0.10.2 From d6f2d75a7ae06ffd793bb504c4f0d1665548cffc Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Thu, 2 Jul 2015 12:09:38 +0300 Subject: x86/kasan: Move KASAN_SHADOW_OFFSET to the arch Kconfig KASAN_SHADOW_OFFSET is purely arch specific setting, so it should be in arch's Kconfig file. Signed-off-by: Andrey Ryabinin Cc: Alexander Popov Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Linus Torvalds Cc: Paul Bolle Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1435828178-10975-7-git-send-email-a.ryabinin@samsung.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 55bced1..0b2929f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -254,6 +254,11 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING config ARCH_SUPPORTS_DEBUG_PAGEALLOC def_bool y +config KASAN_SHADOW_OFFSET + hex + depends on KASAN + default 0xdffffc0000000000 + config HAVE_INTEL_TXT def_bool y depends on INTEL_IOMMU && ACPI diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index 777eda7..39f24d6 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -18,10 +18,6 @@ config KASAN For better error detection enable CONFIG_STACKTRACE, and add slub_debug=U to boot cmdline. -config KASAN_SHADOW_OFFSET - hex - default 0xdffffc0000000000 if X86_64 - choice prompt "Instrumentation type" depends on KASAN -- cgit v0.10.2 From 1db875631f8d5bbf497f67e47f254eece888d51d Mon Sep 17 00:00:00 2001 From: Zhu Guihua Date: Fri, 3 Jul 2015 17:37:18 +0800 Subject: x86/espfix: Add 'cpu' parameter to init_espfix_ap() Add a CPU index parameter to init_espfix_ap(), so that the parameter could be propagated to the function for espfix page allocation. Signed-off-by: Zhu Guihua Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cde3fcf1b3211f3f03feb1a995bce3fee850f0fc.1435824469.git.zhugh.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/espfix.h b/arch/x86/include/asm/espfix.h index 99efebb..ca3ce9a 100644 --- a/arch/x86/include/asm/espfix.h +++ b/arch/x86/include/asm/espfix.h @@ -9,7 +9,7 @@ DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_stack); DECLARE_PER_CPU_READ_MOSTLY(unsigned long, espfix_waddr); extern void init_espfix_bsp(void); -extern void init_espfix_ap(void); +extern void init_espfix_ap(int cpu); #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index f5d0730..1fb0e00 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -131,12 +131,12 @@ void __init init_espfix_bsp(void) init_espfix_random(); /* The rest is the same as for any other processor */ - init_espfix_ap(); + init_espfix_ap(0); } -void init_espfix_ap(void) +void init_espfix_ap(int cpu) { - unsigned int cpu, page; + unsigned int page; unsigned long addr; pud_t pud, *pud_p; pmd_t pmd, *pmd_p; @@ -149,7 +149,6 @@ void init_espfix_ap(void) if (likely(this_cpu_read(espfix_stack))) return; /* Already initialized */ - cpu = smp_processor_id(); addr = espfix_base_addr(cpu); page = cpu/ESPFIX_STACKS_PER_PAGE; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 8add66b..6f5abac 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -242,7 +242,7 @@ static void notrace start_secondary(void *unused) * Enable the espfix hack for this CPU */ #ifdef CONFIG_X86_ESPFIX64 - init_espfix_ap(); + init_espfix_ap(smp_processor_id()); #endif /* -- cgit v0.10.2 From 20d5e4a9cd453991e2590a4c25230a99b42dee0c Mon Sep 17 00:00:00 2001 From: Zhu Guihua Date: Fri, 3 Jul 2015 17:37:19 +0800 Subject: x86/espfix: Init espfix on the boot CPU side As we alloc pages with GFP_KERNEL in init_espfix_ap() which is called before we enable local irqs, so the lockdep sub-system would (correctly) trigger a warning about the potentially blocking API. So we allocate them on the boot CPU side when the secondary CPU is brought up by the boot CPU, and hand them over to the secondary CPU. And we use alloc_pages_node() with the secondary CPU's node, to make sure the espfix stack is NUMA-local to the CPU that is going to use it. Signed-off-by: Zhu Guihua Cc: Cc: Cc: Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c97add2670e9abebb90095369f0cfc172373ac94.1435824469.git.zhugh.fnst@cn.fujitsu.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 1fb0e00..ce95676 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -141,12 +141,12 @@ void init_espfix_ap(int cpu) pud_t pud, *pud_p; pmd_t pmd, *pmd_p; pte_t pte, *pte_p; - int n; + int n, node; void *stack_page; pteval_t ptemask; /* We only have to do this once... */ - if (likely(this_cpu_read(espfix_stack))) + if (likely(per_cpu(espfix_stack, cpu))) return; /* Already initialized */ addr = espfix_base_addr(cpu); @@ -164,12 +164,15 @@ void init_espfix_ap(int cpu) if (stack_page) goto unlock_done; + node = cpu_to_node(cpu); ptemask = __supported_pte_mask; pud_p = &espfix_pud_page[pud_index(addr)]; pud = *pud_p; if (!pud_present(pud)) { - pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pmd_p = (pmd_t *)page_address(page); pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PUD_CLONES; n++) @@ -179,7 +182,9 @@ void init_espfix_ap(int cpu) pmd_p = pmd_offset(&pud, addr); pmd = *pmd_p; if (!pmd_present(pmd)) { - pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); + struct page *page = alloc_pages_node(node, PGALLOC_GFP, 0); + + pte_p = (pte_t *)page_address(page); pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PMD_CLONES; n++) @@ -187,7 +192,7 @@ void init_espfix_ap(int cpu) } pte_p = pte_offset_kernel(&pmd, addr); - stack_page = (void *)__get_free_page(GFP_KERNEL); + stack_page = page_address(alloc_pages_node(node, GFP_KERNEL, 0)); pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); @@ -198,7 +203,7 @@ void init_espfix_ap(int cpu) unlock_done: mutex_unlock(&espfix_init_mutex); done: - this_cpu_write(espfix_stack, addr); - this_cpu_write(espfix_waddr, (unsigned long)stack_page - + (addr & ~PAGE_MASK)); + per_cpu(espfix_stack, cpu) = addr; + per_cpu(espfix_waddr, cpu) = (unsigned long)stack_page + + (addr & ~PAGE_MASK); } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 6f5abac..0bd8c1d 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -239,13 +239,6 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* - * Enable the espfix hack for this CPU - */ -#ifdef CONFIG_X86_ESPFIX64 - init_espfix_ap(smp_processor_id()); -#endif - - /* * We need to hold vector_lock so there the set of online cpus * does not change while we are assigning vectors to cpus. Holding * this lock ensures we don't half assign or remove an irq from a cpu. @@ -854,6 +847,13 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; + /* + * Enable the espfix hack for this CPU + */ +#ifdef CONFIG_X86_ESPFIX64 + init_espfix_ap(cpu); +#endif + /* So we see what's up */ announce_cpu(cpu, apicid); -- cgit v0.10.2 From 827a82ff399523a954253dfea401af748640f0f4 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Jul 2015 10:14:34 -0400 Subject: x86/earlyprintk: Allow early_printk() to use console style parameters like '115200n8' When I enable early_printk on a kernel, I cut and paste the console= input and add to earlyprintk parameter. But I notice recently that ktest has not been detecting triple faults. The way it detects it, is by seeing the kernel banner "Linux version .." with a different kernel version pop up. Then I noticed that early printk was no longer working on my console, which was why ktest was not seeing it. I bisected it down and it was added to 4.0 with this commit: ea9e9d802902 ("Specify PCI based UART for earlyprintk") because it converted the simple_strtoul() that converts the baud number into a kstrtoul(). The problem with this is, I had as my baud rate, 115200n8 (acceptable for console=ttyS0), but because of the "n8", the kstrtoul() doesn't parse the baud rate and returns an error, which sets the baud rate to the default 9600. This explains the garbage on my screen. Now, earlyprintk= kernel parameter does not say it accepts that format. Thus, one answer would simply be me changing my kernel parameters to remove the "n8" since it isn't parsed anyway. But I wonder if other people run into this, and it seems strange that the two consoles for serial accepts different input. I could also extend this to have earlyprintk do something with that "n8" or whatever it has and have it match the console parsing (which, BTW, still uses simple_strtoul(), as I guess it has to). This patch just makes my old kernel parameter parsing work like it use to. Although, simple_strtoull() is considered obsolete, it is the only standard string parsing function that parses a number that is attached to text. Ironically, commit ea9e9d802902 also added several calls to simple_strtoul()! Signed-off-by: Steven Rostedt Cc: Alan Cox Cc: Andy Lutomirski Cc: Andy Shevchenko Cc: Borislav Petkov Cc: Brian Gerst Cc: David Cohen Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stuart R. Anderson Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150706101434.5f6a351b@gandalf.local.home [ Cleaned it up a bit. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 89427d8..eec40f5 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -175,7 +175,9 @@ static __init void early_serial_init(char *s) } if (*s) { - if (kstrtoul(s, 0, &baud) < 0 || baud == 0) + baud = simple_strtoull(s, &e, 0); + + if (baud == 0 || s == e) baud = DEFAULT_BAUD; } -- cgit v0.10.2 From 5a3f75e3f02836518ce49536e9c460ca8e1fa290 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:32 +0000 Subject: x86/irq: Plug irq vector hotplug race Jin debugged a nasty cpu hotplug race which results in leaking a irq vector on the newly hotplugged cpu. cpu N cpu M native_cpu_up device_shutdown do_boot_cpu free_msi_irqs start_secondary arch_teardown_msi_irqs smp_callin default_teardown_msi_irqs setup_vector_irq arch_teardown_msi_irq __setup_vector_irq native_teardown_msi_irq lock(vector_lock) destroy_irq install vectors unlock(vector_lock) lock(vector_lock) ---> __clear_irq_vector unlock(vector_lock) lock(vector_lock) set_cpu_online unlock(vector_lock) This leaves the irq vector(s) which are torn down on CPU M stale in the vector array of CPU N, because CPU M does not see CPU N online yet. There is a similar issue with concurrent newly setup interrupts. The alloc/free protection of irq descriptors does not prevent the above race, because it merily prevents interrupt descriptors from going away or changing concurrently. Prevent this by moving the call to setup_vector_irq() into the vector_lock held region which protects set_cpu_online(): cpu N cpu M native_cpu_up device_shutdown do_boot_cpu free_msi_irqs start_secondary arch_teardown_msi_irqs smp_callin default_teardown_msi_irqs lock(vector_lock) arch_teardown_msi_irq setup_vector_irq() __setup_vector_irq native_teardown_msi_irq install vectors destroy_irq set_cpu_online unlock(vector_lock) lock(vector_lock) __clear_irq_vector unlock(vector_lock) So cpu M either sees the cpu N online before clearing the vector or cpu N installs the vectors after cpu M has cleared it. Reported-by: xiao jin Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.141898931@linutronix.de diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 28eba2d..f813261 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -409,12 +409,6 @@ static void __setup_vector_irq(int cpu) int irq, vector; struct apic_chip_data *data; - /* - * vector_lock will make sure that we don't run into irq vector - * assignments that might be happening on another cpu in parallel, - * while we setup our initial vector to irq mappings. - */ - raw_spin_lock(&vector_lock); /* Mark the inuse vectors */ for_each_active_irq(irq) { data = apic_chip_data(irq_get_irq_data(irq)); @@ -436,16 +430,16 @@ static void __setup_vector_irq(int cpu) if (!cpumask_test_cpu(cpu, data->domain)) per_cpu(vector_irq, cpu)[vector] = VECTOR_UNDEFINED; } - raw_spin_unlock(&vector_lock); } /* - * Setup the vector to irq mappings. + * Setup the vector to irq mappings. Must be called with vector_lock held. */ void setup_vector_irq(int cpu) { int irq; + lockdep_assert_held(&vector_lock); /* * On most of the platforms, legacy PIC delivers the interrupts on the * boot cpu. But there are certain platforms where PIC interrupts are diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 0bd8c1d..d3010aa 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -171,11 +171,6 @@ static void smp_callin(void) apic_ap_setup(); /* - * Need to setup vector mappings before we enable interrupts. - */ - setup_vector_irq(smp_processor_id()); - - /* * Save our processor parameters. Note: this information * is needed for clock calibration. */ @@ -239,11 +234,13 @@ static void notrace start_secondary(void *unused) check_tsc_sync_target(); /* - * We need to hold vector_lock so there the set of online cpus - * does not change while we are assigning vectors to cpus. Holding - * this lock ensures we don't half assign or remove an irq from a cpu. + * Lock vector_lock and initialize the vectors on this cpu + * before setting the cpu online. We must set it online with + * vector_lock held to prevent a concurrent setup/teardown + * from seeing a half valid vector space. */ lock_vector_lock(); + setup_vector_irq(smp_processor_id()); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); cpu_set_state_online(smp_processor_id()); -- cgit v0.10.2 From cbb24dc761d95fe39a7a122bb1b298e9604cae15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:33 +0000 Subject: x86/irq: Use proper locking in check_irq_vectors_for_cpu_disable() It's unsafe to examine fields in the irq descriptor w/o holding the descriptor lock. Add proper locking. While at it add a comment why the vector check can run lock less Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.236544164@linutronix.de diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 88b36648..85ca76e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -347,14 +347,22 @@ int check_irq_vectors_for_cpu_disable(void) if (!desc) continue; + /* + * Protect against concurrent action removal, + * affinity changes etc. + */ + raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); cpumask_clear_cpu(this_cpu, &affinity_new); /* Do not count inactive or per-cpu irqs. */ - if (!irq_has_action(irq) || irqd_is_per_cpu(data)) + if (!irq_has_action(irq) || irqd_is_per_cpu(data)) { + raw_spin_unlock(&desc->lock); continue; + } + raw_spin_unlock(&desc->lock); /* * A single irq may be mapped to multiple * cpu's vector_irq[] (for example IOAPIC cluster @@ -385,6 +393,9 @@ int check_irq_vectors_for_cpu_disable(void) * vector. If the vector is marked in the used vectors * bitmap or an irq is assigned to it, we don't count * it as available. + * + * As this is an inaccurate snapshot anyway, we can do + * this w/o holding vector_lock. */ for (vector = FIRST_EXTERNAL_VECTOR; vector < first_system_vector; vector++) { -- cgit v0.10.2 From 09cf92b784fae6109450c5d64f9908066d605249 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 5 Jul 2015 17:12:35 +0000 Subject: x86/irq: Retrieve irq data after locking irq_desc irq_data is protected by irq_desc->lock, so retrieving the irq chip from irq_data outside the lock is racy vs. an concurrent update. Move it into the lock held region. While at it add a comment why the vector walk does not require vector_lock. Signed-off-by: Thomas Gleixner Cc: Peter Zijlstra Cc: xiao jin Cc: Joerg Roedel Cc: Borislav Petkov Cc: Yanmin Zhang Link: http://lkml.kernel.org/r/20150705171102.331320612@linutronix.de diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 85ca76e..c7dfe1b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -497,6 +497,11 @@ void fixup_irqs(void) */ mdelay(1); + /* + * We can walk the vector array of this cpu without holding + * vector_lock because the cpu is already marked !online, so + * nothing else will touch it. + */ for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { unsigned int irr; @@ -508,9 +513,9 @@ void fixup_irqs(void) irq = __this_cpu_read(vector_irq[vector]); desc = irq_to_desc(irq); + raw_spin_lock(&desc->lock); data = irq_desc_get_irq_data(desc); chip = irq_data_get_irq_chip(data); - raw_spin_lock(&desc->lock); if (chip->irq_retrigger) { chip->irq_retrigger(data); __this_cpu_write(vector_irq[vector], VECTOR_RETRIGGERED); -- cgit v0.10.2 From 69711ca19b06d1b33d8f21213b540b5d1c638dbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Hinderer?= Date: Wed, 8 Jul 2015 00:02:01 +0200 Subject: x86/kconfig: Fix typo in the CONFIG_CMDLINE_BOOL help text MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Sébastien Hinderer Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Samuel Thibault Cc: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0b2929f..3dbb7e7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2020,7 +2020,7 @@ config CMDLINE_BOOL To compile command line arguments into the kernel, set this option to 'Y', then fill in the - the boot arguments in CONFIG_CMDLINE. + boot arguments in CONFIG_CMDLINE. Systems with fully functional boot loaders (i.e. non-embedded) should leave this option set to 'N'. -- cgit v0.10.2