From b3830e8d478cd9fe33e820425ce431c8ef280967 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 1 Aug 2016 12:05:02 +0200 Subject: x86/entry: Remove duplicated comment Ok, ok, we see it is called from C :-) Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160801100502.29796-1-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index b846875..8956eae 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -347,8 +347,7 @@ ENTRY(stub_ptregs_64) jmp entry_SYSCALL64_slow_path 1: - /* Called from C */ - jmp *%rax /* called from C */ + jmp *%rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func -- cgit v0.10.2 From 404f6aac9b3ef595735feca99979db084ea48315 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 8 Aug 2016 16:29:06 -0700 Subject: x86: Apply more __ro_after_init and const Guided by grsecurity's analogous __read_only markings in arch/x86, this applies several uses of __ro_after_init to structures that are only updated during __init, and const for some structures that are never updated. Additionally extends __init markings to some functions that are only used during __init, and cleans up some missing C99 style static initializers. Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brad Spengler Cc: Brian Gerst Cc: David Brown Cc: Denys Vlasenko Cc: Emese Revfy Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Mathias Krause Cc: Michael Ellerman Cc: PaX Team Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/20160808232906.GA29731@www.outflux.net Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 4e10d73..12080d8 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -36,7 +36,7 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; -extern struct desc_ptr debug_idt_descr; +extern const struct desc_ptr debug_idt_descr; extern gate_desc debug_idt_table[]; struct gdt_page { diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h index ae55a43..d4957ac 100644 --- a/arch/x86/include/asm/fpu/xstate.h +++ b/arch/x86/include/asm/fpu/xstate.h @@ -45,7 +45,8 @@ extern u64 xfeatures_mask; extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask); +extern void __init update_regset_xstate_info(unsigned int size, + u64 xstate_mask); void fpu__xstate_clear_all_cpu_caps(void); void *get_xsave_addr(struct xregs_state *xsave, int xstate); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 5b2ae10..8862da7 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -25,7 +25,7 @@ static struct apic apic_physflat; static struct apic apic_flat; -struct apic __read_mostly *apic = &apic_flat; +struct apic *apic __ro_after_init = &apic_flat; EXPORT_SYMBOL_GPL(apic); static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) @@ -154,7 +154,7 @@ static int flat_probe(void) return 1; } -static struct apic apic_flat = { +static struct apic 
apic_flat __ro_after_init = { .name = "flat", .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, @@ -248,7 +248,7 @@ static int physflat_probe(void) return 0; } -static struct apic apic_physflat = { +static struct apic apic_physflat __ro_after_init = { .name = "physical flat", .probe = physflat_probe, diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c index c05688b..b109e43 100644 --- a/arch/x86/kernel/apic/apic_noop.c +++ b/arch/x86/kernel/apic/apic_noop.c @@ -108,7 +108,7 @@ static void noop_apic_write(u32 reg, u32 v) WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_APIC) && !disable_apic); } -struct apic apic_noop = { +struct apic apic_noop __ro_after_init = { .name = "noop", .probe = noop_probe, .acpi_madt_oem_check = NULL, diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index 06dbaa4..5601201 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -142,7 +142,7 @@ static int probe_bigsmp(void) return dmi_bigsmp; } -static struct apic apic_bigsmp = { +static struct apic apic_bigsmp __ro_after_init = { .name = "bigsmp", .probe = probe_bigsmp, diff --git a/arch/x86/kernel/apic/msi.c b/arch/x86/kernel/apic/msi.c index ade2532..015bbf3 100644 --- a/arch/x86/kernel/apic/msi.c +++ b/arch/x86/kernel/apic/msi.c @@ -269,7 +269,7 @@ static void hpet_msi_write_msg(struct irq_data *data, struct msi_msg *msg) hpet_msi_write(irq_data_get_irq_handler_data(data), msg); } -static struct irq_chip hpet_msi_controller = { +static struct irq_chip hpet_msi_controller __ro_after_init = { .name = "HPET-MSI", .irq_unmask = hpet_msi_unmask, .irq_mask = hpet_msi_mask, diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 7c43e71..e5fb2f0 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -72,7 +72,7 @@ static int probe_default(void) return 1; } -static struct apic apic_default = { +static struct apic apic_default __ro_after_init = { .name = "default", .probe = probe_default, @@ -126,7 +126,7 @@ static struct apic apic_default = { apic_driver(apic_default); -struct apic *apic = &apic_default; +struct apic *apic __ro_after_init = &apic_default; EXPORT_SYMBOL_GPL(apic); static int cmdline_apic __initdata; diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 6368fa6..766bdef 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -222,7 +222,7 @@ static void cluster_vector_allocation_domain(int cpu, struct cpumask *retmask, cpumask_and(retmask, mask, per_cpu(cpus_in_cluster, cpu)); } -static struct apic apic_x2apic_cluster = { +static struct apic apic_x2apic_cluster __ro_after_init = { .name = "cluster x2apic", .probe = x2apic_cluster_probe, diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index 4f13f54f..ff111f0 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -98,7 +98,7 @@ static int x2apic_phys_probe(void) return apic == &apic_x2apic_phys; } -static struct apic apic_x2apic_phys = { +static struct apic apic_x2apic_phys __ro_after_init = { .name = "physical x2apic", .probe = x2apic_phys_probe, diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 09b59ad..ed887de 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -554,7 +554,7 @@ static int uv_probe(void) return apic == &apic_x2apic_uv_x; } -static struct apic 
__refdata apic_x2apic_uv_x = { +static struct apic apic_x2apic_uv_x __ro_after_init = { .name = "UV large system", .probe = uv_probe, diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 809eda0..d3b91be 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1265,9 +1265,14 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 -struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; -struct desc_ptr debug_idt_descr = { NR_VECTORS * 16 - 1, - (unsigned long) debug_idt_table }; +struct desc_ptr idt_descr __ro_after_init = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) idt_table, +}; +const struct desc_ptr debug_idt_descr = { + .size = NR_VECTORS * 16 - 1, + .address = (unsigned long) debug_idt_table, +}; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 28f1b54..24e87e7 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -72,14 +72,14 @@ static DEFINE_MUTEX(mtrr_mutex); u64 size_or_mask, size_and_mask; static bool mtrr_aps_delayed_init; -static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; +static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM] __ro_after_init; const struct mtrr_ops *mtrr_if; static void set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type type); -void set_mtrr_ops(const struct mtrr_ops *ops) +void __init set_mtrr_ops(const struct mtrr_ops *ops) { if (ops->vendor && ops->vendor < X86_VENDOR_NUM) mtrr_ops[ops->vendor] = ops; diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 6c7ced0..ad8bd76 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -54,7 +54,7 @@ void fill_mtrr_var_range(unsigned int index, bool get_mtrr_state(void); void mtrr_bp_pat_init(void); -extern void set_mtrr_ops(const struct mtrr_ops *ops); +extern void __init set_mtrr_ops(const struct mtrr_ops *ops); extern u64 size_or_mask, size_and_mask; extern const struct mtrr_ops *mtrr_if; diff --git a/arch/x86/kernel/ksysfs.c b/arch/x86/kernel/ksysfs.c index c2bedae..4afc67f 100644 --- a/arch/x86/kernel/ksysfs.c +++ b/arch/x86/kernel/ksysfs.c @@ -184,7 +184,7 @@ out: static struct kobj_attribute type_attr = __ATTR_RO(type); -static struct bin_attribute data_attr = { +static struct bin_attribute data_attr __ro_after_init = { .attr = { .name = "data", .mode = S_IRUGO, diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfb..0964399 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -29,7 +29,7 @@ #include #include -static int kvmclock = 1; +static int kvmclock __ro_after_init = 1; static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static cycle_t kvm_sched_clock_offset; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ad5bc95..b8e4680 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -389,7 +389,7 @@ NOKPROBE_SYMBOL(native_load_idt); #define PTE_IDENT __PV_IS_CALLEE_SAVE(_paravirt_ident_64) #endif -struct pv_mmu_ops pv_mmu_ops = { +struct pv_mmu_ops pv_mmu_ops __ro_after_init = { .read_cr2 = native_read_cr2, .write_cr2 = native_write_cr2, diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f79576a..2537cfb 100644 --- 
a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1250,7 +1250,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, #ifdef CONFIG_X86_64 -static struct user_regset x86_64_regsets[] __read_mostly = { +static struct user_regset x86_64_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct) / sizeof(long), @@ -1291,7 +1291,7 @@ static const struct user_regset_view user_x86_64_view = { #endif /* CONFIG_X86_64 */ #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION -static struct user_regset x86_32_regsets[] __read_mostly = { +static struct user_regset x86_32_regsets[] __ro_after_init = { [REGSET_GENERAL] = { .core_note_type = NT_PRSTATUS, .n = sizeof(struct user_regs_struct32) / sizeof(u32), @@ -1344,7 +1344,7 @@ static const struct user_regset_view user_x86_32_view = { */ u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS]; -void update_regset_xstate_info(unsigned int size, u64 xstate_mask) +void __init update_regset_xstate_info(unsigned int size, u64 xstate_mask) { #ifdef CONFIG_X86_64 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64); diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 63bf27d..e244c19 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -705,7 +705,7 @@ static void native_machine_power_off(void) tboot_shutdown(TB_SHUTDOWN_HALT); } -struct machine_ops machine_ops = { +struct machine_ops machine_ops __ro_after_init = { .power_off = native_machine_power_off, .shutdown = native_machine_shutdown, .emergency_restart = native_machine_emergency_restart, diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 95cf31c..2d98798 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -210,9 +210,9 @@ EXPORT_SYMBOL(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) -__visible unsigned long mmu_cr4_features; +__visible unsigned long mmu_cr4_features __ro_after_init; #else -__visible unsigned long mmu_cr4_features = X86_CR4_PAE; +__visible unsigned long mmu_cr4_features __ro_after_init = X86_CR4_PAE; #endif /* Boot loader ID and version as integers, for the benefit of proc_dointvec */ diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7a40e06..1d5c794 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -33,7 +33,7 @@ EXPORT_PER_CPU_SYMBOL(cpu_number); DEFINE_PER_CPU_READ_MOSTLY(unsigned long, this_cpu_off) = BOOT_PERCPU_OFFSET; EXPORT_PER_CPU_SYMBOL(this_cpu_off); -unsigned long __per_cpu_offset[NR_CPUS] __read_mostly = { +unsigned long __per_cpu_offset[NR_CPUS] __ro_after_init = { [0 ... 
NR_CPUS-1] = BOOT_PERCPU_OFFSET, }; EXPORT_SYMBOL(__per_cpu_offset); diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 76c5e52..0bd9f12 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,7 +91,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; static int default_i8042_detect(void) { return 1; }; -struct x86_platform_ops x86_platform = { +struct x86_platform_ops x86_platform __ro_after_init = { .calibrate_cpu = native_calibrate_cpu, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, @@ -108,7 +108,7 @@ struct x86_platform_ops x86_platform = { EXPORT_SYMBOL_GPL(x86_platform); #if defined(CONFIG_PCI_MSI) -struct x86_msi_ops x86_msi = { +struct x86_msi_ops x86_msi __ro_after_init = { .setup_msi_irqs = native_setup_msi_irqs, .teardown_msi_irq = native_teardown_msi_irq, .teardown_msi_irqs = default_teardown_msi_irqs, @@ -137,7 +137,7 @@ void arch_restore_msi_irqs(struct pci_dev *dev) } #endif -struct x86_io_apic_ops x86_io_apic_ops = { +struct x86_io_apic_ops x86_io_apic_ops __ro_after_init = { .read = native_io_apic_read, .disable = native_disable_io_apic, }; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index af523d8..1e6b84b 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4961,7 +4961,7 @@ static inline void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -static struct kvm_x86_ops svm_x86_ops = { +static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, .hardware_setup = svm_hardware_setup, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a45d858..87eaa6b 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -11175,7 +11175,7 @@ static void vmx_setup_mce(struct kvm_vcpu *vcpu) ~FEATURE_CONTROL_LMCE; } -static struct kvm_x86_ops vmx_x86_ops = { +static struct kvm_x86_ops vmx_x86_ops __ro_after_init = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, .hardware_setup = hardware_setup, diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index 9770e55..1d97cea 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -120,9 +120,12 @@ static unsigned long __init bios32_service(unsigned long service) static struct { unsigned long address; unsigned short segment; -} pci_indirect = { 0, __KERNEL_CS }; +} pci_indirect __ro_after_init = { + .address = 0, + .segment = __KERNEL_CS, +}; -static int pci_bios_present; +static int pci_bios_present __ro_after_init; static int __init check_pcibios(void) { -- cgit v0.10.2 From 25dfe4785332723f09311dcb7fd91015a60c022f Mon Sep 17 00:00:00 2001 From: Thomas Garnier Date: Wed, 27 Jul 2016 08:59:56 -0700 Subject: x86/mm/64: Enable KASLR for vmemmap memory region Add vmemmap in the list of randomized memory regions. The vmemmap region holds a representation of the physical memory (through a struct page array). An attacker could use this region to disclose the kernel memory layout (walking the page linked list). Signed-off-by: Thomas Garnier Signed-off-by: Kees Cook Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kernel-hardening@lists.openwall.com Link: http://lkml.kernel.org/r/1469635196-122447-1-git-send-email-thgarnie@google.com [ Minor edits. 
] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index 2674ee3..1052a79 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -6,6 +6,7 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY extern unsigned long page_offset_base; extern unsigned long vmalloc_base; +extern unsigned long vmemmap_base; void kernel_randomize_memory(void); #else diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 6fdef9e..3a26420 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -57,11 +57,13 @@ typedef struct { pteval_t pte; } pte_t; #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) #define VMALLOC_SIZE_TB _AC(32, UL) #define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -#define VMEMMAP_START _AC(0xffffea0000000000, UL) +#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) #ifdef CONFIG_RANDOMIZE_MEMORY #define VMALLOC_START vmalloc_base +#define VMEMMAP_START vmemmap_base #else #define VMALLOC_START __VMALLOC_BASE +#define VMEMMAP_START __VMEMMAP_BASE #endif /* CONFIG_RANDOMIZE_MEMORY */ #define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index ec8654f..aec03aa 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -40,17 +40,26 @@ * You need to add an if/def entry if you introduce a new memory region * compatible with KASLR. Your entry must be in logical order with memory * layout. For example, ESPFIX is before EFI because its virtual address is - * before. You also need to add a BUILD_BUG_ON in kernel_randomize_memory to + * before. You also need to add a BUILD_BUG_ON() in kernel_randomize_memory() to + * ensure that this order is correct and won't be changed. */ static const unsigned long vaddr_start = __PAGE_OFFSET_BASE; -static const unsigned long vaddr_end = VMEMMAP_START; + +#if defined(CONFIG_X86_ESPFIX64) +static const unsigned long vaddr_end = ESPFIX_BASE_ADDR; +#elif defined(CONFIG_EFI) +static const unsigned long vaddr_end = EFI_VA_START; +#else +static const unsigned long vaddr_end = __START_KERNEL_map; +#endif /* Default values */ unsigned long page_offset_base = __PAGE_OFFSET_BASE; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base = __VMALLOC_BASE; EXPORT_SYMBOL(vmalloc_base); +unsigned long vmemmap_base = __VMEMMAP_BASE; +EXPORT_SYMBOL(vmemmap_base); /* * Memory regions randomized by KASLR (except modules that use a separate logic @@ -63,6 +72,7 @@ static __initdata struct kaslr_memory_region { } kaslr_regions[] = { { &page_offset_base, 64/* Maximum */ }, { &vmalloc_base, VMALLOC_SIZE_TB }, + { &vmemmap_base, 1 }, }; /* Get size in bytes used by the memory region */ @@ -89,6 +99,18 @@ void __init kernel_randomize_memory(void) struct rnd_state rand_state; unsigned long remain_entropy; + /* + * All these BUILD_BUG_ON checks ensure the memory layout is + * consistent with the vaddr_start/vaddr_end variables. 
+ */ + BUILD_BUG_ON(vaddr_start >= vaddr_end); + BUILD_BUG_ON(config_enabled(CONFIG_X86_ESPFIX64) && + vaddr_end >= EFI_VA_START); + BUILD_BUG_ON((config_enabled(CONFIG_X86_ESPFIX64) || + config_enabled(CONFIG_EFI)) && + vaddr_end >= __START_KERNEL_map); + BUILD_BUG_ON(vaddr_end > __START_KERNEL_map); + if (!kaslr_memory_enabled()) return; -- cgit v0.10.2 From bf255bdaada6d497536aadee5406f6ded318978b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:01 -0500 Subject: x86/dumpstack: Remove show_trace() There is a bewildering array of options for dumping the stack. Simplify things a little by removing show_trace(), which is unused. Signed-off-by: Josh Poimboeuf Reviewed-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/fe02292eac9d409001ec0cf6d06f90ced242570d.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h index 1ef9d58..d318811 100644 --- a/arch/x86/include/asm/kdebug.h +++ b/arch/x86/include/asm/kdebug.h @@ -24,8 +24,6 @@ enum die_val { extern void printk_address(unsigned long address); extern void die(const char *, struct pt_regs *,long); extern int __must_check __die(const char *, struct pt_regs *, long); -extern void show_trace(struct task_struct *t, struct pt_regs *regs, - unsigned long *sp, unsigned long bp); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); extern unsigned long oops_begin(void); diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 92e8f0a..5f49c04 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -182,12 +182,6 @@ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); } -void show_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp) -{ - show_trace_log_lvl(task, regs, stack, bp, ""); -} - void show_stack(struct task_struct *task, unsigned long *sp) { unsigned long bp = 0; -- cgit v0.10.2 From 32541b47bd34940d836fbdf713d16c7ac70d51be Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:02 -0500 Subject: x86/asm/head: Remove unused init_rsp variable extern There is no init_rsp variable. Remove its extern. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c183bbecd5730d84e8c6aff3824537c1c1bf3591.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index b2988c0..3327ffb 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -44,7 +44,6 @@ struct trampoline_header { extern struct real_mode_header *real_mode_header; extern unsigned char real_mode_blob_end[]; -extern unsigned long init_rsp; extern unsigned long initial_code; extern unsigned long initial_gs; -- cgit v0.10.2 From b32f96c75d0dcbb9bf9cc7994e8022c8ce20a668 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:03 -0500 Subject: x86/asm/head: Rename 'stack_start' -> 'initial_stack' The 'stack_start' variable is similar in usage to 'initial_code' and 'initial_gs': they're all stored in head_64.S and they're all updated by SMP and ACPI suspend before starting a CPU. Rename it to 'initial_stack' to be consistent with the others. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/87063d773a3212051b77e17b0ee427f6582a5050.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h index 3327ffb..230e190 100644 --- a/arch/x86/include/asm/realmode.h +++ b/arch/x86/include/asm/realmode.h @@ -46,6 +46,7 @@ extern unsigned char real_mode_blob_end[]; extern unsigned long initial_code; extern unsigned long initial_gs; +extern unsigned long initial_stack; extern unsigned char real_mode_blob[]; extern unsigned char real_mode_relocs[]; diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index ebd0c16..19980b3 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -39,9 +39,6 @@ DECLARE_EARLY_PER_CPU_READ_MOSTLY(u16, x86_bios_cpu_apicid); DECLARE_EARLY_PER_CPU_READ_MOSTLY(int, x86_cpu_to_logical_apicid); #endif -/* Static state in head.S used to set up a CPU */ -extern unsigned long stack_start; /* Initial stack pointer address */ - struct task_struct; struct smp_ops { diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index adb3eaf..4858733 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -99,7 +99,7 @@ int x86_acpi_suspend_lowlevel(void) saved_magic = 0x12345678; #else /* CONFIG_64BIT */ #ifdef CONFIG_SMP - stack_start = (unsigned long)temp_stack + sizeof(temp_stack); + initial_stack = (unsigned long)temp_stack + sizeof(temp_stack); early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(smp_processor_id()); initial_gs = per_cpu_offset(smp_processor_id()); diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 6f8902b..5f40126 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -94,7 +94,7 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE) */ __HEAD ENTRY(startup_32) - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ @@ -286,7 +286,7 @@ num_subarch_entries = (. - subarch_entries) / 4 * start_secondary(). 
*/ ENTRY(start_cpu0) - movl stack_start, %ecx + movl initial_stack, %ecx movl %ecx, %esp jmp *(initial_code) ENDPROC(start_cpu0) @@ -307,7 +307,7 @@ ENTRY(startup_32_smp) movl %eax,%es movl %eax,%fs movl %eax,%gs - movl pa(stack_start),%ecx + movl pa(initial_stack),%ecx movl %eax,%ss leal -__PAGE_OFFSET(%ecx),%esp @@ -703,7 +703,7 @@ ENTRY(initial_page_table) .data .balign 4 -ENTRY(stack_start) +ENTRY(initial_stack) .long init_thread_union+THREAD_SIZE __INITRODATA diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 9f8efc9..e048142 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -66,7 +66,7 @@ startup_64: */ /* - * Setup stack for verify_cpu(). "-8" because stack_start is defined + * Setup stack for verify_cpu(). "-8" because initial_stack is defined * this way, see below. Our best guess is a NULL ptr for stack * termination heuristics and we don't want to break anything which * might depend on it (kgdb, ...). @@ -226,7 +226,7 @@ ENTRY(secondary_startup_64) movq %rax, %cr0 /* Setup a boot time stack */ - movq stack_start(%rip), %rsp + movq initial_stack(%rip), %rsp /* zero EFLAGS after setting rsp */ pushq $0 @@ -310,7 +310,7 @@ ENDPROC(secondary_startup_64) * start_secondary(). */ ENTRY(start_cpu0) - movq stack_start(%rip),%rsp + movq initial_stack(%rip),%rsp movq initial_code(%rip),%rax pushq $0 # fake return address to stop unwinder pushq $__KERNEL_CS # set correct cs @@ -319,15 +319,14 @@ ENTRY(start_cpu0) ENDPROC(start_cpu0) #endif - /* SMP bootup changes these two */ + /* Both SMP bootup and ACPI suspend change these variables */ __REFDATA .balign 8 GLOBAL(initial_code) .quad x86_64_start_kernel GLOBAL(initial_gs) .quad INIT_PER_CPU_VAR(irq_stack_union) - - GLOBAL(stack_start) + GLOBAL(initial_stack) .quad init_thread_union+THREAD_SIZE-8 .word 0 __FINITDATA diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 4296beb..c85d2c6 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -969,7 +969,7 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; - stack_start = idle->thread.sp; + initial_stack = idle->thread.sp; /* * Enable the espfix hack for this CPU -- cgit v0.10.2 From 6225f3232a04a54786f817f3648a1f8cc5920272 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:04 -0500 Subject: x86/dumpstack: Remove extra brackets around "<EOE>" When starting the dump of an exception stack, it shows "<<EOE>>" instead of "<EOE>". print_trace_stack() already adds brackets, no need to add them again. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/77f185fd5b81845869b400aa619415458df6b6cc.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 9ee4520..daf9f63 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -202,7 +202,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, bp = ops->walk_stack(task, stack, bp, ops, data, stack_end, &graph); - ops->stack(data, "<EOE>"); + ops->stack(data, "EOE"); /* * We link to the next stack via the * second-to-last pointer (index -2 to end) in the -- cgit v0.10.2 From ae952ffdfdf986ecd1452d552a69b82cae7b5e58 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:05 -0500 Subject: x86/head: Remove useless zeroed word This zeroed word has no apparent purpose, so remove it. Brian Gerst says: "FYI the word used to be the SS segment selector for the LSS instruction, which isn't needed in 64-bit mode." Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/b056855c295bbb3825b97c1e9f7958539a4d6cf2.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e048142..c98a559 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -328,7 +328,6 @@ ENDPROC(start_cpu0) .quad INIT_PER_CPU_VAR(irq_stack_union) GLOBAL(initial_stack) .quad init_thread_union+THREAD_SIZE-8 - .word 0 __FINITDATA bad_address: -- cgit v0.10.2 From 72b4f6a5e903b071f2a7c4eb1418cbe4eefdc344 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:06 -0500 Subject: x86/dumpstack: Fix x86_32 kernel_stack_pointer() previous stack access On x86_32, when an interrupt happens from kernel space, SS and SP aren't pushed and the existing stack is used. So pt_regs is effectively two words shorter, and the previous stack pointer is normally the memory after the shortened pt_regs, aka '&regs->sp'. But in the rare case where the interrupt hits right after the stack pointer has been changed to point to an empty stack, like for example when call_on_stack() is used, the address immediately after the shortened pt_regs is no longer on the stack. In that case, instead of '&regs->sp', the previous stack pointer should be retrieved from the beginning of the current stack page. kernel_stack_pointer() wants to do that, but it forgets to dereference the pointer. So instead of returning a pointer to the previous stack, it returns a pointer to the beginning of the current stack. Note that it's probably outside of kernel_stack_pointer()'s scope to be switching stacks at all. The x86_64 version of this function doesn't do it, and it would be better for the caller to do it if necessary. But that's a patch for another day. This just fixes the original intent. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Fixes: 0788aa6a23cb ("x86: Prepare removal of previous_esp from i386 thread_info structure") Link: http://lkml.kernel.org/r/472453d6e9f6a2d4ab16aaed4935f43117111566.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f79576a..a1606ea 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -173,8 +173,8 @@ unsigned long kernel_stack_pointer(struct pt_regs *regs) return sp; prev_esp = (u32 *)(context); - if (prev_esp) - return (unsigned long)prev_esp; + if (*prev_esp) + return (unsigned long)*prev_esp; return (unsigned long)regs; } -- cgit v0.10.2 From 8b927d734122f3021c5999aaeffaa2a36ab224c2 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:07 -0500 Subject: proc: Fix return address printk conversion specifier in /proc/<pid>/stack When printing call return addresses found on a stack, /proc/<pid>/stack can sometimes give a confusing result. If the call instruction was the last instruction in the function (which can happen when calling a noreturn function), '%pS' will incorrectly display the name of the function which happens to be next in the object code, rather than the name of the actual calling function. Use '%pB' instead, which was created for this exact purpose. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/47ad2821e5ebdbed1fbf83fb85424ae4fbdf8b6e.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/fs/proc/base.c b/fs/proc/base.c index 54e2702..e9ff186 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -483,7 +483,7 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { - seq_printf(m, "[<%pK>] %pS\n", + seq_printf(m, "[<%pK>] %pB\n", (void *)entries[i], (void *)entries[i]); } unlock_trace(task); -- cgit v0.10.2 From 4950d6d48a0c43cc61d0bbb76fb10e0214b79c66 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 18 Aug 2016 10:59:08 -0500 Subject: x86/dumpstack: Remove 64-byte gap at end of irq stack There has been a 64-byte gap at the end of the irq stack for at least 12 years. It predates git history, and I can't find any good reason for it. Remove it. What's the worst that could happen? Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/14f9281c5475cc44af95945ea7546bff2e3836db.1471535549.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 809eda0..6ef55e8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1281,7 +1281,7 @@ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = EXPORT_PER_CPU_SYMBOL(current_task); DEFINE_PER_CPU(char *, irq_stack_ptr) = - init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE - 64; + init_per_cpu_var(irq_stack_union.irq_stack) + IRQ_STACK_SIZE; DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index daf9f63..066eb5c7 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -103,9 +103,6 @@ in_irq_stack(unsigned long *stack, unsigned long *irq_stack, return (stack >= irq_stack && stack < irq_stack_end); } -static const unsigned long irq_stack_size = - (IRQ_STACK_SIZE - 64) / sizeof(unsigned long); - enum stack_type { STACK_IS_UNKNOWN, STACK_IS_NORMAL, @@ -133,7 +130,7 @@ analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, return STACK_IS_NORMAL; *stack_end = irq_stack; - irq_stack = irq_stack - irq_stack_size; + irq_stack -= (IRQ_STACK_SIZE / sizeof(long)); if (in_irq_stack(stack, irq_stack, *stack_end)) return STACK_IS_IRQ; @@ -256,8 +253,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, preempt_disable(); cpu = smp_processor_id(); - irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); - irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); + irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); /* * Debugging aid: "show_stack(NULL, NULL);" prints the diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 7a40e06..d182799 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -246,7 +246,7 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_X86_64 per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack_union.irq_stack, cpu) + - IRQ_STACK_SIZE - 64; + IRQ_STACK_SIZE; #endif #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = -- cgit v0.10.2 From 556b6723689694ac9134bcc36a07828168e057f4 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 23 Aug 2016 19:23:56 +0200 Subject: x86/entry: Remove outdated comment about SYSCALL targets The comment probably meant some old AMD64 incarnation which most likely never saw the light of day. STAR and LSTAR are two different registers and STAR sets CS/SS(DS) selectors for *all* modes, not only 32-bit. So simply remove that comment. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160823172356.15879-1-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 6ef55e8..e374c19 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1305,11 +1305,6 @@ static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks /* May not be marked __init: used by software suspend */ void syscall_init(void) { - /* - * LSTAR and STAR live in a bit strange symbiosis. - * They both write to the same internal register. STAR allows to - * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. - */ wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); -- cgit v0.10.2 From ba14a194a434ccc8f733e263ad2ce941e35e5787 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Aug 2016 02:35:21 -0700 Subject: fork: Add generic vmalloced stack support If CONFIG_VMAP_STACK=y is selected, kernel stacks are allocated with __vmalloc_node_range(). Grsecurity has had a similar feature (called GRKERNSEC_KSTACKOVERFLOW=y) for a long time. Signed-off-by: Andy Lutomirski Acked-by: Michal Hocko Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Dmitry Vyukov Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/14c07d4fd173a5b117f51e8b939f9f4323e39899.1470907718.git.luto@kernel.org [ Minor edits. ] Signed-off-by: Ingo Molnar diff --git a/arch/Kconfig b/arch/Kconfig index e9c9334..9ecf9f6 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -707,4 +707,38 @@ config ARCH_NO_COHERENT_DMA_MMAP config CPU_NO_EFFICIENT_FFS def_bool n +config HAVE_ARCH_VMAP_STACK + def_bool n + help + An arch should select this symbol if it can support kernel stacks + in vmalloc space. This means: + + - vmalloc space must be large enough to hold many kernel stacks. + This may rule out many 32-bit architectures. + + - Stacks in vmalloc space need to work reliably. For example, if + vmap page tables are created on demand, either this mechanism + needs to work while the stack points to a virtual address with + unpopulated page tables or arch code (switch_to() and switch_mm(), + most likely) needs to ensure that the stack's page table entries + are populated before running on a possibly unpopulated stack. + + - If the stack overflows into a guard page, something reasonable + should happen. The definition of "reasonable" is flexible, but + instantly rebooting without logging anything would be unfriendly. + +config VMAP_STACK + default y + bool "Use a virtually-mapped stack" + depends on HAVE_ARCH_VMAP_STACK && !KASAN + ---help--- + Enable this if you want to use virtually-mapped kernel stacks + with guard pages. This causes kernel stack overflows to be + caught immediately rather than causing difficult-to-diagnose + corruption. + + This is presently incompatible with KASAN because KASAN expects + the stack to map directly to the KASAN shadow map using a formula + that is incorrect if the stack is in vmalloc space. 
+ source "kernel/gcov/Kconfig" diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 29bd597..c702642 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -56,7 +56,7 @@ struct thread_info { #define alloc_thread_stack_node(tsk, node) ((unsigned long *) 0) #define task_thread_info(tsk) ((struct thread_info *) 0) #endif -#define free_thread_stack(ti) /* nothing */ +#define free_thread_stack(tsk) /* nothing */ #define task_stack_page(tsk) ((void *)(tsk)) #define __HAVE_THREAD_FUNCTIONS diff --git a/include/linux/sched.h b/include/linux/sched.h index 62c68e5..20f9f47 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1923,6 +1923,9 @@ struct task_struct { #ifdef CONFIG_MMU struct task_struct *oom_reaper_list; #endif +#ifdef CONFIG_VMAP_STACK + struct vm_struct *stack_vm_area; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* @@ -1939,6 +1942,18 @@ extern int arch_task_struct_size __read_mostly; # define arch_task_struct_size (sizeof(struct task_struct)) #endif +#ifdef CONFIG_VMAP_STACK +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) +{ + return t->stack_vm_area; +} +#else +static inline struct vm_struct *task_stack_vm_area(const struct task_struct *t) +{ + return NULL; +} +#endif + /* Future-safe accessor for struct task_struct's cpus_allowed. */ #define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4..9b85f6b 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -158,19 +158,39 @@ void __weak arch_release_thread_stack(unsigned long *stack) * Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a * kmemcache based allocator. */ -# if THREAD_SIZE >= PAGE_SIZE -static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, - int node) +# if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { +#ifdef CONFIG_VMAP_STACK + void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, + __builtin_return_address(0)); + + /* + * We can't call find_vm_area() in interrupt context, and + * free_thread_stack() can be called in interrupt context, + * so cache the vm_struct. + */ + if (stack) + tsk->stack_vm_area = find_vm_area(stack); + return stack; +#else struct page *page = alloc_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); return page ? 
page_address(page) : NULL; +#endif } -static inline void free_thread_stack(unsigned long *stack) +static inline void free_thread_stack(struct task_struct *tsk) { - __free_pages(virt_to_page(stack), THREAD_SIZE_ORDER); + if (task_stack_vm_area(tsk)) + vfree(tsk->stack); + else + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; @@ -181,9 +201,9 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_thread_stack(unsigned long *stack) +static void free_thread_stack(struct task_struct *tsk) { - kmem_cache_free(thread_stack_cache, stack); + kmem_cache_free(thread_stack_cache, tsk->stack); } void thread_stack_cache_init(void) @@ -213,24 +233,47 @@ struct kmem_cache *vm_area_cachep; /* SLAB cache for mm_struct structures (tsk->mm) */ static struct kmem_cache *mm_cachep; -static void account_kernel_stack(unsigned long *stack, int account) +static void account_kernel_stack(struct task_struct *tsk, int account) { - /* All stack pages are in the same zone and belong to the same memcg. */ - struct page *first_page = virt_to_page(stack); + void *stack = task_stack_page(tsk); + struct vm_struct *vm = task_stack_vm_area(tsk); + + BUILD_BUG_ON(IS_ENABLED(CONFIG_VMAP_STACK) && PAGE_SIZE % 1024 != 0); + + if (vm) { + int i; + + BUG_ON(vm->nr_pages != THREAD_SIZE / PAGE_SIZE); + + for (i = 0; i < THREAD_SIZE / PAGE_SIZE; i++) { + mod_zone_page_state(page_zone(vm->pages[i]), + NR_KERNEL_STACK_KB, + PAGE_SIZE / 1024 * account); + } - mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, - THREAD_SIZE / 1024 * account); + /* All stack pages belong to the same memcg. */ + memcg_kmem_update_page_stat(vm->pages[0], MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } else { + /* + * All stack pages are in the same zone and belong to the + * same memcg. + */ + struct page *first_page = virt_to_page(stack); + + mod_zone_page_state(page_zone(first_page), NR_KERNEL_STACK_KB, + THREAD_SIZE / 1024 * account); - memcg_kmem_update_page_stat( - first_page, MEMCG_KERNEL_STACK_KB, - account * (THREAD_SIZE / 1024)); + memcg_kmem_update_page_stat(first_page, MEMCG_KERNEL_STACK_KB, + account * (THREAD_SIZE / 1024)); + } } void free_task(struct task_struct *tsk) { - account_kernel_stack(tsk->stack, -1); + account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); - free_thread_stack(tsk->stack); + free_thread_stack(tsk); rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -342,6 +385,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) { struct task_struct *tsk; unsigned long *stack; + struct vm_struct *stack_vm_area; int err; if (node == NUMA_NO_NODE) @@ -354,11 +398,23 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) if (!stack) goto free_tsk; + stack_vm_area = task_stack_vm_area(tsk); + err = arch_dup_task_struct(tsk, orig); + + /* + * arch_dup_task_struct() clobbers the stack-related fields. Make + * sure they're properly initialized before using any stack-related + * functions again. 
+ */ + tsk->stack = stack; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = stack_vm_area; +#endif + if (err) goto free_stack; - tsk->stack = stack; #ifdef CONFIG_SECCOMP /* * We must handle setting up seccomp filters once we're under @@ -390,14 +446,14 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; - account_kernel_stack(stack, 1); + account_kernel_stack(tsk, 1); kcov_task_init(tsk); return tsk; free_stack: - free_thread_stack(stack); + free_thread_stack(tsk); free_tsk: free_task_struct(tsk); return NULL; -- cgit v0.10.2 From b4a0f533e5976cb1a79f31d6152e1d322d79b7f1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Aug 2016 02:35:22 -0700 Subject: dma-api: Teach the "DMA-from-stack" check about vmapped stacks If we're using CONFIG_VMAP_STACK=y and we manage to point an sg entry at the stack, then either the sg page will be in highmem or sg_virt() will return the direct-map alias. In neither case will the existing check_for_stack() implementation realize that it's a stack page. Fix it by explicitly checking for stack pages. This has no effect by itself. It's broken out for ease of review. Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/448460622731312298bf19dcbacb1606e75de7a9.1470907718.git.luto@kernel.org [ Minor edits. ] Signed-off-by: Ingo Molnar diff --git a/lib/dma-debug.c b/lib/dma-debug.c index fcfa193..06f02f6 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -1164,11 +1165,32 @@ static void check_unmap(struct dma_debug_entry *ref) put_hash_bucket(bucket, &flags); } -static void check_for_stack(struct device *dev, void *addr) +static void check_for_stack(struct device *dev, + struct page *page, size_t offset) { - if (object_is_on_stack(addr)) - err_printk(dev, NULL, "DMA-API: device driver maps memory from " - "stack [addr=%p]\n", addr); + void *addr; + struct vm_struct *stack_vm_area = task_stack_vm_area(current); + + if (!stack_vm_area) { + /* Stack is direct-mapped. */ + if (PageHighMem(page)) + return; + addr = page_address(page) + offset; + if (object_is_on_stack(addr)) + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [addr=%p]\n", addr); + } else { + /* Stack is vmalloced. 
*/ + int i; + + for (i = 0; i < stack_vm_area->nr_pages; i++) { + if (page != stack_vm_area->pages[i]) + continue; + + addr = (u8 *)current->stack + i * PAGE_SIZE + offset; + err_printk(dev, NULL, "DMA-API: device driver maps memory from stack [probable addr=%p]\n", addr); + break; + } + } } static inline bool overlap(void *addr, unsigned long len, void *start, void *end) @@ -1291,10 +1313,11 @@ void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, if (map_single) entry->type = dma_debug_single; + check_for_stack(dev, page, offset); + if (!PageHighMem(page)) { void *addr = page_address(page) + offset; - check_for_stack(dev, addr); check_for_illegal_area(dev, addr, size); } @@ -1386,8 +1409,9 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg, entry->sg_call_ents = nents; entry->sg_mapped_ents = mapped_ents; + check_for_stack(dev, sg_page(s), s->offset); + if (!PageHighMem(sg_page(s))) { - check_for_stack(dev, sg_virt(s)); check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s)); } -- cgit v0.10.2 From e37e43a497d5a8b7c0cc1736d56986f432c394c9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 11 Aug 2016 02:35:23 -0700 Subject: x86/mm/64: Enable vmapped stacks (CONFIG_HAVE_ARCH_VMAP_STACK=y) This allows x86_64 kernels to enable vmapped stacks by setting HAVE_ARCH_VMAP_STACK=y - which enables the CONFIG_VMAP_STACK=y high level Kconfig option. There are a couple of interesting bits: First, x86 lazily faults in top-level paging entries for the vmalloc area. This won't work if we get a page fault while trying to access the stack: the CPU will promote it to a double-fault and we'll die. To avoid this problem, probe the new stack when switching stacks and forcibly populate the pgd entry for the stack when switching mms. Second, once we have guard pages around the stack, we'll want to detect and handle stack overflow. I didn't enable it on x86_32. We'd need to rework the double-fault code a bit and I'm concerned about running out of vmalloc virtual addresses under some workloads. This patch, by itself, will behave somewhat erratically when the stack overflows while RSP is still more than a few tens of bytes above the bottom of the stack. Specifically, we'll get #PF and make it to no_context and then oops without reliably triggering a double-fault, and no_context doesn't know about stack overflows. The next patch will improve that case. Thank you to Nadav and Brian for helping me pay enough attention to the SDM to hopefully get this right. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Nadav Amit Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c88f3e2920b18e6cc621d772a04a62c06869037e.1470907718.git.luto@kernel.org [ Minor edits. 
] Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c580d8c..21a6d0e 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -94,6 +94,7 @@ config X86 select HAVE_ARCH_TRANSPARENT_HUGEPAGE select HAVE_ARCH_WITHIN_STACK_FRAMES select HAVE_EBPF_JIT if X86_64 + select HAVE_ARCH_VMAP_STACK if X86_64 select HAVE_CC_STACKPROTECTOR select HAVE_CMPXCHG_DOUBLE select HAVE_CMPXCHG_LOCAL diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 8f321a1..14e4b20 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -8,6 +8,28 @@ struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); +/* This runs on the previous thread's stack. */ +static inline void prepare_switch_to(struct task_struct *prev, + struct task_struct *next) +{ +#ifdef CONFIG_VMAP_STACK + /* + * If we switch to a stack that has a top-level paging entry + * that is not present in the current mm, the resulting #PF + * will be promoted to a double-fault and we'll panic. Probe + * the new stack now so that vmalloc_fault can fix up the page + * tables if needed. This can only happen if we use a stack + * in vmap space. + * + * We assume that the stack is aligned so that it never spans + * more than one top-level paging entry. + * + * To minimize cache pollution, just follow the stack pointer. + */ + READ_ONCE(*(unsigned char *)next->thread.sp); +#endif +} + #ifdef CONFIG_X86_32 #ifdef CONFIG_CC_STACKPROTECTOR @@ -39,6 +61,8 @@ do { \ */ \ unsigned long ebx, ecx, edx, esi, edi; \ \ + prepare_switch_to(prev, next); \ + \ asm volatile("pushl %%ebp\n\t" /* save EBP */ \ "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ @@ -103,7 +127,9 @@ do { \ * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL * has no effect. */ -#define switch_to(prev, next, last) \ +#define switch_to(prev, next, last) \ + prepare_switch_to(prev, next); \ + \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index b70ca12..907b4e4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -292,12 +292,30 @@ DO_ERROR(X86_TRAP_NP, SIGBUS, "segment not present", segment_not_present) DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) +#ifdef CONFIG_VMAP_STACK +static void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address) +{ + printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", + (void *)fault_address, current->stack, + (char *)current->stack + THREAD_SIZE - 1); + die(message, regs, 0); + + /* Be absolutely certain we don't return. 
*/ panic(message); } #endif #ifdef CONFIG_X86_64 /* Runs on IST stack */ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) { static const char str[] = "double fault"; struct task_struct *tsk = current; +#ifdef CONFIG_VMAP_STACK + unsigned long cr2; +#endif #ifdef CONFIG_X86_ESPFIX64 extern unsigned char native_irq_return_iret[]; @@ -332,6 +350,49 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_DF; +#ifdef CONFIG_VMAP_STACK + /* + * If we overflow the stack into a guard page, the CPU will fail + * to deliver #PF and will send #DF instead. Similarly, if we + * take any non-IST exception while too close to the bottom of + * the stack, the processor will get a page fault while + * delivering the exception and will generate a double fault. + * + * According to the SDM (footnote in 6.15 under "Interrupt 14 - + * Page-Fault Exception (#PF)"): + * + * Processors update CR2 whenever a page fault is detected. If a + * second page fault occurs while an earlier page fault is being + * delivered, the faulting linear address of the second fault will + * overwrite the contents of CR2 (replacing the previous + * address). These updates to CR2 occur even if the page fault + * results in a double fault or occurs during the delivery of a + * double fault. + * + * The logic below has a small possibility of incorrectly diagnosing + * some errors as stack overflows. For example, if the IDT or GDT + * gets corrupted such that #GP delivery fails due to a bad descriptor + * causing #GP and we hit this condition while CR2 coincidentally + * points to the stack guard page, we'll think we overflowed the + * stack. Given that we're going to panic one way or another + * if this happens, this isn't necessarily worth fixing. + * + * If necessary, we could improve the test by only diagnosing + * a stack overflow if the saved RSP points within 47 bytes of + * the bottom of the stack: if RSP == tsk_stack + 48 and we + * take an exception, the stack is already aligned and there + * will be enough room for SS, RSP, RFLAGS, CS, RIP, and a + * possible error code, so a stack overflow would *not* double + * fault. With any less space left, exception delivery could + * fail, and, as a practical matter, we've overflowed the + * stack even if the actual trigger for the double fault was + * something else. + */ + cr2 = read_cr2(); + if ((unsigned long)task_stack_page(tsk) - 1 - cr2 < PAGE_SIZE) + handle_stack_overflow("kernel stack overflow (double-fault)", regs, cr2); +#endif + #ifdef CONFIG_DOUBLEFAULT df_debug(regs, error_code); #endif diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 4dbe656..a7655f6 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -77,10 +77,25 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, unsigned cpu = smp_processor_id(); if (likely(prev != next)) { + if (IS_ENABLED(CONFIG_VMAP_STACK)) { + /* + * If our current stack is in vmalloc space and isn't + * mapped in the new pgd, we'll double-fault. Forcibly + * map it. 
+ */ + unsigned int stack_pgd_index = pgd_index(current_stack_pointer()); + + pgd_t *pgd = next->pgd + stack_pgd_index; + + if (unlikely(pgd_none(*pgd))) + set_pgd(pgd, init_mm.pgd[stack_pgd_index]); + } + #ifdef CONFIG_SMP this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK); this_cpu_write(cpu_tlbstate.active_mm, next); #endif + cpumask_set_cpu(cpu, mm_cpumask(next)); /* -- cgit v0.10.2 From e4a744ef2fef5c803348b650a3a2d01da7797a9b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:55 -0500 Subject: ftrace: Remove CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST from config Make HAVE_FUNCTION_GRAPH_FP_TEST a normal define, independent from kconfig. This removes some config file pollution and simplifies the checking for the fp test. Suggested-by: Steven Rostedt Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2c4e5f05054d6d367f702fd153af7a0109dd5c81.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S index 0f03a8f..aef02d2 100644 --- a/arch/arm64/kernel/entry-ftrace.S +++ b/arch/arm64/kernel/entry-ftrace.S @@ -219,7 +219,7 @@ ENDPROC(ftrace_graph_caller) * * Run ftrace_return_to_handler() before going back to parent. * @fp is checked against the value passed by ftrace_graph_caller() - * only when CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST is enabled. + * only when HAVE_FUNCTION_GRAPH_FP_TEST is enabled. */ ENTRY(return_to_handler) save_return_regs diff --git a/arch/blackfin/kernel/ftrace-entry.S b/arch/blackfin/kernel/ftrace-entry.S index 28d0595..3b8bdcb 100644 --- a/arch/blackfin/kernel/ftrace-entry.S +++ b/arch/blackfin/kernel/ftrace-entry.S @@ -169,7 +169,7 @@ ENTRY(_ftrace_graph_caller) r0 = sp; /* unsigned long *parent */ r1 = [sp]; /* unsigned long self_addr */ # endif -# ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +# ifdef HAVE_FUNCTION_GRAPH_FP_TEST r2 = fp; /* unsigned long frame_pointer */ # endif r0 += 16; /* skip the 4 local regs on stack */ @@ -190,7 +190,7 @@ ENTRY(_return_to_handler) [--sp] = r1; /* get original return address */ -# ifdef CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST +# ifdef HAVE_FUNCTION_GRAPH_FP_TEST r0 = fp; /* Blackfin is sane, so omit this */ # endif call _ftrace_return_to_handler; diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 59b0960..f5d60f1 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -56,7 +56,6 @@ config SPARC64 def_bool 64BIT select HAVE_FUNCTION_TRACER select HAVE_FUNCTION_GRAPH_TRACER - select HAVE_FUNCTION_GRAPH_FP_TEST select HAVE_KRETPROBES select HAVE_KPROBES select HAVE_RCU_TABLE_FREE if SMP diff --git a/arch/sparc/include/asm/ftrace.h b/arch/sparc/include/asm/ftrace.h index 3192a8e..62755a3 100644 --- a/arch/sparc/include/asm/ftrace.h +++ b/arch/sparc/include/asm/ftrace.h @@ -9,6 +9,10 @@ void _mcount(void); #endif +#endif /* CONFIG_MCOUNT */ + +#if defined(CONFIG_SPARC64) && !defined(CC_USE_FENTRY) +#define HAVE_FUNCTION_GRAPH_FP_TEST #endif #ifdef CONFIG_DYNAMIC_FTRACE diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 21a6d0e..ce8860c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -111,7 +111,6 @@ config X86 select HAVE_EXIT_THREAD select HAVE_FENTRY if X86_64 select HAVE_FTRACE_MCOUNT_RECORD - select HAVE_FUNCTION_GRAPH_FP_TEST select 
HAVE_FUNCTION_GRAPH_TRACER select HAVE_FUNCTION_TRACER select HAVE_GCC_PLUGINS diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index a4820d4..37f67cb 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -6,6 +6,7 @@ # define MCOUNT_ADDR ((unsigned long)(__fentry__)) #else # define MCOUNT_ADDR ((unsigned long)(mcount)) +# define HAVE_FUNCTION_GRAPH_FP_TEST #endif #define MCOUNT_INSN_SIZE 5 /* sizeof mcount call */ diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f4b86e8..ba33267 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -24,11 +24,6 @@ config HAVE_FUNCTION_GRAPH_TRACER help See Documentation/trace/ftrace-design.txt -config HAVE_FUNCTION_GRAPH_FP_TEST - bool - help - See Documentation/trace/ftrace-design.txt - config HAVE_DYNAMIC_FTRACE bool help diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 7363ccf..fc173cd 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -204,7 +204,7 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret, return; } -#if defined(CONFIG_HAVE_FUNCTION_GRAPH_FP_TEST) && !defined(CC_USING_FENTRY) +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST /* * The arch may choose to record the frame pointer used * and check it here to make sure that it is what we expect it -- cgit v0.10.2 From daa460a88c09b26b68e8b017de589c217e901afb Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:56 -0500 Subject: ftrace: Only allocate the ret_stack 'fp' field when needed This saves some memory when HAVE_FUNCTION_GRAPH_FP_TEST isn't defined. On x86_64 with newer versions of gcc which have -mfentry, it saves 400 bytes per task. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5c7747d9ea7b5cb47ef0a8ce8a6cea6bf7aa94bf.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 7d565af..4ad9ccc 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -795,7 +795,9 @@ struct ftrace_ret_stack { unsigned long func; unsigned long long calltime; unsigned long long subtime; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; +#endif }; /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index fc173cd..0e03ed0 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -171,7 +171,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; current->ret_stack[index].subtime = 0; +#ifdef HAVE_FUNCTION_GRAPH_FP_TEST current->ret_stack[index].fp = frame_pointer; +#endif *depth = current->curr_ret_stack; return 0; -- cgit v0.10.2 From 9a7c348ba6a46f6270d4fe49577649dad5664fe7 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:57 -0500 Subject: ftrace: Add return address pointer to ftrace_ret_stack Storing this value will help prevent unwinders from getting out of sync with the function graph tracer ret_stack. Now instead of needing a stateful iterator, they can compare the return address pointer to find the right ret_stack entry. 
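To make the mechanism concrete, here is a minimal sketch of the pointer-matching lookup an unwinder can do once 'retp' is recorded. This is illustrative only, not part of the patch: the helper name unhook_ret_addr is made up, and it assumes an arch that defines HAVE_FUNCTION_GRAPH_RET_ADDR_PTR so that ret_stack entries carry a 'retp' field.

	/*
	 * Sketch: resolve a return address that the graph tracer may have
	 * replaced with return_to_handler, by matching the address of the
	 * stack slot it was loaded from against the recorded 'retp' values.
	 */
	static unsigned long unhook_ret_addr(struct task_struct *task,
					     unsigned long addr,
					     unsigned long *retp)
	{
		int i;

		if (addr != (unsigned long)return_to_handler)
			return addr;	/* not hooked; nothing to do */

		if (!task->ret_stack)
			return addr;

		for (i = 0; i <= task->curr_ret_stack; i++)
			if (task->ret_stack[i].retp == retp)
				return task->ret_stack[i].ret;

		return addr;	/* no match; report the hook itself */
	}

Because the lookup keys on where the return address lives rather than on how many entries the unwinder has already consumed, skipping a frame cannot desynchronize it. The real helper, ftrace_graph_ret_addr(), is added in the following patch and additionally compensates for the FTRACE_NOTRACE_DEPTH bias in curr_ret_stack.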
Note that an array of 50 ftrace_ret_stack structs is allocated for every task. So when an arch implements this, it will add either 200 or 400 bytes of memory usage per task (depending on whether it's a 32-bit or 64-bit platform). Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a95cfcc39e8f26b89a430c56926af0bb217bc0a1.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/Documentation/trace/ftrace-design.txt b/Documentation/trace/ftrace-design.txt index dd5f916..a273dd0 100644 --- a/Documentation/trace/ftrace-design.txt +++ b/Documentation/trace/ftrace-design.txt @@ -203,6 +203,17 @@ along to ftrace_push_return_trace() instead of a stub value of 0. Similarly, when you call ftrace_return_to_handler(), pass it the frame pointer. +HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +-------------------------------- + +An arch may pass in a pointer to the return address on the stack. This +prevents potential stack unwinding issues where the unwinder gets out of +sync with ret_stack and the wrong addresses are reported by +ftrace_graph_ret_addr(). + +Adding support for it is easy: just define the macro in asm/ftrace.h and +pass the return address pointer as the 'retp' argument to +ftrace_push_return_trace(). HAVE_FTRACE_NMI_ENTER --------------------- diff --git a/arch/arm/kernel/ftrace.c b/arch/arm/kernel/ftrace.c index 709ee1d..3f17594 100644 --- a/arch/arm/kernel/ftrace.c +++ b/arch/arm/kernel/ftrace.c @@ -218,7 +218,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, } err = ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer); + frame_pointer, NULL); if (err == -EBUSY) { *parent = old; return; diff --git a/arch/arm64/kernel/ftrace.c b/arch/arm64/kernel/ftrace.c index ebecf9a..40ad08a 100644 --- a/arch/arm64/kernel/ftrace.c +++ b/arch/arm64/kernel/ftrace.c @@ -138,7 +138,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, return; err = ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer); + frame_pointer, NULL); if (err == -EBUSY) return; else diff --git a/arch/blackfin/kernel/ftrace.c b/arch/blackfin/kernel/ftrace.c index 095de0f..8dad758 100644 --- a/arch/blackfin/kernel/ftrace.c +++ b/arch/blackfin/kernel/ftrace.c @@ -107,7 +107,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, return; if (ftrace_push_return_trace(*parent, self_addr, &trace.depth, - frame_pointer) == -EBUSY) + frame_pointer, NULL) == -EBUSY) return; trace.func = self_addr; diff --git a/arch/microblaze/kernel/ftrace.c b/arch/microblaze/kernel/ftrace.c index fc7b48a..d57563c 100644 --- a/arch/microblaze/kernel/ftrace.c +++ b/arch/microblaze/kernel/ftrace.c @@ -63,7 +63,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0); + err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0, NULL); if (err == -EBUSY) { *parent = old; return; diff --git a/arch/mips/kernel/ftrace.c b/arch/mips/kernel/ftrace.c index 937c54b..30a3b75 100644 --- a/arch/mips/kernel/ftrace.c +++ b/arch/mips/kernel/ftrace.c @@ -382,8 +382,8 @@ void prepare_ftrace_return(unsigned long *parent_ra_addr, unsigned long self_ra, if (unlikely(faulted)) goto 
out; - if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp) - == -EBUSY) { + if (ftrace_push_return_trace(old_parent_ra, self_ra, &trace.depth, fp, + NULL) == -EBUSY) { *parent_ra_addr = old_parent_ra; return; } diff --git a/arch/parisc/kernel/ftrace.c b/arch/parisc/kernel/ftrace.c index a828a0a..5a5506a 100644 --- a/arch/parisc/kernel/ftrace.c +++ b/arch/parisc/kernel/ftrace.c @@ -48,7 +48,7 @@ static void __hot prepare_ftrace_return(unsigned long *parent, return; if (ftrace_push_return_trace(old, self_addr, &trace.depth, - 0 ) == -EBUSY) + 0, NULL) == -EBUSY) return; /* activate parisc_return_to_handler() as return point */ diff --git a/arch/powerpc/kernel/ftrace.c b/arch/powerpc/kernel/ftrace.c index cc52d97..a95639b 100644 --- a/arch/powerpc/kernel/ftrace.c +++ b/arch/powerpc/kernel/ftrace.c @@ -593,7 +593,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) if (!ftrace_graph_entry(&trace)) goto out; - if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY) + if (ftrace_push_return_trace(parent, ip, &trace.depth, 0, + NULL) == -EBUSY) goto out; parent = return_hooker; diff --git a/arch/s390/kernel/ftrace.c b/arch/s390/kernel/ftrace.c index 0f7bfeb..60a8a4e 100644 --- a/arch/s390/kernel/ftrace.c +++ b/arch/s390/kernel/ftrace.c @@ -209,7 +209,8 @@ unsigned long prepare_ftrace_return(unsigned long parent, unsigned long ip) /* Only trace if the calling function expects to. */ if (!ftrace_graph_entry(&trace)) goto out; - if (ftrace_push_return_trace(parent, ip, &trace.depth, 0) == -EBUSY) + if (ftrace_push_return_trace(parent, ip, &trace.depth, 0, + NULL) == -EBUSY) goto out; parent = (unsigned long) return_to_handler; out: diff --git a/arch/sh/kernel/ftrace.c b/arch/sh/kernel/ftrace.c index 38993e0..95eccd4 100644 --- a/arch/sh/kernel/ftrace.c +++ b/arch/sh/kernel/ftrace.c @@ -382,7 +382,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr) return; } - err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0); + err = ftrace_push_return_trace(old, self_addr, &trace.depth, 0, NULL); if (err == -EBUSY) { __raw_writel(old, parent); return; diff --git a/arch/sparc/kernel/ftrace.c b/arch/sparc/kernel/ftrace.c index 0a2d2dd..6bcff69 100644 --- a/arch/sparc/kernel/ftrace.c +++ b/arch/sparc/kernel/ftrace.c @@ -131,7 +131,7 @@ unsigned long prepare_ftrace_return(unsigned long parent, return parent + 8UL; if (ftrace_push_return_trace(parent, self_addr, &trace.depth, - frame_pointer) == -EBUSY) + frame_pointer, NULL) == -EBUSY) return parent + 8UL; trace.func = self_addr; diff --git a/arch/tile/kernel/ftrace.c b/arch/tile/kernel/ftrace.c index 4a57208..b827a41 100644 --- a/arch/tile/kernel/ftrace.c +++ b/arch/tile/kernel/ftrace.c @@ -184,7 +184,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr, *parent = return_hooker; err = ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer); + frame_pointer, NULL); if (err == -EBUSY) { *parent = old; return; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index d036cfb..ae3b1fb 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, } if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer) == -EBUSY) { + frame_pointer, NULL) == -EBUSY) { *parent = old; return; } diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 4ad9ccc..483e02a 100644 --- a/include/linux/ftrace.h +++ 
b/include/linux/ftrace.h @@ -798,6 +798,9 @@ struct ftrace_ret_stack { #ifdef HAVE_FUNCTION_GRAPH_FP_TEST unsigned long fp; #endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + unsigned long *retp; +#endif }; /* @@ -809,7 +812,7 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer); + unsigned long frame_pointer, unsigned long *retp); /* * Sometimes we don't want to trace a function with the function diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 0e03ed0..f7212ec 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -119,7 +119,7 @@ print_graph_duration(struct trace_array *tr, unsigned long long duration, /* Add a function return address to the trace stack on thread info.*/ int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, - unsigned long frame_pointer) + unsigned long frame_pointer, unsigned long *retp) { unsigned long long calltime; int index; @@ -174,6 +174,9 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, #ifdef HAVE_FUNCTION_GRAPH_FP_TEST current->ret_stack[index].fp = frame_pointer; #endif +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + current->ret_stack[index].retp = retp; +#endif *depth = current->curr_ret_stack; return 0; -- cgit v0.10.2 From 223918e32a87c79ac55ca4aa513ba405ba4d57cd Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:58 -0500 Subject: ftrace: Add ftrace_graph_ret_addr() stack unwinding helpers When function graph tracing is enabled for a function, ftrace modifies the stack by replacing the original return address with the address of a hook function (return_to_handler). Stack unwinders need a way to get the original return address. Add an arch-independent helper function for that named ftrace_graph_ret_addr(). This adds two variations of the function: one depends on HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, and the other relies on an index state variable. The former is recommended because, in some cases, the latter can cause problems when the unwinder skips stack frames. It can get out of sync with the ret_stack index and wrong addresses can be reported for the stack trace. Once all arches have been ported to use HAVE_FUNCTION_GRAPH_RET_ADDR_PTR, we can get rid of the distinction. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/36bd90f762fc5e5af3929e3797a68a64906421cf.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 483e02a..6f93ac4 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -814,6 +814,9 @@ extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth, unsigned long frame_pointer, unsigned long *retp); +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp); + /* * Sometimes we don't want to trace a function with the function * graph tracer but we want them to keep traced by the usual function @@ -875,6 +878,13 @@ static inline int task_curr_ret_stack(struct task_struct *tsk) return -1; } +static inline unsigned long +ftrace_graph_ret_addr(struct task_struct *task, int *idx, unsigned long ret, + unsigned long *retp) +{ + return ret; +} + static inline void pause_graph_tracing(void) { } static inline void unpause_graph_tracing(void) { } #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index f7212ec..0cbe38a 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -284,6 +284,64 @@ unsigned long ftrace_return_to_handler(unsigned long frame_pointer) return ret; } +/** + * ftrace_graph_ret_addr - convert a potentially modified stack return address + * to its original value + * + * This function can be called by stack unwinding code to convert a found stack + * return address ('ret') to its original value, in case the function graph + * tracer has modified it to be 'return_to_handler'. If the address hasn't + * been modified, the unchanged value of 'ret' is returned. + * + * 'idx' is a state variable which should be initialized by the caller to zero + * before the first call. + * + * 'retp' is a pointer to the return address on the stack. It's ignored if + * the arch doesn't have HAVE_FUNCTION_GRAPH_RET_ADDR_PTR defined. 
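+ *
+ * Illustrative usage (a hypothetical caller, matching how the x86
+ * dump_trace() callbacks use it later in this series): an unwinder
+ * that read a return address 'addr' out of the stack slot 'retp'
+ * would do
+ *
+ *	addr = ftrace_graph_ret_addr(task, &graph_idx, addr, retp);
+ *
+ * where 'graph_idx' is an int initialized to zero before the walk.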
+ */ +#ifdef HAVE_FUNCTION_GRAPH_RET_ADDR_PTR +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int index = task->curr_ret_stack; + int i; + + if (ret != (unsigned long)return_to_handler) + return ret; + + if (index < -1) + index += FTRACE_NOTRACE_DEPTH; + + if (index < 0) + return ret; + + for (i = 0; i <= index; i++) + if (task->ret_stack[i].retp == retp) + return task->ret_stack[i].ret; + + return ret; +} +#else /* !HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ +unsigned long ftrace_graph_ret_addr(struct task_struct *task, int *idx, + unsigned long ret, unsigned long *retp) +{ + int task_idx; + + if (ret != (unsigned long)return_to_handler) + return ret; + + task_idx = task->curr_ret_stack; + + if (!task->ret_stack || task_idx < *idx) + return ret; + + task_idx -= *idx; + (*idx)++; + + return task->ret_stack[task_idx].ret; +} +#endif /* HAVE_FUNCTION_GRAPH_RET_ADDR_PTR */ + int __trace_graph_entry(struct trace_array *tr, struct ftrace_graph_ent *trace, unsigned long flags, -- cgit v0.10.2 From 408fe5de2f2767059a9561e0ae6d4385d1b39dac Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:52:59 -0500 Subject: x86/dumpstack/ftrace: Convert dump_trace() callbacks to use ftrace_graph_ret_addr() Convert print_context_stack() and print_context_stack_bp() to use the arch-independent ftrace_graph_ret_addr() helper. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/56ec97cafc1bf2e34d1119e6443d897db406da86.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 5f49c04..9bf3d02 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -38,38 +38,6 @@ void printk_address(unsigned long address) pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); } -#ifdef CONFIG_FUNCTION_GRAPH_TRACER -static void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct task_struct *task, int *graph) -{ - unsigned long ret_addr; - int index; - - if (addr != (unsigned long)return_to_handler) - return; - - index = task->curr_ret_stack; - - if (!task->ret_stack || index < *graph) - return; - - index -= *graph; - ret_addr = task->ret_stack[index].ret; - - ops->address(data, ret_addr, 1); - - (*graph)++; -} -#else -static inline void -print_ftrace_graph_addr(unsigned long addr, void *data, - const struct stacktrace_ops *ops, - struct task_struct *task, int *graph) -{ } -#endif - /* * x86-64 can have up to three kernel stacks: * process stack @@ -107,18 +75,24 @@ print_context_stack(struct task_struct *task, stack = (unsigned long *)task_stack_page(task); while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { - unsigned long addr; + unsigned long addr = *stack; - addr = *stack; if (__kernel_text_address(addr)) { + unsigned long real_addr; + int reliable = 0; + if ((unsigned long) stack == bp + sizeof(long)) { - ops->address(data, addr, 1); + reliable = 1; frame = frame->next_frame; bp = (unsigned long) frame; - } else { - ops->address(data, addr, 0); } - print_ftrace_graph_addr(addr, data, ops, task, graph); + + ops->address(data, addr, reliable); + + real_addr = ftrace_graph_ret_addr(task, graph, addr, + stack); 
+ if (real_addr != addr) + ops->address(data, real_addr, 1); } stack++; } @@ -133,19 +107,24 @@ print_context_stack_bp(struct task_struct *task, unsigned long *end, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long *ret_addr = &frame->return_address; + unsigned long *retp = &frame->return_address; - while (valid_stack_ptr(task, ret_addr, sizeof(*ret_addr), end)) { - unsigned long addr = *ret_addr; + while (valid_stack_ptr(task, retp, sizeof(*retp), end)) { + unsigned long addr = *retp; + unsigned long real_addr; if (!__kernel_text_address(addr)) break; if (ops->address(data, addr, 1)) break; + + real_addr = ftrace_graph_ret_addr(task, graph, addr, retp); + if (real_addr != addr) + ops->address(data, real_addr, 1); + frame = frame->next_frame; - ret_addr = &frame->return_address; - print_ftrace_graph_addr(addr, data, ops, task, graph); + retp = &frame->return_address; } return (unsigned long)frame; -- cgit v0.10.2 From 471bd10f5e2880bd91a2627d887f6062494cfe9c Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:53:00 -0500 Subject: ftrace/x86: Implement HAVE_FUNCTION_GRAPH_RET_ADDR_PTR Use the more reliable version of ftrace_graph_ret_addr() so we no longer have to worry about the unwinder getting out of sync with the function graph ret_stack index, which can happen if the unwinder skips any frames before calling ftrace_graph_ret_addr(). This fixes this issue (and several others like it): $ cat /proc/self/stack [] save_stack_trace_tsk+0x22/0x40 [] proc_pid_stack+0xb9/0x110 [] proc_single_show+0x54/0x80 [] seq_read+0x108/0x3e0 [] __vfs_read+0x37/0x140 [] vfs_read+0x99/0x140 [] SyS_read+0x58/0xc0 [] entry_SYSCALL_64_fastpath+0x1f/0xbd [] 0xffffffffffffffff $ echo function_graph > /sys/kernel/debug/tracing/current_tracer $ cat /proc/self/stack [] return_to_handler+0x0/0x27 [] print_context_stack+0xfc/0x100 [] return_to_handler+0x0/0x27 [] dump_trace+0x12b/0x350 [] return_to_handler+0x0/0x27 [] save_stack_trace_tsk+0x22/0x40 [] return_to_handler+0x0/0x27 [] proc_pid_stack+0xb9/0x110 [] return_to_handler+0x0/0x27 [] proc_single_show+0x54/0x80 [] return_to_handler+0x0/0x27 [] seq_read+0x108/0x3e0 [] return_to_handler+0x0/0x27 [] __vfs_read+0x37/0x140 [] return_to_handler+0x0/0x27 [] vfs_read+0x99/0x140 [] 0xffffffffffffffff Enabling function graph tracing causes the stack trace to change in two ways: First, the real call addresses are confusingly interspersed with 'return_to_handler' addresses. This issue will be fixed by the next patch. Second, the stack trace is offset by two frames, because the unwinder skipped the first two frames and got out of sync with the ret_stack index. This patch fixes this issue. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a6d623e36f8d08f9a17bd74d804d201177a23afd.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h index 37f67cb..eccd0ac 100644 --- a/arch/x86/include/asm/ftrace.h +++ b/arch/x86/include/asm/ftrace.h @@ -14,6 +14,8 @@ #define ARCH_SUPPORTS_FTRACE_OPS 1 #endif +#define HAVE_FUNCTION_GRAPH_RET_ADDR_PTR + #ifndef __ASSEMBLY__ extern void mcount(void); extern atomic_t modifying_ftrace_code; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ae3b1fb..8639bb2 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -1029,7 +1029,7 @@ void prepare_ftrace_return(unsigned long self_addr, unsigned long *parent, } if (ftrace_push_return_trace(old, self_addr, &trace.depth, - frame_pointer, NULL) == -EBUSY) { + frame_pointer, parent) == -EBUSY) { *parent = old; return; } -- cgit v0.10.2 From 6f727b84e23421721025f4eb1b4f6cea1d4d723a Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:53:01 -0500 Subject: x86/dumpstack/ftrace: Mark function graph handler function as unreliable When function graph tracing is enabled for a function, its return address on the stack is replaced with the address of an ftrace handler (return_to_handler). Currently 'return_to_handler' can be reported as reliable. That's not ideal, and can actually be misleading. When saving or dumping the stack, you normally only care about what led up to that point (the call path), rather than what will happen in the future (the return path). That's especially true in the non-oops stack trace case, which isn't used for debugging. For example, in a perf profiling operation, reporting return_to_handler() in the trace would just be confusing. And in the oops case, where debugging is important, "unreliable" is also more appropriate there because it serves as a hint that graph tracing was involved, instead of trying to imply that return_to_handler() was the real caller. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f8af15749c7d632d3e7f815995831d5b7f82950d.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 9bf3d02..6aad8d4 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -87,12 +87,21 @@ print_context_stack(struct task_struct *task, bp = (unsigned long) frame; } - ops->address(data, addr, reliable); - + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. 
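+	 * (Aside: in an oops backtrace, unreliable entries are the
+	 * ones printed with a leading '?', so the hint appears as
+	 * '? return_to_handler' immediately before the real address.)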
+ */ real_addr = ftrace_graph_ret_addr(task, graph, addr, stack); if (real_addr != addr) - ops->address(data, real_addr, 1); + ops->address(data, addr, 0); + + ops->address(data, real_addr, reliable); } stack++; } @@ -116,12 +125,11 @@ print_context_stack_bp(struct task_struct *task, if (!__kernel_text_address(addr)) break; - if (ops->address(data, addr, 1)) - break; - real_addr = ftrace_graph_ret_addr(task, graph, addr, retp); - if (real_addr != addr) - ops->address(data, real_addr, 1); + if (real_addr != addr && ops->address(data, addr, 0)) + break; + if (ops->address(data, real_addr, 1)) + break; frame = frame->next_frame; retp = &frame->return_address; -- cgit v0.10.2 From 13e25bab7e51bdd4ba7df1ef2388961294bb565e Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 19 Aug 2016 06:53:02 -0500 Subject: x86/dumpstack/ftrace: Don't print unreliable addresses in print_context_stack_bp() When function graph tracing is enabled, print_context_stack_bp() can report return_to_handler() as an unreliable address, which is confusing and misleading: return_to_handler() is really only useful as a hint for debugging, whereas print_context_stack_bp() users only care about the actual 'reliable' call path. Signed-off-by: Josh Poimboeuf Acked-by: Steven Rostedt Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c51aef578d8027791b38d2ad9bac0c7f499fde91.1471607358.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6aad8d4..01072e9 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -126,8 +126,6 @@ print_context_stack_bp(struct task_struct *task, break; real_addr = ftrace_graph_ret_addr(task, graph, addr, retp); - if (real_addr != addr && ops->address(data, addr, 0)) - break; if (ops->address(data, real_addr, 1)) break; -- cgit v0.10.2 From 4e047aa7f267c3449b6d323510d35864829aca70 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:16 -0400 Subject: sched/x86/32, kgdb: Don't use thread.ip in sleeping_thread_to_gdb_regs() Match 64-bit and set gdb_regs[GDB_PC] to zero. thread.ip is always the same point in the scheduler (except for newly forked processes), and will be removed in a future patch. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jason Wessel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 04cde52..fe649a5 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -172,7 +172,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_ES] = __KERNEL_DS; gdb_regs[GDB_PS] = 0; gdb_regs[GDB_CS] = __KERNEL_CS; - gdb_regs[GDB_PC] = p->thread.ip; gdb_regs[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; @@ -180,7 +179,6 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; @@ -190,6 +188,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_R14] = 0; gdb_regs[GDB_R15] = 0; #endif + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_SP] = p->thread.sp; } -- cgit v0.10.2 From 163630191ecb0dd9e4146d3c910045aba1cfeec1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:17 -0400 Subject: sched/x86/64, kgdb: Clear GDB_PS on 64-bit switch_to() no longer saves EFLAGS, so it's bogus to look for it on the stack. Set it to zero like 32-bit. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jason Wessel Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-3-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index fe649a5..5e3f294 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -176,7 +176,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_FS] = 0xFFFF; gdb_regs[GDB_GS] = 0xFFFF; #else - gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); + gdb_regs32[GDB_PS] = 0; gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; gdb_regs[GDB_R8] = 0; -- cgit v0.10.2 From 7b32aeadbc95d4a41402c1c0da6aa3ab51af4c10 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:18 -0400 Subject: sched/x86: Add 'struct inactive_task_frame' to better document the sleeping task stack frame Add 'struct inactive_task_frame', which defines the layout of the stack for a sleeping process. For now, the only defined field is the BP register (frame pointer). Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 0944218..7646fb2 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -8,6 +8,7 @@ #include #include +#include <asm/switch_to.h> extern int kstack_depth_to_print; @@ -70,8 +71,7 @@ stack_frame(struct task_struct *task, struct pt_regs *regs) return bp; } - /* bp is the last reg pushed by switch_to */ - return *(unsigned long *)task->thread.sp; + return ((struct inactive_task_frame *)task->thread.sp)->bp; } #else static inline unsigned long diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 14e4b20..ec689c6 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -30,6 +30,11 @@ static inline void prepare_switch_to(struct task_struct *prev, #endif } +/* data that is pointed to by thread.sp */ +struct inactive_task_frame { + unsigned long bp; +}; + #ifdef CONFIG_X86_32 #ifdef CONFIG_CC_STACKPROTECTOR diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 5e3f294..8e36f24 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -50,6 +50,7 @@ #include #include #include +#include <asm/switch_to.h> struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { @@ -166,7 +167,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_DX] = 0; gdb_regs[GDB_SI] = 0; gdb_regs[GDB_DI] = 0; - gdb_regs[GDB_BP] = *(unsigned long *)p->thread.sp; + gdb_regs[GDB_BP] = ((struct inactive_task_frame *)p->thread.sp)->bp; #ifdef CONFIG_X86_32 gdb_regs[GDB_DS] = __KERNEL_DS; gdb_regs[GDB_ES] = __KERNEL_DS; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 62c0b0e..0115a4a 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -32,6 +32,7 @@ #include #include #include +#include <asm/switch_to.h> /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -556,7 +557,7 @@ unsigned long get_wchan(struct task_struct *p) if (sp < bottom || sp > top) return 0; - fp = READ_ONCE_NOCHECK(*(unsigned long *)sp); + fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); do { if (fp < bottom || fp > top) return 0; -- cgit v0.10.2 From 0100301bfdf56a2a370c7157b5ab0fbf9313e1cd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:19 -0400 Subject: sched/x86: Rewrite the switch_to() code Move the low-level context switch code to an out-of-line asm stub instead of using complex inline asm. This allows constructing a new stack frame for the child process to make it seamlessly flow to ret_from_fork without an extra test and branch in __switch_to(). It also improves code generation for __schedule() by using the C calling convention instead of clobbering all registers. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-5-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0b56666..bf8f221 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -204,6 +204,43 @@ POP_GS_EX .endm +/* + * %eax: prev task + * %edx: next task + */ +ENTRY(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in struct inactive_task_frame + */ + pushl %ebp + pushl %ebx + pushl %edi + pushl %esi + + /* switch stack */ + movl %esp, TASK_threadsp(%eax) + movl TASK_threadsp(%edx), %esp + +#ifdef CONFIG_CC_STACKPROTECTOR + movl TASK_stack_canary(%edx), %ebx + movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset +#endif + + /* restore callee-saved registers */ + popl %esi + popl %edi + popl %ebx + popl %ebp + + jmp __switch_to +END(__switch_to_asm) + +/* + * A newly forked process directly context switches into this address. + * + * eax: prev task we switched from + */ ENTRY(ret_from_fork) pushl %eax call schedule_tail diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index f6b40e5..c1af8ac 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -368,13 +368,48 @@ END(ptregs_\func) #include /* + * %rdi: prev task + * %rsi: next task + */ +ENTRY(__switch_to_asm) + /* + * Save callee-saved registers + * This must match the order in inactive_task_frame + */ + pushq %rbp + pushq %rbx + pushq %r12 + pushq %r13 + pushq %r14 + pushq %r15 + + /* switch stack */ + movq %rsp, TASK_threadsp(%rdi) + movq TASK_threadsp(%rsi), %rsp + +#ifdef CONFIG_CC_STACKPROTECTOR + movq TASK_stack_canary(%rsi), %rbx + movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset +#endif + + /* restore callee-saved registers */ + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbx + popq %rbp + + jmp __switch_to +END(__switch_to_asm) + +/* * A newly forked process directly context switches into this address. * - * rdi: prev task we switched from + * rax: prev task we switched from */ ENTRY(ret_from_fork) - LOCK ; btr $TIF_FORK, TI_flags(%r8) - + movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ testb $3, CS(%rsp) /* from kernel_thread? 
*/ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 63def95..6fee863 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -389,9 +389,6 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif -#ifdef CONFIG_X86_32 - unsigned long ip; -#endif #ifdef CONFIG_X86_64 unsigned long fsbase; unsigned long gsbase; diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index ec689c6..886d5ea 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -2,8 +2,12 @@ #define _ASM_X86_SWITCH_TO_H struct task_struct; /* one of the stranger aspects of C forward declarations */ + +struct task_struct *__switch_to_asm(struct task_struct *prev, + struct task_struct *next); + __visible struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); + struct task_struct *next); struct tss_struct; void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, struct tss_struct *tss); @@ -32,131 +36,30 @@ static inline void prepare_switch_to(struct task_struct *prev, /* data that is pointed to by thread.sp */ struct inactive_task_frame { +#ifdef CONFIG_X86_64 + unsigned long r15; + unsigned long r14; + unsigned long r13; + unsigned long r12; +#else + unsigned long si; + unsigned long di; +#endif + unsigned long bx; unsigned long bp; + unsigned long ret_addr; }; -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movl %P[task_canary](%[next]), %%ebx\n\t" \ - "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" -#define __switch_canary_oparam \ - , [stack_canary] "=m" (stack_canary.canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ +struct fork_frame { + struct inactive_task_frame frame; + struct pt_regs regs; +}; -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. - */ #define switch_to(prev, next, last) \ do { \ - /* \ - * Context-switching clobbers all registers, so we clobber \ - * them explicitly, via unused output variables. 
\ - * (EAX and EBP is not listed because EBP is saved/restored \ - * explicitly for wchan access and EAX is the return value of \ - * __switch_to()) \ - */ \ - unsigned long ebx, ecx, edx, esi, edi; \ - \ prepare_switch_to(prev, next); \ \ - asm volatile("pushl %%ebp\n\t" /* save EBP */ \ - "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ - "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ - "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ - "pushl %[next_ip]\n\t" /* restore EIP */ \ - __switch_canary \ - "jmp __switch_to\n" /* regparm call */ \ - "1:\t" \ - "popl %%ebp\n\t" /* restore EBP */ \ - \ - /* output parameters */ \ - : [prev_sp] "=m" (prev->thread.sp), \ - [prev_ip] "=m" (prev->thread.ip), \ - "=a" (last), \ - \ - /* clobbered output registers: */ \ - "=b" (ebx), "=c" (ecx), "=d" (edx), \ - "=S" (esi), "=D" (edi) \ - \ - __switch_canary_oparam \ - \ - /* input parameters: */ \ - : [next_sp] "m" (next->thread.sp), \ - [next_ip] "m" (next->thread.ip), \ - \ - /* regparm parameters for __switch_to(): */ \ - [prev] "a" (prev), \ - [next] "d" (next) \ - \ - __switch_canary_iparam \ - \ - : /* reloaded segment registers */ \ - "memory"); \ + ((last) = __switch_to_asm((prev), (next))); \ } while (0) -#else /* CONFIG_X86_32 */ - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" - -#define __EXTRA_CLOBBER \ - , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15", "flags" - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movq %P[task_canary](%%rsi),%%r8\n\t" \ - "movq %%r8,"__percpu_arg([gs_canary])"\n\t" -#define __switch_canary_oparam \ - , [gs_canary] "=m" (irq_stack_union.stack_canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* - * There is no need to save or restore flags, because flags are always - * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL - * has no effect. 
- */ -#define switch_to(prev, next, last) \ - prepare_switch_to(prev, next); \ - \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - "movq "__percpu_arg([current_task])",%%rsi\n\t" \ - __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ - "jnz ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - __switch_canary_oparam \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (current_task) \ - __switch_canary_iparam \ - : "memory", "cc" __EXTRA_CLOBBER) - -#endif /* CONFIG_X86_32 */ - #endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8b7c8d8..494c4b5 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -95,7 +95,6 @@ struct thread_info { #define TIF_UPROBE 12 /* breakpointed or singlestepping */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ -#define TIF_FORK 18 /* ret_from_fork */ #define TIF_NOHZ 19 /* in adaptive nohz mode */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ @@ -119,7 +118,6 @@ struct thread_info { #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) -#define _TIF_FORK (1 << TIF_FORK) #define _TIF_NOHZ (1 << TIF_NOHZ) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 2bd5c6f..db3a0af 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -29,6 +29,12 @@ void common(void) { BLANK(); + OFFSET(TASK_threadsp, task_struct, thread.sp); +#ifdef CONFIG_CC_STACKPROTECTOR + OFFSET(TASK_stack_canary, task_struct, stack_canary); +#endif + + BLANK(); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index ecdc1d2..880aa09 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -57,6 +57,11 @@ void foo(void) /* Size of SYSENTER_stack */ DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); +#ifdef CONFIG_CC_STACKPROTECTOR + BLANK(); + OFFSET(stack_canary_offset, stack_canary, canary); +#endif + #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d875f97..210927e 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -56,6 +56,11 @@ int main(void) OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); +#ifdef CONFIG_CC_STACKPROTECTOR + DEFINE(stack_canary_offset, offsetof(union irq_stack_union, stack_canary)); + BLANK(); +#endif + DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); DEFINE(NR_syscalls, sizeof(syscalls_64)); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c 
index d86be29..4bedbc0 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -133,17 +133,20 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { struct pt_regs *childregs = task_pt_regs(p); + struct fork_frame *fork_frame = container_of(childregs, struct fork_frame, regs); + struct inactive_task_frame *frame = &fork_frame->frame; struct task_struct *tsk; int err; - p->thread.sp = (unsigned long) childregs; + frame->bp = 0; + p->thread.sp = (unsigned long) fork_frame; p->thread.sp0 = (unsigned long) (childregs+1); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - p->thread.ip = (unsigned long) ret_from_kernel_thread; + frame->ret_addr = (unsigned long) ret_from_kernel_thread; task_user_gs(p) = __KERNEL_STACK_CANARY; childregs->ds = __USER_DS; childregs->es = __USER_DS; @@ -161,7 +164,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (sp) childregs->sp = sp; - p->thread.ip = (unsigned long) ret_from_fork; + frame->ret_addr = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 63236d8..827eeed 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -141,12 +141,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, { int err; struct pt_regs *childregs; + struct fork_frame *fork_frame; + struct inactive_task_frame *frame; struct task_struct *me = current; p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); - p->thread.sp = (unsigned long) childregs; - set_tsk_thread_flag(p, TIF_FORK); + fork_frame = container_of(childregs, struct fork_frame, regs); + frame = &fork_frame->frame; + frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; + p->thread.sp = (unsigned long) fork_frame; p->thread.io_bitmap_ptr = NULL; savesegment(gs, p->thread.gsindex); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c85d2c6..7e52f83 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -942,7 +942,6 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle) per_cpu(cpu_current_top_of_stack, cpu) = (unsigned long)task_stack_page(idle) + THREAD_SIZE; #else - clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); #endif } -- cgit v0.10.2 From 616d24835eeafa8ef3466479db028abfdfc77531 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:20 -0400 Subject: sched/x86: Pass kernel thread parameters in 'struct fork_frame' Instead of setting up a fake pt_regs context, put the kernel thread function pointer and arg into the unused callee-restored registers of 'struct fork_frame'. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-6-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index bf8f221..b75a8bc 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -240,35 +240,34 @@ END(__switch_to_asm) * A newly forked process directly context switches into this address. 
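 * (Aside: copy_thread_tls() stores this address in the child's
 * inactive_task_frame->ret_addr, so the 'ret' executed at the end
 * of __switch_to(), running on the child's stack for the first
 * time, is what transfers control here.)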
* * eax: prev task we switched from + * ebx: kernel thread func (NULL for user thread) + * edi: kernel thread arg */ ENTRY(ret_from_fork) pushl %eax call schedule_tail popl %eax + testl %ebx, %ebx + jnz 1f /* kernel threads are uncommon */ + +2: /* When we fork, we trace the syscall return in the child, too. */ movl %esp, %eax call syscall_return_slowpath jmp restore_all -END(ret_from_fork) - -ENTRY(ret_from_kernel_thread) - pushl %eax - call schedule_tail - popl %eax - movl PT_EBP(%esp), %eax - call *PT_EBX(%esp) - movl $0, PT_EAX(%esp) + /* kernel thread */ +1: movl %edi, %eax + call *%ebx /* - * Kernel threads return to userspace as if returning from a syscall. - * We should check whether anything actually uses this path and, if so, - * consider switching it over to ret_from_fork. + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. */ - movl %esp, %eax - call syscall_return_slowpath - jmp restore_all -ENDPROC(ret_from_kernel_thread) + movl $0, PT_EAX(%esp) + jmp 2b +END(ret_from_fork) /* * Return to user mode is not as complex as all this looks, diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c1af8ac..c0373d6 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -407,37 +407,34 @@ END(__switch_to_asm) * A newly forked process directly context switches into this address. * * rax: prev task we switched from + * rbx: kernel thread func (NULL for user thread) + * r12: kernel thread arg */ ENTRY(ret_from_fork) movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ - testb $3, CS(%rsp) /* from kernel_thread? */ - jnz 1f - - /* - * We came from kernel_thread. This code path is quite twisted, and - * someone should clean it up. - * - * copy_thread_tls stashes the function pointer in RBX and the - * parameter to be passed in RBP. The called function is permitted - * to call do_execve and thereby jump to user mode. - */ - movq RBP(%rsp), %rdi - call *RBX(%rsp) - movl $0, RAX(%rsp) - - /* - * Fall through as though we're exiting a syscall. This makes a - * twisted sort of sense if we just called do_execve. - */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ -1: +2: movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS jmp restore_regs_and_iret + +1: + /* kernel thread */ + movq %r12, %rdi + call *%rbx + /* + * A kernel thread is allowed to return here after successfully + * calling do_execve(). Exit to userspace to complete the execve() + * syscall. + */ + movq $0, RAX(%rsp) + jmp 2b END(ret_from_fork) /* diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index 886d5ea..5cb436a 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -34,6 +34,8 @@ static inline void prepare_switch_to(struct task_struct *prev, #endif } +asmlinkage void ret_from_fork(void); + /* data that is pointed to by thread.sp */ struct inactive_task_frame { #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4bedbc0..18714a1 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,9 +55,6 @@ #include #include -asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); -asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); - /* * Return saved PC of a blocked thread. 
*/ @@ -139,6 +136,7 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, int err; frame->bp = 0; + frame->ret_addr = (unsigned long) ret_from_fork; p->thread.sp = (unsigned long) fork_frame; p->thread.sp0 = (unsigned long) (childregs+1); memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); @@ -146,25 +144,17 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - frame->ret_addr = (unsigned long) ret_from_kernel_thread; - task_user_gs(p) = __KERNEL_STACK_CANARY; - childregs->ds = __USER_DS; - childregs->es = __USER_DS; - childregs->fs = __KERNEL_PERCPU; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->di = arg; p->thread.io_bitmap_ptr = NULL; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; if (sp) childregs->sp = sp; - frame->ret_addr = (unsigned long) ret_from_fork; task_user_gs(p) = get_user_gs(current_pt_regs()); p->thread.io_bitmap_ptr = NULL; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 827eeed..b812cd0 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -50,8 +50,6 @@ #include #include -asmlinkage extern void ret_from_fork(void); - __visible DEFINE_PER_CPU(unsigned long, rsp_scratch); /* Prints also some state that isn't saved in the pt_regs */ @@ -165,15 +163,11 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp, if (unlikely(p->flags & PF_KTHREAD)) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->sp = (unsigned long)childregs; - childregs->ss = __KERNEL_DS; - childregs->bx = sp; /* function */ - childregs->bp = arg; - childregs->orig_ax = -1; - childregs->cs = __KERNEL_CS | get_kernel_rpl(); - childregs->flags = X86_EFLAGS_IF | X86_EFLAGS_FIXED; + frame->bx = sp; /* function */ + frame->r12 = arg; return 0; } + frame->bx = 0; *childregs = *current_pt_regs(); childregs->ax = 0; -- cgit v0.10.2 From ffcb043ba524d3fbd979a9dac2c9ce8ad352000d Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:21 -0400 Subject: sched/x86: Fix thread_saved_pc() thread_saved_pc() was using a completely bogus method to get the return address. Since switch_to() was previously inlined, there was no sane way to know where on the stack the return address was stored. Now with the frame of a sleeping thread well defined, this can be implemented correctly. Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-7-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 6fee863..b22fb5a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -721,8 +721,6 @@ static inline void spin_lock_prefetch(const void *x) .addr_limit = KERNEL_DS, \ } -extern unsigned long thread_saved_pc(struct task_struct *tsk); - /* * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. 
* This is necessary to guarantee that the entire "struct pt_regs" @@ -773,17 +771,13 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); .addr_limit = KERNEL_DS, \ } -/* - * Return saved PC of a blocked thread. - * What is this good for? it will be always the scheduler or ret_from_fork. - */ -#define thread_saved_pc(t) READ_ONCE_NOCHECK(*(unsigned long *)((t)->thread.sp - 8)) - #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ +extern unsigned long thread_saved_pc(struct task_struct *tsk); + extern void start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp); diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0115a4a..c1fa790 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -514,6 +514,17 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) } /* + * Return saved PC of a blocked thread. + * What is this good for? it will be always the scheduler or ret_from_fork. + */ +unsigned long thread_saved_pc(struct task_struct *tsk) +{ + struct inactive_task_frame *frame = + (struct inactive_task_frame *) READ_ONCE(tsk->thread.sp); + return READ_ONCE_NOCHECK(frame->ret_addr); +} + +/* * Called from fs/proc with a reference on @p to find the function * which called into schedule(). This needs to be done carefully * because the task might wake up and we might look at a stack diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 18714a1..404efdf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,14 +55,6 @@ #include #include -/* - * Return saved PC of a blocked thread. - */ -unsigned long thread_saved_pc(struct task_struct *tsk) -{ - return ((unsigned long *)tsk->thread.sp)[3]; -} - void __show_regs(struct pt_regs *regs, int all) { unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L; -- cgit v0.10.2 From 01175255fd8e3e993353a779f819ec8c0c59137e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sat, 13 Aug 2016 12:38:22 -0400 Subject: sched: Remove __schedule() non-standard frame annotation Now that the x86 switch_to() uses the standard C calling convention, the STACK_FRAME_NON_STANDARD() annotation is no longer needed. Suggested-by: Josh Poimboeuf Signed-off-by: Brian Gerst Reviewed-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1471106302-10159-8-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2a906f2..3d91b63dd 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3381,7 +3381,6 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } -STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ static inline void sched_submit_work(struct task_struct *tsk) { -- cgit v0.10.2 From 6271cfdfc0e4731b76921ef02fdd87409d71dfdf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 30 Aug 2016 17:27:57 -0700 Subject: x86/mm: Improve stack-overflow #PF handling If we get a page fault indicating kernel stack overflow, invoke handle_stack_overflow(). To prevent us from overflowing the stack again while handling the overflow (because we are likely to have very little stack space left), call handle_stack_overflow() on the double-fault stack. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6d6cf96b3fb9b4c9aa303817e1dc4de0c7c36487.1472603235.git.luto@kernel.org [ Minor edit. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c3496619..01fd0a7 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -117,6 +117,12 @@ extern void ist_exit(struct pt_regs *regs); extern void ist_begin_non_atomic(struct pt_regs *regs); extern void ist_end_non_atomic(void); +#ifdef CONFIG_VMAP_STACK +void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address); +#endif + /* Interrupts/Exceptions */ enum { X86_TRAP_DE = 0, /* 0, Divide-by-zero */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 907b4e4..bd4e3d4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -293,9 +293,9 @@ DO_ERROR(X86_TRAP_SS, SIGBUS, "stack segment", stack_segment) DO_ERROR(X86_TRAP_AC, SIGBUS, "alignment check", alignment_check) #ifdef CONFIG_VMAP_STACK -static void __noreturn handle_stack_overflow(const char *message, - struct pt_regs *regs, - unsigned long fault_address) +__visible void __noreturn handle_stack_overflow(const char *message, + struct pt_regs *regs, + unsigned long fault_address) { printk(KERN_EMERG "BUG: stack guard page was hit at %p (stack is %p..%p)\n", (void *)fault_address, current->stack, diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index dc80230..0b92fce 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -753,6 +753,38 @@ no_context(struct pt_regs *regs, unsigned long error_code, return; } +#ifdef CONFIG_VMAP_STACK + /* + * Stack overflow? During boot, we can fault near the initial + * stack in the direct map, but that's not an overflow -- check + * that we're in vmalloc space to avoid this. + */ + if (is_vmalloc_addr((void *)address) && + (((unsigned long)tsk->stack - 1 - address < PAGE_SIZE) || + address - ((unsigned long)tsk->stack + THREAD_SIZE) < PAGE_SIZE)) { + register void *__sp asm("rsp"); + unsigned long stack = this_cpu_read(orig_ist.ist[DOUBLEFAULT_STACK]) - sizeof(void *); + /* + * We're likely to be running with very little stack space + * left. It's plausible that we'd hit this condition but + * double-fault even before we get this far, in which case + * we're fine: the double-fault handler will deal with it. + * + * We don't want to make it all the way into the oops code + * and then double-fault, though, because we're likely to + * break the console driver and lose most of the stack dump. + */ + asm volatile ("movq %[stack], %%rsp\n\t" + "call handle_stack_overflow\n\t" + "1: jmp 1b" + : "+r" (__sp) + : "D" ("kernel stack overflow (page fault)"), + "S" (regs), "d" (address), + [stack] "rm" (stack)); + unreachable(); + } +#endif + /* * 32-bit: * -- cgit v0.10.2 From 9472fe7040bba45c6200858cbe40d643cf02bccb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 30 Aug 2016 08:04:15 -0700 Subject: virtio_console: Stop doing DMA on the stack virtio_console uses a small DMA buffer for control requests. Move that buffer into heap memory. Doing virtio DMA on the stack is normally okay on non-DMA-API virtio systems (which is currently most of them), but it breaks completely if the stack is virtually mapped (CONFIG_VMAP_STACK=y). Tested by typing both directions using picocom aimed at /dev/hvc0. Signed-off-by: Andy Lutomirski Reviewed-by: Amit Shah Cc: Linus Torvalds Cc: Michael S. 
Tsirkin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/0afe68f9b4be6c95af9e7672b07acd0274c26dfe.1472569320.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index d2406fe..5da47e26 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -165,6 +165,12 @@ struct ports_device { */ struct virtqueue *c_ivq, *c_ovq; + /* + * A control packet buffer for guest->host requests, protected + * by c_ovq_lock. + */ + struct virtio_console_control cpkt; + /* Array of per-port IO virtqueues */ struct virtqueue **in_vqs, **out_vqs; @@ -560,28 +566,29 @@ static ssize_t __send_control_msg(struct ports_device *portdev, u32 port_id, unsigned int event, unsigned int value) { struct scatterlist sg[1]; - struct virtio_console_control cpkt; struct virtqueue *vq; unsigned int len; if (!use_multiport(portdev)) return 0; - cpkt.id = cpu_to_virtio32(portdev->vdev, port_id); - cpkt.event = cpu_to_virtio16(portdev->vdev, event); - cpkt.value = cpu_to_virtio16(portdev->vdev, value); - vq = portdev->c_ovq; - sg_init_one(sg, &cpkt, sizeof(cpkt)); - spin_lock(&portdev->c_ovq_lock); - if (virtqueue_add_outbuf(vq, sg, 1, &cpkt, GFP_ATOMIC) == 0) { + + portdev->cpkt.id = cpu_to_virtio32(portdev->vdev, port_id); + portdev->cpkt.event = cpu_to_virtio16(portdev->vdev, event); + portdev->cpkt.value = cpu_to_virtio16(portdev->vdev, value); + + sg_init_one(sg, &portdev->cpkt, sizeof(struct virtio_console_control)); + + if (virtqueue_add_outbuf(vq, sg, 1, &portdev->cpkt, GFP_ATOMIC) == 0) { virtqueue_kick(vq); while (!virtqueue_get_buf(vq, &len) && !virtqueue_is_broken(vq)) cpu_relax(); } + spin_unlock(&portdev->c_ovq_lock); return 0; } -- cgit v0.10.2 From 1d723de7396c1c028a091a37b2211ff6892c7f52 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Sun, 28 Aug 2016 11:51:06 +0100 Subject: selftests/x86: Fix spelling mistake "preseve" -> "preserve" Trivial fix to spelling mistakes in printf messages. 
Signed-off-by: Colin Ian King Acked-by: Andy Lutomirski Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-kselftest@vger.kernel.org Link: http://lkml.kernel.org/r/20160828105106.9763-1-colin.king@canonical.com Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c index 4214567..b037ce9c 100644 --- a/tools/testing/selftests/x86/ptrace_syscall.c +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -147,7 +147,7 @@ static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *)) if (args.nr != getpid() || args.arg0 != 10 || args.arg1 != 11 || args.arg2 != 12 || args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) { - printf("[FAIL]\tgetpid() failed to preseve regs\n"); + printf("[FAIL]\tgetpid() failed to preserve regs\n"); nerrs++; } else { printf("[OK]\tgetpid() preserves regs\n"); @@ -162,7 +162,7 @@ static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *)) if (args.nr != 0 || args.arg0 != getpid() || args.arg1 != SIGUSR1 || args.arg2 != 12 || args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) { - printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preseve regs\n"); + printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preserve regs\n"); nerrs++; } else { printf("[OK]\tkill(getpid(), SIGUSR1) preserves regs\n"); -- cgit v0.10.2 From 019e579d395733d14097c2d29c8c43226dad1617 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:14 -0500 Subject: perf/x86: Check perf_callchain_store() error Add a check to perf_callchain_kernel() so that it returns early if the callchain entry array is already full. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/dce6d60bab08be2600efd90021d9b85620646161.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d0efb5c..c1319ac 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2277,7 +2277,8 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re return; } - perf_callchain_store(entry, regs->ip); + if (perf_callchain_store(entry, regs->ip)) + return; dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); } -- cgit v0.10.2 From 3e344a0db900757caaf0beeb749de4c7b59bfd60 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:15 -0500 Subject: oprofile/x86: Add regs->ip to oprofile trace dump_trace() doesn't add the interrupted instruction's address to the trace, so add it manually. This makes the profile more useful, and also makes it more consistent with what perf profiling does. 
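Both this change and the perf_callchain_store() check above follow the same bounded-recording pattern: store the interrupted instruction pointer first, then let the frame walk consume whatever depth budget remains, stopping as soon as the entry array fills. A minimal standalone sketch of that pattern (buffer size, fake addresses and helper names are illustrative, not kernel code):

#include <stdio.h>

#define MAX_DEPTH 4

struct callchain {
	unsigned long addrs[MAX_DEPTH];
	unsigned int nr;
};

/* Like perf_callchain_store(): returns nonzero once the array is full. */
static int callchain_store(struct callchain *c, unsigned long ip)
{
	if (c->nr >= MAX_DEPTH)
		return -1;
	c->addrs[c->nr++] = ip;
	return 0;
}

int main(void)
{
	struct callchain c = { .nr = 0 };
	unsigned long ip = 0xffffffff81000000UL;	/* stand-in for regs->ip */
	unsigned long ret;

	/* Record the interrupted instruction first; bail out early if full. */
	if (callchain_store(&c, ip))
		return 0;

	/* Then spend the remaining budget on (fake) return addresses. */
	for (ret = ip + 0x10; callchain_store(&c, ret) == 0; ret += 0x10)
		;

	for (unsigned int i = 0; i < c.nr; i++)
		printf("  [<0x%lx>]\n", c.addrs[i]);
	return 0;
}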
Signed-off-by: Josh Poimboeuf Acked-by: Robert Richter Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6c745a83dbd69fc6857ef9b2f8be0f011d775936.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index cb31a44..2ef6c8b 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -114,9 +114,16 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!user_mode(regs)) { unsigned long stack = kernel_stack_pointer(regs); - if (depth) - dump_trace(NULL, regs, (unsigned long *)stack, 0, - &backtrace_ops, &depth); + + if (!depth) + return; + + oprofile_add_trace(regs->ip); + if (!--depth) + return; + + dump_trace(NULL, regs, (unsigned long *)stack, 0, + &backtrace_ops, &depth); return; } -- cgit v0.10.2 From d438f5fda30ec087512355e405e9c8955d8bd337 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:16 -0500 Subject: x86/dumpstack: Make printk_stack_address() more generally useful Change printk_stack_address() to be useful when called by an unwinder outside the context of dump_trace(). Specifically: - printk_stack_address()'s 'data' argument is always used as the log level string. Make that explicit. - Call touch_nmi_watchdog(). Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9fbe0db05bacf66d337c162edbf61450d0cff1e2.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 01072e9..f0ddf85 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -26,10 +26,11 @@ int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; static void printk_stack_address(unsigned long address, int reliable, - void *data) + char *log_lvl) { + touch_nmi_watchdog(); printk("%s [<%p>] %s%pB\n", - (char *)data, (void *)address, reliable ? "" : "? ", + log_lvl, (void *)address, reliable ? "" : "? ", (void *)address); } @@ -148,7 +149,6 @@ static int print_trace_stack(void *data, char *name) */ static int print_trace_address(void *data, unsigned long addr, int reliable) { - touch_nmi_watchdog(); printk_stack_address(addr, reliable, data); return 0; } -- cgit v0.10.2 From 4b8afafbe743be1a81c96ddcd75b19c534d5e262 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:17 -0500 Subject: x86/dumpstack: Add get_stack_pointer() and get_frame_pointer() The various functions involved in dumping the stack all do similar things with regard to getting the stack pointer and the frame pointer based on the regs and task arguments. Create helper functions to do that instead. 
Signed-off-by: Josh Poimboeuf Reviewed-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f448914885a35f333fe04da1b97a6c2cc1f80974.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 7646fb2..3552f5e 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -50,36 +50,41 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) #else #define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) #endif #ifdef CONFIG_FRAME_POINTER -static inline unsigned long -stack_frame(struct task_struct *task, struct pt_regs *regs) +static inline unsigned long * +get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { - unsigned long bp; - if (regs) - return regs->bp; + return (unsigned long *)regs->bp; - if (task == current) { - /* Grab bp right from our regs */ - get_bp(bp); - return bp; - } + if (!task || task == current) + return __builtin_frame_address(0); - return ((struct inactive_task_frame *)task->thread.sp)->bp; + return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; } #else -static inline unsigned long -stack_frame(struct task_struct *task, struct pt_regs *regs) +static inline unsigned long * +get_frame_pointer(struct task_struct *task, struct pt_regs *regs) { - return 0; + return NULL; +} +#endif /* CONFIG_FRAME_POINTER */ + +static inline unsigned long * +get_stack_pointer(struct task_struct *task, struct pt_regs *regs) +{ + if (regs) + return (unsigned long *)kernel_stack_pointer(regs); + + if (!task || task == current) + return __builtin_frame_address(0); + + return (unsigned long *)task->thread.sp; } -#endif extern void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, @@ -106,7 +111,7 @@ static inline unsigned long caller_frame_pointer(void) { struct stack_frame *frame; - get_bp(frame); + frame = __builtin_frame_address(0); #ifdef CONFIG_FRAME_POINTER frame = frame->next_frame; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index f0ddf85..6d6f468 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -170,15 +170,14 @@ show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, void show_stack(struct task_struct *task, unsigned long *sp) { unsigned long bp = 0; - unsigned long stack; /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. */ if (!sp && (!task || task == current)) { - sp = &stack; - bp = stack_frame(current, NULL); + sp = get_stack_pointer(current, NULL); + bp = (unsigned long)get_frame_pointer(current, NULL); } show_stack_log_lvl(task, NULL, sp, bp, ""); diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 0967571..358fe1c 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -46,19 +46,9 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, int graph = 0; u32 *prev_esp; - if (!task) - task = current; - - if (!stack) { - unsigned long dummy; - - stack = &dummy; - if (task != current) - stack = (unsigned long *)task->thread.sp; - } - - if (!bp) - bp = stack_frame(task, regs); + task = task ? 
: current; + stack = stack ? : get_stack_pointer(task, regs); + bp = bp ? : (unsigned long)get_frame_pointer(task, regs); for (;;) { void *end_stack; @@ -95,14 +85,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack; int i; - if (sp == NULL) { - if (regs) - sp = (unsigned long *)regs->sp; - else if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 066eb5c7..7f3b806 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -151,25 +151,14 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, { const unsigned cpu = get_cpu(); unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); - unsigned long dummy; unsigned used = 0; int graph = 0; int done = 0; - if (!task) - task = current; + task = task ? : current; + stack = stack ? : get_stack_pointer(task, regs); + bp = bp ? : (unsigned long)get_frame_pointer(task, regs); - if (!stack) { - if (regs) - stack = (unsigned long *)regs->sp; - else if (task != current) - stack = (unsigned long *)task->thread.sp; - else - stack = &dummy; - } - - if (!bp) - bp = stack_frame(task, regs); /* * Print function call entries in all stacks, starting at the * current stack address. If the stacks consist of nested @@ -256,18 +245,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); - /* - * Debugging aid: "show_stack(NULL, NULL);" prints the - * back trace for this cpu: - */ - if (sp == NULL) { - if (regs) - sp = (unsigned long *)regs->sp; - else if (task) - sp = (unsigned long *)task->thread.sp; - else - sp = (unsigned long *)&sp; - } + sp = sp ? : get_stack_pointer(task, regs); stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { -- cgit v0.10.2 From 5a8ff54c260ecfed3de9b8d1272eb87826935df8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 24 Aug 2016 11:50:18 -0500 Subject: x86/dumpstack: Remove unnecessary stack pointer arguments When calling show_stack_log_lvl() or dump_trace() with a regs argument, providing a stack pointer or frame pointer is redundant. 
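The redundancy follows from the lookup order the new helpers implement: explicit regs win, then the currently running frame, then the sleeping task's saved stack pointer. A compilable sketch of that precedence (struct and field names here are stand-ins, not the kernel's; needs gcc or clang for __builtin_frame_address()):

#include <stdio.h>

struct pt_regs { unsigned long sp; };
struct task { unsigned long saved_sp; };

static unsigned long *get_stack_ptr(struct task *task, struct task *cur,
				    struct pt_regs *regs)
{
	if (regs)
		return (unsigned long *)regs->sp;	/* regs already locate the stack */
	if (!task || task == cur)
		return __builtin_frame_address(0);	/* running: use our own frame */
	return (unsigned long *)task->saved_sp;		/* sleeping: saved pointer */
}

int main(void)
{
	unsigned long word = 0;
	struct pt_regs regs = { .sp = (unsigned long)&word };
	struct task t = { .saved_sp = 0 };

	/* With regs supplied, a separately passed sp would add nothing. */
	printf("%p %p\n", (void *)&word, (void *)get_stack_ptr(&t, &t, &regs));
	return 0;
}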
Signed-off-by: Josh Poimboeuf Reviewed-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Brian Gerst Cc: Byungchul Park Cc: Frederic Weisbecker Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1694e2e955e3b9a73a3c3d5ba2634344014dd550.1472057064.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6d6f468..c6c6c39 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -185,7 +185,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, (unsigned long *)regs->sp, regs->bp, ""); + show_stack_log_lvl(current, regs, NULL, 0, ""); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 358fe1c..c533b8b 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -122,7 +122,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG); + show_stack_log_lvl(NULL, regs, NULL, 0, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 7f3b806..b243352 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -283,9 +283,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, void show_regs(struct pt_regs *regs) { int i; - unsigned long sp; - sp = regs->sp; show_regs_print_info(KERN_DEFAULT); __show_regs(regs, 1); @@ -300,8 +298,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(NULL, regs, (unsigned long *)sp, - 0, KERN_DEFAULT); + show_stack_log_lvl(NULL, regs, NULL, 0, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 2ef6c8b..d950f9e 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -113,8 +113,6 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode(regs)) { - unsigned long stack = kernel_stack_pointer(regs); - if (!depth) return; @@ -122,8 +120,7 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) if (!--depth) return; - dump_trace(NULL, regs, (unsigned long *)stack, 0, - &backtrace_ops, &depth); + dump_trace(NULL, regs, NULL, 0, &backtrace_ops, &depth); return; } -- cgit v0.10.2 From 1ef0199a1a698d82ecd39d11d1daa3f4ab006c75 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Sep 2016 15:05:50 -0700 Subject: selftests/x86/sigreturn: Use CX, not AX, as the scratch register RAX is handled specially in ESPFIX64. Use CX as our scratch register so that, if something goes wrong with RAX handling, we'll notice. Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/9ceeb24ea56925586c330dc46306f757ddea9fb5.1473717910.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/sigreturn.c b/tools/testing/selftests/x86/sigreturn.c index 8a577e7..246145b 100644 --- a/tools/testing/selftests/x86/sigreturn.c +++ b/tools/testing/selftests/x86/sigreturn.c @@ -106,7 +106,7 @@ asm (".pushsection .text\n\t" ".type int3, @function\n\t" ".align 4096\n\t" "int3:\n\t" - "mov %ss,%eax\n\t" + "mov %ss,%ecx\n\t" "int3\n\t" ".size int3, . - int3\n\t" ".align 4096, 0xcc\n\t" @@ -306,7 +306,7 @@ static volatile sig_atomic_t sig_corrupt_final_ss; #ifdef __x86_64__ # define REG_IP REG_RIP # define REG_SP REG_RSP -# define REG_AX REG_RAX +# define REG_CX REG_RCX struct selectors { unsigned short cs, gs, fs, ss; @@ -326,7 +326,7 @@ static unsigned short *csptr(ucontext_t *ctx) #else # define REG_IP REG_EIP # define REG_SP REG_ESP -# define REG_AX REG_EAX +# define REG_CX REG_ECX static greg_t *ssptr(ucontext_t *ctx) { @@ -457,10 +457,10 @@ static void sigusr1(int sig, siginfo_t *info, void *ctx_void) ctx->uc_mcontext.gregs[REG_IP] = sig_cs == code16_sel ? 0 : (unsigned long)&int3; ctx->uc_mcontext.gregs[REG_SP] = (unsigned long)0x8badf00d5aadc0deULL; - ctx->uc_mcontext.gregs[REG_AX] = 0; + ctx->uc_mcontext.gregs[REG_CX] = 0; memcpy(&requested_regs, &ctx->uc_mcontext.gregs, sizeof(gregset_t)); - requested_regs[REG_AX] = *ssptr(ctx); /* The asm code does this. */ + requested_regs[REG_CX] = *ssptr(ctx); /* The asm code does this. */ return; } @@ -482,7 +482,7 @@ static void sigtrap(int sig, siginfo_t *info, void *ctx_void) unsigned short ss; asm ("mov %%ss,%0" : "=r" (ss)); - greg_t asm_ss = ctx->uc_mcontext.gregs[REG_AX]; + greg_t asm_ss = ctx->uc_mcontext.gregs[REG_CX]; if (asm_ss != sig_ss && sig == SIGTRAP) { /* Sanity check failure. */ printf("[FAIL]\tSIGTRAP: ss = %hx, frame ss = %hx, ax = %llx\n", @@ -654,8 +654,8 @@ static int test_valid_sigreturn(int cs_bits, bool use_16bit_ss, int force_ss) #endif /* Sanity check on the kernel */ - if (i == REG_AX && requested_regs[i] != resulting_regs[i]) { - printf("[FAIL]\tAX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n", + if (i == REG_CX && requested_regs[i] != resulting_regs[i]) { + printf("[FAIL]\tCX (saved SP) mismatch: requested 0x%llx; got 0x%llx\n", (unsigned long long)requested_regs[i], (unsigned long long)resulting_regs[i]); nerrs++; -- cgit v0.10.2 From 85063fac1f72419eec4349621fe829b07f9acb1e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Sep 2016 15:05:51 -0700 Subject: x86/entry/64: Clean up and document espfix64 stack setup The espfix64 setup code was a bit inscrutable and contained an unnecessary push of RAX. Remove that push, update all the stack offsets to match, and document the whole mess. Reported-By: Borislav Petkov Signed-off-by: Andy Lutomirski Reviewed-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H.
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e5459eb10cf1175c8b36b840bc425f210d045f35.1473717910.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index c0373d6..e7fba58 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -586,27 +586,69 @@ native_irq_return_iret: #ifdef CONFIG_X86_ESPFIX64 native_irq_return_ldt: - pushq %rax - pushq %rdi + /* + * We are running with user GSBASE. All GPRs contain their user + * values. We have a percpu ESPFIX stack that is eight slots + * long (see ESPFIX_STACK_SIZE). espfix_waddr points to the bottom + * of the ESPFIX stack. + * + * We clobber RAX and RDI in this code. We stash RDI on the + * normal stack and RAX on the ESPFIX stack. + * + * The ESPFIX stack layout we set up looks like this: + * + * --- top of ESPFIX stack --- + * SS + * RSP + * RFLAGS + * CS + * RIP <-- RSP points here when we're done + * RAX <-- espfix_waddr points here + * --- bottom of ESPFIX stack --- + */ + + pushq %rdi /* Stash user RDI */ SWAPGS movq PER_CPU_VAR(espfix_waddr), %rdi - movq %rax, (0*8)(%rdi) /* RAX */ - movq (2*8)(%rsp), %rax /* RIP */ + movq %rax, (0*8)(%rdi) /* user RAX */ + movq (1*8)(%rsp), %rax /* user RIP */ movq %rax, (1*8)(%rdi) - movq (3*8)(%rsp), %rax /* CS */ + movq (2*8)(%rsp), %rax /* user CS */ movq %rax, (2*8)(%rdi) - movq (4*8)(%rsp), %rax /* RFLAGS */ + movq (3*8)(%rsp), %rax /* user RFLAGS */ movq %rax, (3*8)(%rdi) - movq (6*8)(%rsp), %rax /* SS */ + movq (5*8)(%rsp), %rax /* user SS */ movq %rax, (5*8)(%rdi) - movq (5*8)(%rsp), %rax /* RSP */ + movq (4*8)(%rsp), %rax /* user RSP */ movq %rax, (4*8)(%rdi) - andl $0xffff0000, %eax - popq %rdi + /* Now RAX == RSP. */ + + andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ + popq %rdi /* Restore user RDI */ + + /* + * espfix_stack[31:16] == 0. The page tables are set up such that + * (espfix_stack | (X & 0xffff0000)) points to a read-only alias of + * espfix_waddr for any X. That is, there are 65536 RO aliases of + * the same page. Set up RSP so that RSP[31:16] contains the + * respective 16 bits of the /userspace/ RSP and RSP nonetheless + * still points to an RO alias of the ESPFIX stack. + */ orq PER_CPU_VAR(espfix_stack), %rax SWAPGS movq %rax, %rsp - popq %rax + + /* + * At this point, we cannot write to the stack any more, but we can + * still read. + */ + popq %rax /* Restore user RAX */ + + /* + * RSP now points to an ordinary IRET frame, except that the page + * is read-only and RSP[31:16] are preloaded with the userspace + * values. We can now IRET back to userspace. + */ jmp native_irq_return_iret #endif END(common_interrupt) -- cgit v0.10.2 From cfeeed279dc2fa83a00fbe4856ebd231d56201ab Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Thu, 8 Sep 2016 16:49:20 -0500 Subject: x86/dumpstack: Allow preemption in show_stack_log_lvl() and dump_trace() show_stack_log_lvl() and dump_trace() are already preemption safe: - If they're running in irq or exception context, preemption is already disabled and the percpu stack pointers can be trusted. - If they're running with preemption enabled, they must be running on the task stack anyway, so it doesn't matter if they're comparing the stack pointer against a percpu stack pointer from this CPU or another one: either way it won't match. 
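The "either way it won't match" argument is easy to see in a toy model: a task-stack address lies outside every CPU's IRQ-stack window, so even if the task migrates between reading a percpu stack base and doing the comparison, the bounds check still answers false. A standalone sketch (array sizes and names are illustrative only):

#include <stdbool.h>
#include <stdio.h>

#define IRQ_WORDS 512

static unsigned long irq_stack_cpu0[IRQ_WORDS];	/* stand-ins for percpu IRQ stacks */
static unsigned long irq_stack_cpu1[IRQ_WORDS];
static unsigned long task_stack[IRQ_WORDS];

static bool in_window(unsigned long *p, unsigned long *begin, unsigned long *end)
{
	return p >= begin && p < end;
}

int main(void)
{
	unsigned long *sp = &task_stack[100];

	/* Whichever CPU's window we happen to compare against, the answer is 0. */
	printf("cpu0 window: %d\n", in_window(sp, irq_stack_cpu0, irq_stack_cpu0 + IRQ_WORDS));
	printf("cpu1 window: %d\n", in_window(sp, irq_stack_cpu1, irq_stack_cpu1 + IRQ_WORDS));
	return 0;
}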
Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0ca0b1044eca97d4f0ec7c1619cf80b3b65560d.1473371307.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c533b8b..da5cd62 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -24,16 +24,16 @@ static void *is_irq_stack(void *p, void *irq) } -static void *is_hardirq_stack(unsigned long *stack, int cpu) +static void *is_hardirq_stack(unsigned long *stack) { - void *irq = per_cpu(hardirq_stack, cpu); + void *irq = this_cpu_read(hardirq_stack); return is_irq_stack(stack, irq); } -static void *is_softirq_stack(unsigned long *stack, int cpu) +static void *is_softirq_stack(unsigned long *stack) { - void *irq = per_cpu(softirq_stack, cpu); + void *irq = this_cpu_read(softirq_stack); return is_irq_stack(stack, irq); } @@ -42,7 +42,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); int graph = 0; u32 *prev_esp; @@ -53,9 +52,9 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, for (;;) { void *end_stack; - end_stack = is_hardirq_stack(stack, cpu); + end_stack = is_hardirq_stack(stack); if (!end_stack) - end_stack = is_softirq_stack(stack, cpu); + end_stack = is_softirq_stack(stack); bp = ops->walk_stack(task, stack, bp, ops, data, end_stack, &graph); @@ -74,7 +73,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, break; touch_nmi_watchdog(); } - put_cpu(); } EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index b243352..07373be 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -31,8 +31,8 @@ static char x86_stack_ids[][8] = { #endif }; -static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, - unsigned *usedp, char **idp) +static unsigned long *in_exception_stack(unsigned long stack, unsigned *usedp, + char **idp) { unsigned k; @@ -41,7 +41,7 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, * 'stack' is in one of them: */ for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = per_cpu(orig_ist, cpu).ist[k]; + unsigned long end = raw_cpu_ptr(&orig_ist)->ist[k]; /* * Is 'stack' above this exception frame's end? * If yes then skip to the next frame. 
@@ -111,7 +111,7 @@ enum stack_type { }; static enum stack_type -analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, +analyze_stack(struct task_struct *task, unsigned long *stack, unsigned long **stack_end, unsigned long *irq_stack, unsigned *used, char **id) { @@ -121,8 +121,7 @@ analyze_stack(int cpu, struct task_struct *task, unsigned long *stack, if ((unsigned long)task_stack_page(task) == addr) return STACK_IS_NORMAL; - *stack_end = in_exception_stack(cpu, (unsigned long)stack, - used, id); + *stack_end = in_exception_stack((unsigned long)stack, used, id); if (*stack_end) return STACK_IS_EXCEPTION; @@ -149,8 +148,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - const unsigned cpu = get_cpu(); - unsigned long *irq_stack = (unsigned long *)per_cpu(irq_stack_ptr, cpu); + unsigned long *irq_stack = (unsigned long *)this_cpu_read(irq_stack_ptr); unsigned used = 0; int graph = 0; int done = 0; @@ -169,8 +167,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, enum stack_type stype; char *id; - stype = analyze_stack(cpu, task, stack, &stack_end, - irq_stack, &used, &id); + stype = analyze_stack(task, stack, &stack_end, irq_stack, &used, + &id); /* Default finish unless specified to continue */ done = 1; @@ -225,7 +223,6 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * This handles the process stack: */ bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph); - put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -236,13 +233,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *irq_stack_end; unsigned long *irq_stack; unsigned long *stack; - int cpu; int i; - preempt_disable(); - cpu = smp_processor_id(); - - irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); + irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); sp = sp ? : get_stack_pointer(task, regs); @@ -274,7 +267,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, stack++; touch_nmi_watchdog(); } - preempt_enable(); pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); -- cgit v0.10.2 From 9c00390757fd9f5851f7973b2f0e1e41550bb3b8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Sep 2016 21:07:41 -0500 Subject: x86/dumpstack: Simplify in_exception_stack() in_exception_stack() does some bad, bad things just so the unwinder can print different values for different areas of the debug exception stack. There's no need to clarify where exactly on the stack it is. Just print "#DB" and be done with it. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e91cb410169dd576678dd427c35efb716fd0cee1.1473905218.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 07373be..904fb46 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,83 +16,46 @@ #include +static char *exception_stack_names[N_EXCEPTION_STACKS] = { + [ DOUBLEFAULT_STACK-1 ] = "#DF", + [ NMI_STACK-1 ] = "NMI", + [ DEBUG_STACK-1 ] = "#DB", + [ MCE_STACK-1 ] = "#MC", +}; -#define N_EXCEPTION_STACKS_END \ - (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) - -static char x86_stack_ids[][8] = { - [ DEBUG_STACK-1 ] = "#DB", - [ NMI_STACK-1 ] = "NMI", - [ DOUBLEFAULT_STACK-1 ] = "#DF", - [ MCE_STACK-1 ] = "#MC", -#if DEBUG_STKSZ > EXCEPTION_STKSZ - [ N_EXCEPTION_STACKS ... - N_EXCEPTION_STACKS_END ] = "#DB[?]" -#endif +static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ }; static unsigned long *in_exception_stack(unsigned long stack, unsigned *usedp, char **idp) { + unsigned long begin, end; unsigned k; - /* - * Iterate over all exception stacks, and figure out whether - * 'stack' is in one of them: - */ + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + for (k = 0; k < N_EXCEPTION_STACKS; k++) { - unsigned long end = raw_cpu_ptr(&orig_ist)->ist[k]; - /* - * Is 'stack' above this exception frame's end? - * If yes then skip to the next frame. - */ - if (stack >= end) + end = raw_cpu_ptr(&orig_ist)->ist[k]; + begin = end - exception_stack_sizes[k]; + + if (stack < begin || stack >= end) continue; + /* - * Is 'stack' above this exception frame's start address? - * If yes then we found the right frame. - */ - if (stack >= end - EXCEPTION_STKSZ) { - /* - * Make sure we only iterate through an exception - * stack once. If it comes up for the second time - * then there's something wrong going on - just - * break out and return NULL: - */ - if (*usedp & (1U << k)) - break; - *usedp |= 1U << k; - *idp = x86_stack_ids[k]; - return (unsigned long *)end; - } - /* - * If this is a debug stack, and if it has a larger size than - * the usual exception stacks, then 'stack' might still - * be within the lower portion of the debug stack: + * Make sure we only iterate through an exception stack once. + * If it comes up for the second time then there's something + * wrong going on - just break and return NULL: */ -#if DEBUG_STKSZ > EXCEPTION_STKSZ - if (k == DEBUG_STACK - 1 && stack >= end - DEBUG_STKSZ) { - unsigned j = N_EXCEPTION_STACKS - 1; + if (*usedp & (1U << k)) + break; + *usedp |= 1U << k; - /* - * Black magic. A large debug stack is composed of - * multiple exception stack entries, which we - * iterate through now. 
Dont look: - */ - do { - ++j; - end -= EXCEPTION_STKSZ; - x86_stack_ids[j][4] = '1' + - (j - N_EXCEPTION_STACKS); - } while (stack < end - EXCEPTION_STKSZ); - if (*usedp & (1U << j)) - break; - *usedp |= 1U << j; - *idp = x86_stack_ids[j]; - return (unsigned long *)end; - } -#endif + *idp = exception_stack_names[k]; + return (unsigned long *)end; } + return NULL; } -- cgit v0.10.2 From cb76c93982404273d746f3ccd5085b47689099a8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Sep 2016 21:07:42 -0500 Subject: x86/dumpstack: Add get_stack_info() interface valid_stack_ptr() is buggy: it assumes that all stacks are of size THREAD_SIZE, which is not true for exception stacks. So the walk_stack() callbacks will need to know the location of the beginning of the stack as well as the end. Another issue is that in general the various features of a stack (type, size, next stack pointer, description string) are scattered around in various places throughout the stack dump code. Encapsulate all that information in a single place with a new stack_info struct and a get_stack_info() interface. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/8164dd0db96b7e6a279fa17ae5e6dc375eecb4a9.1473905218.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index c1319ac..477dc38 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -2251,7 +2251,7 @@ void arch_perf_update_userpage(struct perf_event *event, * callchain support */ -static int backtrace_stack(void *data, char *name) +static int backtrace_stack(void *data, const char *name) { return 0; } diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 3552f5e..780a83e 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -10,6 +10,39 @@ #include #include +enum stack_type { + STACK_TYPE_UNKNOWN, + STACK_TYPE_TASK, + STACK_TYPE_IRQ, + STACK_TYPE_SOFTIRQ, + STACK_TYPE_EXCEPTION, + STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, +}; + +struct stack_info { + enum stack_type type; + unsigned long *begin, *end, *next_sp; +}; + +bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info); + +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask); + +void stack_type_str(enum stack_type type, const char **begin, + const char **end); + +static inline bool on_stack(struct stack_info *info, void *addr, size_t len) +{ + void *begin = info->begin; + void *end = info->end; + + return (info->type != STACK_TYPE_UNKNOWN && + addr >= begin && addr < end && + addr + len > begin && addr + len <= end); +} + extern int kstack_depth_to_print; struct thread_info; @@ -20,27 +53,27 @@ typedef unsigned long (*walk_stack_t)(struct task_struct *task, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, + struct stack_info *info, int *graph); extern unsigned long print_context_stack(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); + struct stack_info *info, int *graph); extern unsigned long print_context_stack_bp(struct 
task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph); + struct stack_info *info, int *graph); /* Generic stack tracer with callbacks */ struct stacktrace_ops { int (*address)(void *data, unsigned long address, int reliable); /* On negative return stop dumping */ - int (*stack)(void *data, char *name); + int (*stack)(void *data, const char *name); walk_stack_t walk_stack; }; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c6c6c39..aa208e5 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -25,6 +25,23 @@ unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; +bool in_task_stack(unsigned long *stack, struct task_struct *task, + struct stack_info *info) +{ + unsigned long *begin = task_stack_page(task); + unsigned long *end = task_stack_page(task) + THREAD_SIZE; + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_TASK; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { @@ -46,24 +63,11 @@ void printk_address(unsigned long address) * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -static inline int valid_stack_ptr(struct task_struct *task, - void *p, unsigned int size, void *end) -{ - void *t = task_stack_page(task); - if (end) { - if (p < end && p >= (end-THREAD_SIZE)) - return 1; - else - return 0; - } - return p >= t && p < t + THREAD_SIZE - size; -} - unsigned long print_context_stack(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) + struct stack_info *info, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; @@ -75,7 +79,7 @@ print_context_stack(struct task_struct *task, PAGE_SIZE) stack = (unsigned long *)task_stack_page(task); - while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { + while (on_stack(info, stack, sizeof(*stack))) { unsigned long addr = *stack; if (__kernel_text_address(addr)) { @@ -114,12 +118,12 @@ unsigned long print_context_stack_bp(struct task_struct *task, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data, - unsigned long *end, int *graph) + struct stack_info *info, int *graph) { struct stack_frame *frame = (struct stack_frame *)bp; unsigned long *retp = &frame->return_address; - while (valid_stack_ptr(task, retp, sizeof(*retp), end)) { + while (on_stack(info, stack, sizeof(*stack) * 2)) { unsigned long addr = *retp; unsigned long real_addr; @@ -138,7 +142,7 @@ print_context_stack_bp(struct task_struct *task, } EXPORT_SYMBOL_GPL(print_context_stack_bp); -static int print_trace_stack(void *data, char *name) +static int print_trace_stack(void *data, const char *name) { printk("%s <%s> ", (char *)data, name); return 0; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index da5cd62..c92da5a 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,61 +16,117 @@ #include -static void *is_irq_stack(void *p, void *irq) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - if (p < irq || p >= (irq + THREAD_SIZE)) - return NULL; - return irq + THREAD_SIZE; + switch (type) { + case STACK_TYPE_IRQ: + case STACK_TYPE_SOFTIRQ: + *begin = "IRQ"; + *end = "EOI"; + 
break; + default: + *begin = NULL; + *end = NULL; + } } +static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) +{ + unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; + + /* + * See irq_32.c -- the next stack pointer is stored at the beginning of + * the stack. + */ + info->next_sp = (unsigned long *)*begin; + + return true; +} -static void *is_hardirq_stack(unsigned long *stack) +static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) { - void *irq = this_cpu_read(hardirq_stack); + unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); + unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); + + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_SOFTIRQ; + info->begin = begin; + info->end = end; + + /* + * The next stack pointer is stored at the beginning of the stack. + * See irq_32.c. + */ + info->next_sp = (unsigned long *)*begin; - return is_irq_stack(stack, irq); + return true; } -static void *is_softirq_stack(unsigned long *stack) +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) { - void *irq = this_cpu_read(softirq_stack); + if (!stack) + goto unknown; - return is_irq_stack(stack, irq); + task = task ? : current; + + if (in_task_stack(stack, task, info)) + return 0; + + if (task != current) + goto unknown; + + if (in_hardirq_stack(stack, info)) + return 0; + + if (in_softirq_stack(stack, info)) + return 0; + +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { + unsigned long visit_mask = 0; int graph = 0; - u32 *prev_esp; task = task ? : current; stack = stack ? : get_stack_pointer(task, regs); bp = bp ? 
: (unsigned long)get_frame_pointer(task, regs); for (;;) { - void *end_stack; + const char *begin_str, *end_str; + struct stack_info info; - end_stack = is_hardirq_stack(stack); - if (!end_stack) - end_stack = is_softirq_stack(stack); + if (get_stack_info(stack, task, &info, &visit_mask)) + break; - bp = ops->walk_stack(task, stack, bp, ops, data, - end_stack, &graph); + stack_type_str(info.type, &begin_str, &end_str); - /* Stop if not on irq stack */ - if (!end_stack) + if (begin_str && ops->stack(data, begin_str) < 0) break; - /* The previous esp is saved on the bottom of the stack */ - prev_esp = (u32 *)(end_stack - THREAD_SIZE); - stack = (unsigned long *)*prev_esp; - if (!stack) - break; + bp = ops->walk_stack(task, stack, bp, ops, data, &info, &graph); - if (ops->stack(data, "IRQ") < 0) + if (end_str && ops->stack(data, end_str) < 0) break; + + stack = info.next_sp; + touch_nmi_watchdog(); } } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 904fb46..41813ab 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -28,76 +28,109 @@ static unsigned long exception_stack_sizes[N_EXCEPTION_STACKS] = { [DEBUG_STACK - 1] = DEBUG_STKSZ }; -static unsigned long *in_exception_stack(unsigned long stack, unsigned *usedp, - char **idp) +void stack_type_str(enum stack_type type, const char **begin, const char **end) { - unsigned long begin, end; + BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); + + switch (type) { + case STACK_TYPE_IRQ: + *begin = "IRQ"; + *end = "EOI"; + break; + case STACK_TYPE_EXCEPTION ... STACK_TYPE_EXCEPTION_LAST: + *begin = exception_stack_names[type - STACK_TYPE_EXCEPTION]; + *end = "EOE"; + break; + default: + *begin = NULL; + *end = NULL; + } +} + +static bool in_exception_stack(unsigned long *stack, struct stack_info *info, + unsigned long *visit_mask) +{ + unsigned long *begin, *end; + struct pt_regs *regs; unsigned k; BUILD_BUG_ON(N_EXCEPTION_STACKS != 4); for (k = 0; k < N_EXCEPTION_STACKS; k++) { - end = raw_cpu_ptr(&orig_ist)->ist[k]; - begin = end - exception_stack_sizes[k]; + end = (unsigned long *)raw_cpu_ptr(&orig_ist)->ist[k]; + begin = end - (exception_stack_sizes[k] / sizeof(long)); + regs = (struct pt_regs *)end - 1; if (stack < begin || stack >= end) continue; /* - * Make sure we only iterate through an exception stack once. - * If it comes up for the second time then there's something - * wrong going on - just break and return NULL: + * Make sure we don't iterate through an exception stack more + * than once. If it comes up a second time then there's + * something wrong going on - just break out and report an + * unknown stack type. 
*/ - if (*usedp & (1U << k)) + if (*visit_mask & (1U << k)) break; - *usedp |= 1U << k; + *visit_mask |= 1U << k; - *idp = exception_stack_names[k]; - return (unsigned long *)end; + info->type = STACK_TYPE_EXCEPTION + k; + info->begin = begin; + info->end = end; + info->next_sp = (unsigned long *)regs->sp; + + return true; } - return NULL; + return false; } -static inline int -in_irq_stack(unsigned long *stack, unsigned long *irq_stack, - unsigned long *irq_stack_end) +static bool in_irq_stack(unsigned long *stack, struct stack_info *info) { - return (stack >= irq_stack && stack < irq_stack_end); -} + unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); + unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); -enum stack_type { - STACK_IS_UNKNOWN, - STACK_IS_NORMAL, - STACK_IS_EXCEPTION, - STACK_IS_IRQ, -}; + if (stack < begin || stack >= end) + return false; + + info->type = STACK_TYPE_IRQ; + info->begin = begin; + info->end = end; + + /* + * The next stack pointer is the first thing pushed by the entry code + * after switching to the irq stack. + */ + info->next_sp = (unsigned long *)*(end - 1); + + return true; +} -static enum stack_type -analyze_stack(struct task_struct *task, unsigned long *stack, - unsigned long **stack_end, unsigned long *irq_stack, - unsigned *used, char **id) +int get_stack_info(unsigned long *stack, struct task_struct *task, + struct stack_info *info, unsigned long *visit_mask) { - unsigned long addr; + if (!stack) + goto unknown; - addr = ((unsigned long)stack & (~(THREAD_SIZE - 1))); - if ((unsigned long)task_stack_page(task) == addr) - return STACK_IS_NORMAL; + task = task ? : current; + + if (in_task_stack(stack, task, info)) + return 0; - *stack_end = in_exception_stack((unsigned long)stack, used, id); - if (*stack_end) - return STACK_IS_EXCEPTION; + if (task != current) + goto unknown; - if (!irq_stack) - return STACK_IS_NORMAL; + if (in_exception_stack(stack, info, visit_mask)) + return 0; - *stack_end = irq_stack; - irq_stack -= (IRQ_STACK_SIZE / sizeof(long)); + if (in_irq_stack(stack, info)) + return 0; - if (in_irq_stack(stack, irq_stack, *stack_end)) - return STACK_IS_IRQ; + return 0; - return STACK_IS_UNKNOWN; +unknown: + info->type = STACK_TYPE_UNKNOWN; + return -EINVAL; } /* @@ -111,8 +144,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, const struct stacktrace_ops *ops, void *data) { - unsigned long *irq_stack = (unsigned long *)this_cpu_read(irq_stack_ptr); - unsigned used = 0; + unsigned long visit_mask = 0; + struct stack_info info; int graph = 0; int done = 0; @@ -126,57 +159,37 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * exceptions */ while (!done) { - unsigned long *stack_end; - enum stack_type stype; - char *id; + const char *begin_str, *end_str; - stype = analyze_stack(task, stack, &stack_end, irq_stack, &used, - &id); + get_stack_info(stack, task, &info, &visit_mask); /* Default finish unless specified to continue */ done = 1; - switch (stype) { + switch (info.type) { /* Break out early if we are on the thread stack */ - case STACK_IS_NORMAL: + case STACK_TYPE_TASK: break; - case STACK_IS_EXCEPTION: + case STACK_TYPE_IRQ: + case STACK_TYPE_EXCEPTION ... 
STACK_TYPE_EXCEPTION_LAST: + + stack_type_str(info.type, &begin_str, &end_str); - if (ops->stack(data, id) < 0) + if (ops->stack(data, begin_str) < 0) break; bp = ops->walk_stack(task, stack, bp, ops, - data, stack_end, &graph); - ops->stack(data, "EOE"); - /* - * We link to the next stack via the - * second-to-last pointer (index -2 to end) in the - * exception stack: - */ - stack = (unsigned long *) stack_end[-2]; - done = 0; - break; + data, &info, &graph); - case STACK_IS_IRQ: + ops->stack(data, end_str); - if (ops->stack(data, "IRQ") < 0) - break; - bp = ops->walk_stack(task, stack, bp, - ops, data, stack_end, &graph); - /* - * We link to the next stack (which would be - * the process stack normally) the last - * pointer (index -1 to end) in the IRQ stack: - */ - stack = (unsigned long *) (stack_end[-1]); - irq_stack = NULL; - ops->stack(data, "EOI"); + stack = info.next_sp; done = 0; break; - case STACK_IS_UNKNOWN: + default: ops->stack(data, "UNK"); break; } @@ -185,7 +198,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, /* * This handles the process stack: */ - bp = ops->walk_stack(task, stack, bp, ops, data, NULL, &graph); + bp = ops->walk_stack(task, stack, bp, ops, data, &info, &graph); } EXPORT_SYMBOL(dump_trace); diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 4738f5e..785aef1 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -9,7 +9,7 @@ #include #include -static int save_stack_stack(void *data, char *name) +static int save_stack_stack(void *data, const char *name) { return 0; } diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index d950f9e..7539148 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -17,7 +17,7 @@ #include #include -static int backtrace_stack(void *data, char *name) +static int backtrace_stack(void *data, const char *name) { /* Yes, we want all stacks */ return 0; -- cgit v0.10.2 From 5fe599e02e41550c59831613a11c8ae057897c29 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Sep 2016 21:07:43 -0500 Subject: x86/dumpstack: Add support for unwinding empty IRQ stacks When an interrupt happens in entry code while running on a software IRQ stack, and the IRQ stack was empty, regs->sp will contain the stack end address (e.g., irq_stack_ptr). If the regs are passed to dump_trace(), get_stack_info() will report STACK_TYPE_UNKNOWN, causing dump_trace() to return prematurely without trying to go to the next stack. Update the bounds checking for software interrupt stacks so that the ending address is now considered part of the stack. This means that it's now possible for the 'walk_stack' callbacks -- print_context_stack() and print_context_stack_bp() -- to be called with an empty stack. But that's fine; they're already prepared to deal with that due to their on_stack() checks. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5a5e5de92dcf11e8dc6b6e8e50ad7639d067830b.1473905218.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c92da5a..50076d4 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -35,7 +35,11 @@ static bool in_hardirq_stack(unsigned long *stack, struct stack_info *info) unsigned long *begin = (unsigned long *)this_cpu_read(hardirq_stack); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - if (stack < begin || stack >= end) + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; @@ -56,7 +60,11 @@ static bool in_softirq_stack(unsigned long *stack, struct stack_info *info) unsigned long *begin = (unsigned long *)this_cpu_read(softirq_stack); unsigned long *end = begin + (THREAD_SIZE / sizeof(long)); - if (stack < begin || stack >= end) + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_SOFTIRQ; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 41813ab..2e708af 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -90,7 +90,11 @@ static bool in_irq_stack(unsigned long *stack, struct stack_info *info) unsigned long *end = (unsigned long *)this_cpu_read(irq_stack_ptr); unsigned long *begin = end - (IRQ_STACK_SIZE / sizeof(long)); - if (stack < begin || stack >= end) + /* + * This is a software stack, so 'end' can be a valid stack pointer. + * It just means the stack is empty. + */ + if (stack < begin || stack > end) return false; info->type = STACK_TYPE_IRQ; -- cgit v0.10.2 From fcd709ef20a9d83bdb7524d27cd6719dac8690a0 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Wed, 14 Sep 2016 21:07:44 -0500 Subject: x86/dumpstack: Add recursion checking for all stacks in_exception_stack() has some recursion checking which makes sure the stack trace code never traverses a given exception stack more than once. This prevents an infinite loop if corruption somehow causes a stack's "next stack" pointer to point to itself (directly or indirectly). The recursion checking can be useful for other stacks in addition to the exception stack, so extend it to work for all stacks. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/95de5db4cfe111754845a5cef04e20630d01423f.1473905218.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 50076d4..2d65cfa 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -89,16 +89,32 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, task = task ? 
: current; if (in_task_stack(stack, task, info)) - return 0; + goto recursion_check; if (task != current) goto unknown; if (in_hardirq_stack(stack, info)) - return 0; + goto recursion_check; if (in_softirq_stack(stack, info)) - return 0; + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; + } + + return 0; unknown: info->type = STACK_TYPE_UNKNOWN; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 2e708af..8cb6004 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -47,8 +47,7 @@ void stack_type_str(enum stack_type type, const char **begin, const char **end) } } -static bool in_exception_stack(unsigned long *stack, struct stack_info *info, - unsigned long *visit_mask) +static bool in_exception_stack(unsigned long *stack, struct stack_info *info) { unsigned long *begin, *end; struct pt_regs *regs; @@ -64,16 +63,6 @@ static bool in_exception_stack(unsigned long *stack, struct stack_info *info, if (stack < begin || stack >= end) continue; - /* - * Make sure we don't iterate through an exception stack more - * than once. If it comes up a second time then there's - * something wrong going on - just break out and report an - * unknown stack type. - */ - if (*visit_mask & (1U << k)) - break; - *visit_mask |= 1U << k; - info->type = STACK_TYPE_EXCEPTION + k; info->begin = begin; info->end = end; @@ -119,16 +108,30 @@ int get_stack_info(unsigned long *stack, struct task_struct *task, task = task ? : current; if (in_task_stack(stack, task, info)) - return 0; + goto recursion_check; if (task != current) goto unknown; - if (in_exception_stack(stack, info, visit_mask)) - return 0; + if (in_exception_stack(stack, info)) + goto recursion_check; if (in_irq_stack(stack, info)) - return 0; + goto recursion_check; + + goto unknown; + +recursion_check: + /* + * Make sure we don't iterate through any given stack more than once. + * If it comes up a second time then there's something wrong going on: + * just break out and report an unknown stack type. + */ + if (visit_mask) { + if (*visit_mask & (1UL << info->type)) + goto unknown; + *visit_mask |= 1UL << info->type; + } return 0; -- cgit v0.10.2 From b9d989c7218ac922185d82ad46f3e58b27a4bea9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:21 -0700 Subject: x86/asm: Move the thread_info::status field to thread_struct Because sched.h and thread_info.h are a tangled mess, I turned in_compat_syscall() into a macro. If we had current_thread_struct() or similar and we could use it from thread_info.h, then this would be a bit cleaner. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ccc8a1b2f41f9c264a41f771bb4a6539a642ad72.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1433f6b..871bbf9 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -209,7 +209,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) * special case only applies after poking regs and before the * very next return to user mode. */ - ti->status &= ~(TS_COMPAT|TS_I386_REGS_POKED); + current->thread.status &= ~(TS_COMPAT|TS_I386_REGS_POKED); #endif user_enter_irqoff(); @@ -307,7 +307,7 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) unsigned int nr = (unsigned int)regs->orig_ax; #ifdef CONFIG_IA32_EMULATION - ti->status |= TS_COMPAT; + current->thread.status |= TS_COMPAT; #endif if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b22fb5a..984a7bf 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -389,6 +389,9 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif + + u32 status; /* thread synchronous flags */ + #ifdef CONFIG_X86_64 unsigned long fsbase; unsigned long gsbase; @@ -435,6 +438,15 @@ struct thread_struct { }; /* + * Thread-synchronous status. + * + * This is different from the flags in that nobody else + * ever touches our thread-synchronous status, so we don't + * have to worry about atomic accesses. + */ +#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ + +/* * Set IOPL bits in EFLAGS from given mask */ static inline void native_set_iopl_mask(unsigned mask) diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index 4e23dd1..e3c95e8 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -60,7 +60,7 @@ static inline long syscall_get_error(struct task_struct *task, * TS_COMPAT is set for 32-bit syscall entries and then * remains set until we return to user mode. */ - if (task_thread_info(task)->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (task->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) /* * Sign-extend the value so (int)-EFOO becomes (long)-EFOO * and will match correctly in comparisons. @@ -116,7 +116,7 @@ static inline void syscall_get_arguments(struct task_struct *task, unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task_thread_info(task)->status & TS_COMPAT) + if (task->thread.status & TS_COMPAT) switch (i) { case 0: if (!n--) break; @@ -177,7 +177,7 @@ static inline void syscall_set_arguments(struct task_struct *task, const unsigned long *args) { # ifdef CONFIG_IA32_EMULATION - if (task_thread_info(task)->status & TS_COMPAT) + if (task->thread.status & TS_COMPAT) switch (i) { case 0: if (!n--) break; @@ -234,18 +234,8 @@ static inline void syscall_set_arguments(struct task_struct *task, static inline int syscall_get_arch(void) { -#ifdef CONFIG_IA32_EMULATION - /* - * TS_COMPAT is set for 32-bit syscall entry and then - * remains set until we return to user mode. - * - * x32 tasks should be considered AUDIT_ARCH_X86_64. - */ - if (task_thread_info(current)->status & TS_COMPAT) - return AUDIT_ARCH_I386; -#endif - /* Both x32 and x86_64 are considered "64-bit". */ - return AUDIT_ARCH_X86_64; + /* x32 tasks should be considered AUDIT_ARCH_X86_64. */ + return in_ia32_syscall() ? 
AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; } #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 494c4b5..c9dcfe7 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -55,7 +55,6 @@ struct task_struct; struct thread_info { struct task_struct *task; /* main task structure */ __u32 flags; /* low level flags */ - __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ }; @@ -253,31 +252,17 @@ static inline int arch_within_stack_frames(const void * const stack, #endif -/* - * Thread-synchronous status. - * - * This is different from the flags in that nobody else - * ever touches our thread-synchronous status, so we don't - * have to worry about atomic accesses. - */ -#define TS_COMPAT 0x0002 /* 32bit syscall active (64BIT)*/ #ifdef CONFIG_COMPAT #define TS_I386_REGS_POKED 0x0004 /* regs poked by 32-bit ptracer */ #endif - #ifndef __ASSEMBLY__ -static inline bool in_ia32_syscall(void) -{ #ifdef CONFIG_X86_32 - return true; -#endif -#ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & TS_COMPAT) - return true; +#define in_ia32_syscall() true +#else +#define in_ia32_syscall() (IS_ENABLED(CONFIG_IA32_EMULATION) && \ + current->thread.status & TS_COMPAT) #endif - return false; -} /* * Force syscall return via IRET by making it look as if there was diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index db3a0af..add5f90 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -36,7 +36,6 @@ void common(void) { BLANK(); OFFSET(TI_flags, thread_info, flags); - OFFSET(TI_status, thread_info, status); BLANK(); OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 93982ae..2f2b8c7 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -317,7 +317,6 @@ static void __init fpu__init_system_ctx_switch(void) on_boot_cpu = 0; WARN_ON_FPU(current->thread.fpu.fpstate_active); - current_thread_info()->status = 0; if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE) eagerfpu = ENABLE; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b812cd0..de9acaf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -510,7 +510,7 @@ void set_personality_ia32(bool x32) current->personality &= ~READ_IMPLIES_EXEC; /* in_compat_syscall() uses the presence of the x32 syscall bit flag to determine compat status */ - current_thread_info()->status &= ~TS_COMPAT; + current->thread.status &= ~TS_COMPAT; } else { set_thread_flag(TIF_IA32); clear_thread_flag(TIF_X32); @@ -518,7 +518,7 @@ void set_personality_ia32(bool x32) current->mm->context.ia32_compat = TIF_IA32; current->personality |= force_personality32; /* Prepare the first "return" to user space */ - current_thread_info()->status |= TS_COMPAT; + current->thread.status |= TS_COMPAT; } } EXPORT_SYMBOL_GPL(set_personality_ia32); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 5b88a1b..ce94c38 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -934,7 +934,7 @@ static int putreg32(struct task_struct *child, unsigned regno, u32 value) */ regs->orig_ax = value; if (syscall_get_nr(child, regs) >= 0) - task_thread_info(child)->status |= TS_I386_REGS_POKED; + child->thread.status |= TS_I386_REGS_POKED; break; case offsetof(struct user32, regs.eflags): diff --git a/arch/x86/kernel/signal.c 
b/arch/x86/kernel/signal.c index 04cb321..da20ecb 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -783,7 +783,7 @@ static inline unsigned long get_nr_restart_syscall(const struct pt_regs *regs) * than the tracee. */ #ifdef CONFIG_IA32_EMULATION - if (current_thread_info()->status & (TS_COMPAT|TS_I386_REGS_POKED)) + if (current->thread.status & (TS_COMPAT|TS_I386_REGS_POKED)) return __NR_ia32_restart_syscall; #endif #ifdef CONFIG_X86_X32_ABI -- cgit v0.10.2 From 97245d00585d82540f4538cf72d92a1e853c7b0e Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 13 Sep 2016 14:29:22 -0700 Subject: x86/entry: Get rid of pt_regs_to_thread_info() It was a nice optimization while it lasted, but thread_info is moving and this optimization will no longer work. Quoting Linus: Oh Gods, Andy. That pt_regs_to_thread_info() thing made me want to do unspeakable acts on a poor innocent wax figure that looked _exactly_ like you. [ Changelog written by Andy. ] Signed-off-by: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6376aa81c68798cc81631673f52bd91a3e078944.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 871bbf9..bdd9cc5 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -31,13 +31,6 @@ #define CREATE_TRACE_POINTS #include -static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) -{ - unsigned long top_of_stack = - (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; - return (struct thread_info *)(top_of_stack - THREAD_SIZE); -} - #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. */ __visible inline void enter_from_user_mode(void) @@ -71,7 +64,7 @@ static long syscall_trace_enter(struct pt_regs *regs) { u32 arch = in_ia32_syscall() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned long ret = 0; bool emulated = false; u32 work; @@ -173,18 +166,17 @@ static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) /* Disable IRQs and retry */ local_irq_disable(); - cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags); + cached_flags = READ_ONCE(current_thread_info()->flags); if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) break; - } } /* Called with IRQs disabled. 
*/ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); u32 cached_flags; if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) @@ -247,7 +239,7 @@ static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) */ __visible inline void syscall_return_slowpath(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); u32 cached_flags = READ_ONCE(ti->flags); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -270,7 +262,7 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) #ifdef CONFIG_X86_64 __visible void do_syscall_64(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned long nr = regs->orig_ax; enter_from_user_mode(); @@ -303,7 +295,7 @@ __visible void do_syscall_64(struct pt_regs *regs) */ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { - struct thread_info *ti = pt_regs_to_thread_info(regs); + struct thread_info *ti = current_thread_info(); unsigned int nr = (unsigned int)regs->orig_ax; #ifdef CONFIG_IA32_EMULATION -- cgit v0.10.2 From d896fa20a70c9e596438728561e058a74ed3196b Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 13 Sep 2016 14:29:23 -0700 Subject: um/Stop conflating task_struct::stack with thread_info thread_info may move in the future, so use the accessors. [ Andy Lutomirski wrote this changelog message and changed "task_thread_info(child)->cpu" to "task_cpu(child)". ] Signed-off-by: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3439705d9838940cc82733a7335fa8c654c37db8.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index a7ef7b1..5766ead 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -194,7 +194,7 @@ int peek_user(struct task_struct *child, long addr, long data) static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + int err, n, cpu = task_cpu(child); struct user_i387_struct fpregs; err = save_i387_registers(userspace_pid[cpu], @@ -211,7 +211,7 @@ static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; + int n, cpu = task_cpu(child); struct user_i387_struct fpregs; n = copy_from_user(&fpregs, buf, sizeof(fpregs)); @@ -224,7 +224,7 @@ static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *c static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + int err, n, cpu = task_cpu(child); struct user_fxsr_struct fpregs; err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs); @@ -240,7 +240,7 @@ static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct * static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) { - int n, cpu = ((struct thread_info *) child->stack)->cpu; + int n, cpu = task_cpu(child); struct user_fxsr_struct fpregs; n = copy_from_user(&fpregs, buf, sizeof(fpregs)); -- cgit v0.10.2 From c65eacbe290b8141554c71b2c94489e73ade8c8d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:24 -0700 Subject: sched/core: Allow putting thread_info into task_struct If an arch opts in by setting CONFIG_THREAD_INFO_IN_TASK_STRUCT, then thread_info is defined as a single 'u32 flags' and is the first entry of task_struct. thread_info::task is removed (it serves no purpose if thread_info is embedded in task_struct), and thread_info::cpu gets its own slot in task_struct. This is heavily based on a patch written by Linus. Originally-from: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a0898196f0476195ca02713691a5037a14f2aac5.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/init_task.h b/include/linux/init_task.h index f8834f8..9c04d44 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -15,6 +15,8 @@ #include #include +#include + #ifdef CONFIG_SMP # define INIT_PUSHABLE_TASKS(tsk) \ .pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), @@ -183,12 +185,19 @@ extern struct task_group root_task_group; # define INIT_KASAN(tsk) #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK +# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk), +#else +# define INIT_TASK_TI(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. 
Base=0, limit=0x1fffff (=2MB) */ #define INIT_TASK(tsk) \ { \ + INIT_TASK_TI(tsk) \ .state = 0, \ .stack = init_stack, \ .usage = ATOMIC_INIT(2), \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 20f9f47..a287e8b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1458,6 +1458,13 @@ struct tlbflush_unmap_batch { }; struct task_struct { +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* + * For reasons of header soup (see current_thread_info()), this + * must be the first element of task_struct. + */ + struct thread_info thread_info; +#endif volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; atomic_t usage; @@ -1467,6 +1474,9 @@ struct task_struct { #ifdef CONFIG_SMP struct llist_node wake_entry; int on_cpu; +#ifdef CONFIG_THREAD_INFO_IN_TASK + unsigned int cpu; /* current CPU */ +#endif unsigned int wakee_flips; unsigned long wakee_flip_decay_ts; struct task_struct *last_wakee; @@ -2588,7 +2598,9 @@ extern void set_curr_task(int cpu, struct task_struct *p); void yield(void); union thread_union { +#ifndef CONFIG_THREAD_INFO_IN_TASK struct thread_info thread_info; +#endif unsigned long stack[THREAD_SIZE/sizeof(long)]; }; @@ -3076,10 +3088,26 @@ static inline void threadgroup_change_end(struct task_struct *tsk) cgroup_threadgroup_change_end(tsk); } -#ifndef __HAVE_THREAD_FUNCTIONS +#ifdef CONFIG_THREAD_INFO_IN_TASK + +static inline struct thread_info *task_thread_info(struct task_struct *task) +{ + return &task->thread_info; +} +static inline void *task_stack_page(const struct task_struct *task) +{ + return task->stack; +} +#define setup_thread_stack(new,old) do { } while(0) +static inline unsigned long *end_of_stack(const struct task_struct *task) +{ + return task->stack; +} + +#elif !defined(__HAVE_THREAD_FUNCTIONS) #define task_thread_info(task) ((struct thread_info *)(task)->stack) -#define task_stack_page(task) ((task)->stack) +#define task_stack_page(task) ((void *)(task)->stack) static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org) { @@ -3379,7 +3407,11 @@ static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) static inline unsigned int task_cpu(const struct task_struct *p) { +#ifdef CONFIG_THREAD_INFO_IN_TASK + return p->cpu; +#else return task_thread_info(p)->cpu; +#endif } static inline int task_node(const struct task_struct *p) diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 2b5b10e..e2d0fd8 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -13,6 +13,21 @@ struct timespec; struct compat_timespec; +#ifdef CONFIG_THREAD_INFO_IN_TASK +struct thread_info { + u32 flags; /* low level flags */ +}; + +#define INIT_THREAD_INFO(tsk) \ +{ \ + .flags = 0, \ +} +#endif + +#ifdef CONFIG_THREAD_INFO_IN_TASK +#define current_thread_info() ((struct thread_info *)current) +#endif + /* * System call restart block. */ diff --git a/init/Kconfig b/init/Kconfig index cac3f09..ec8d438 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -26,6 +26,13 @@ config IRQ_WORK config BUILDTIME_EXTABLE_SORT bool +config THREAD_INFO_IN_TASK + bool + help + Select this to move thread_info off the stack into task_struct. To + make this work, an arch will need to remove all thread_info fields + except flags and fix any runtime bugs. + menu "General setup" config BROKEN diff --git a/init/init_task.c b/init/init_task.c index ba0a7f36..11f83be1 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -22,5 +22,8 @@ EXPORT_SYMBOL(init_task); * Initial thread structure. 
Alignment of this is handled by a special * linker map entry. */ -union thread_union init_thread_union __init_task_data = - { INIT_THREAD_INFO(init_task) }; +union thread_union init_thread_union __init_task_data = { +#ifndef CONFIG_THREAD_INFO_IN_TASK + INIT_THREAD_INFO(init_task) +#endif +}; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index c64fc51..3655c96 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1000,7 +1000,11 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) * per-task data have been completed by this moment. */ smp_wmb(); +#ifdef CONFIG_THREAD_INFO_IN_TASK + p->cpu = cpu; +#else task_thread_info(p)->cpu = cpu; +#endif p->wake_cpu = cpu; #endif } -- cgit v0.10.2 From 15f4eae70d365bba26854c90b6002aaabb18c8aa Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 13 Sep 2016 14:29:25 -0700 Subject: x86: Move thread_info into task_struct Now that most of the thread_info users have been cleaned up, this is straightforward. Most of this code was written by Linus. Originally-from: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a50eab40abeaec9cb9a9e3cbdeafd32190206654.1473801993.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4c39728..2a83bc8 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -157,6 +157,7 @@ config X86 select SPARSE_IRQ select SRCU select SYSCTL_EXCEPTION_TRACE + select THREAD_INFO_IN_TASK select USER_STACKTRACE_SUPPORT select VIRT_TO_BUS select X86_DEV_DMA_OPS if X86_64 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index e7fba58..2b46384 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -179,7 +179,8 @@ GLOBAL(entry_SYSCALL_64_after_swapgs) * If we need to do entry work or if we guess we'll need to do * exit work, go straight to the slow path. 
*/ - testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + movq PER_CPU_VAR(current_task), %r11 + testl $_TIF_WORK_SYSCALL_ENTRY|_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz entry_SYSCALL64_slow_path entry_SYSCALL_64_fastpath: @@ -217,7 +218,8 @@ entry_SYSCALL_64_fastpath: */ DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + movq PER_CPU_VAR(current_task), %r11 + testl $_TIF_ALLWORK_MASK, TASK_TI_flags(%r11) jnz 1f LOCKDEP_SYS_EXIT @@ -370,6 +372,7 @@ END(ptregs_\func) /* * %rdi: prev task * %rsi: next task + * rsi: task we're switching to */ ENTRY(__switch_to_asm) /* diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index c9dcfe7..2aaca53 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -52,20 +52,6 @@ struct task_struct; #include #include -struct thread_info { - struct task_struct *task; /* main task structure */ - __u32 flags; /* low level flags */ - __u32 cpu; /* current CPU */ -}; - -#define INIT_THREAD_INFO(tsk) \ -{ \ - .task = &tsk, \ - .flags = 0, \ - .cpu = 0, \ -} - -#define init_thread_info (init_thread_union.thread_info) #define init_stack (init_thread_union.stack) #else /* !__ASSEMBLY__ */ @@ -157,11 +143,6 @@ struct thread_info { */ #ifndef __ASSEMBLY__ -static inline struct thread_info *current_thread_info(void) -{ - return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); -} - static inline unsigned long current_stack_pointer(void) { unsigned long sp; @@ -223,33 +204,6 @@ static inline int arch_within_stack_frames(const void * const stack, # define cpu_current_top_of_stack (cpu_tss + TSS_sp0) #endif -/* - * ASM operand which evaluates to a 'thread_info' address of - * the current task, if it is known that "reg" is exactly "off" - * bytes below the top of the stack currently. - * - * ( The kernel stack's size is known at build time, it is usually - * 2 or 4 pages, and the bottom of the kernel stack contains - * the thread_info structure. So to access the thread_info very - * quickly from assembly code we can calculate down from the - * top of the kernel stack to the bottom, using constant, - * build-time calculations only. ) - * - * For example, to fetch the current thread_info->flags value into %eax - * on x86-64 defconfig kernels, in syscall entry code where RSP is - * currently at exactly SIZEOF_PTREGS bytes away from the top of the - * stack: - * - * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax - * - * will translate to: - * - * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax - * - * which is below the current RSP by almost 16K. 
- */ -#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) - #endif #ifdef CONFIG_COMPAT diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index add5f90..c62e015 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -35,9 +35,7 @@ void common(void) { #endif BLANK(); - OFFSET(TI_flags, thread_info, flags); - - BLANK(); + OFFSET(TASK_TI_flags, task_struct, thread_info.flags); OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 4a79037..9ebd0b0 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -40,8 +40,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) if (user_mode(regs)) return; - if (regs->sp >= curbase + sizeof(struct thread_info) + - sizeof(struct pt_regs) + STACK_TOP_MARGIN && + if (regs->sp >= curbase + sizeof(struct pt_regs) + STACK_TOP_MARGIN && regs->sp <= curbase + THREAD_SIZE) return; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index c1fa790..0b9ed8e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -549,9 +549,7 @@ unsigned long get_wchan(struct task_struct *p) * PADDING * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING * stack - * ----------- bottom = start + sizeof(thread_info) - * thread_info - * ----------- start + * ----------- bottom = start * * The tasks stack pointer points at the location where the * framepointer is stored. The data on the stack is: @@ -562,7 +560,7 @@ unsigned long get_wchan(struct task_struct *p) */ top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; top -= 2 * sizeof(unsigned long); - bottom = start + sizeof(struct thread_info); + bottom = start; sp = READ_ONCE(p->thread.sp); if (sp < bottom || sp > top) -- cgit v0.10.2 From 4bf5beef578e46393f11eb69dda7d17a065e05ff Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 14 Sep 2016 11:41:59 +0200 Subject: iommu/amd: Don't put completion-wait semaphore on stack The semaphore used by the AMD IOMMU to signal command completion lived on the stack until now, which was safe as the driver busy-waited on the semaphore with IRQs disabled, so the stack can't go away under the driver. But the recently introduced vmap-based stacks break this as the physical address of the semaphore can't be determined easily anymore. The driver used the __pa() macro, but that only works in the direct-mapping. The result was Completion-Wait timeout errors seen by the IOMMU driver, breaking system boot. Since putting the semaphore on the stack is bad design anyway, move the semaphore into 'struct amd_iommu'. It is protected by the per-iommu lock and is now in the direct mapping again. This fixes the Completion-Wait timeout errors and makes AMD IOMMU systems boot again with vmap-based stacks enabled. Reported-by: Borislav Petkov Signed-off-by: Joerg Roedel Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index 96de97a..4025291 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -940,15 +940,13 @@ static void build_inv_irt(struct iommu_cmd *cmd, u16 devid) * Writes the command to the IOMMUs command buffer and informs the * hardware about the new command.
*/ -static int iommu_queue_command_sync(struct amd_iommu *iommu, - struct iommu_cmd *cmd, - bool sync) +static int __iommu_queue_command_sync(struct amd_iommu *iommu, + struct iommu_cmd *cmd, + bool sync) { u32 left, tail, head, next_tail; - unsigned long flags; again: - spin_lock_irqsave(&iommu->lock, flags); head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); @@ -957,15 +955,14 @@ again: if (left <= 2) { struct iommu_cmd sync_cmd; - volatile u64 sem = 0; int ret; - build_completion_wait(&sync_cmd, (u64)&sem); - copy_cmd_to_buffer(iommu, &sync_cmd, tail); + iommu->cmd_sem = 0; - spin_unlock_irqrestore(&iommu->lock, flags); + build_completion_wait(&sync_cmd, (u64)&iommu->cmd_sem); + copy_cmd_to_buffer(iommu, &sync_cmd, tail); - if ((ret = wait_on_sem(&sem)) != 0) + if ((ret = wait_on_sem(&iommu->cmd_sem)) != 0) return ret; goto again; @@ -976,9 +973,21 @@ again: /* We need to sync now to make sure all commands are processed */ iommu->need_sync = sync; + return 0; +} + +static int iommu_queue_command_sync(struct amd_iommu *iommu, + struct iommu_cmd *cmd, + bool sync) +{ + unsigned long flags; + int ret; + + spin_lock_irqsave(&iommu->lock, flags); + ret = __iommu_queue_command_sync(iommu, cmd, sync); spin_unlock_irqrestore(&iommu->lock, flags); - return 0; + return ret; } static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) @@ -993,19 +1002,29 @@ static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) static int iommu_completion_wait(struct amd_iommu *iommu) { struct iommu_cmd cmd; - volatile u64 sem = 0; + unsigned long flags; int ret; if (!iommu->need_sync) return 0; - build_completion_wait(&cmd, (u64)&sem); - ret = iommu_queue_command_sync(iommu, &cmd, false); + build_completion_wait(&cmd, (u64)&iommu->cmd_sem); + + spin_lock_irqsave(&iommu->lock, flags); + + iommu->cmd_sem = 0; + + ret = __iommu_queue_command_sync(iommu, &cmd, false); if (ret) - return ret; + goto out_unlock; + + ret = wait_on_sem(&iommu->cmd_sem); - return wait_on_sem(&sem); +out_unlock: + spin_unlock_irqrestore(&iommu->lock, flags); + + return ret; } static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid) diff --git a/drivers/iommu/amd_iommu_types.h b/drivers/iommu/amd_iommu_types.h index caf5e38..9652848 100644 --- a/drivers/iommu/amd_iommu_types.h +++ b/drivers/iommu/amd_iommu_types.h @@ -524,6 +524,8 @@ struct amd_iommu { struct irq_domain *ir_domain; struct irq_domain *msi_domain; #endif + + volatile u64 __aligned(8) cmd_sem; }; #define ACPIHID_UID_LEN 256 -- cgit v0.10.2 From ff0071c03684485495e06f3936399eb9c93141a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:42 -0700 Subject: x86/entry/64: Fix a minor comment rebase error When I rebased my thread_info changes onto Brian's switch_to() changes, I carefully checked that I fixed up all the code correctly, but I missed a comment :( Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 15f4eae70d36 ("x86: Move thread_info into task_struct") Link: http://lkml.kernel.org/r/089fe1e1cbe8b258b064fccbb1a5a5fd23861031.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 2b46384..80ab68a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -372,7 +372,6 @@ END(ptregs_\func) /* * %rdi: prev task * %rsi: next task - * rsi: task we're switching to */ ENTRY(__switch_to_asm) /* -- cgit v0.10.2 From c6c314a613cd7d03fb97713e0d642b493de42e69 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:43 -0700 Subject: sched/core: Add try_get_task_stack() and put_task_stack() There are a few places in the kernel that access stack memory belonging to a different task. Before we can start freeing task stacks before the task_struct is freed, we need a way for those code paths to pin the stack. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/17a434f50ad3d77000104f21666575e10a9c1fbd.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/sched.h b/include/linux/sched.h index a287e8b..a958672 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3094,11 +3094,19 @@ static inline struct thread_info *task_thread_info(struct task_struct *task) { return &task->thread_info; } + +/* + * When accessing the stack of a non-current task that might exit, use + * try_get_task_stack() instead. task_stack_page will return a pointer + * that could get freed out from under you. + */ static inline void *task_stack_page(const struct task_struct *task) { return task->stack; } + #define setup_thread_stack(new,old) do { } while(0) + static inline unsigned long *end_of_stack(const struct task_struct *task) { return task->stack; @@ -3134,6 +3142,14 @@ static inline unsigned long *end_of_stack(struct task_struct *p) } #endif + +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return task_stack_page(tsk); +} + +static inline void put_task_stack(struct task_struct *tsk) {} + #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) diff --git a/init/Kconfig b/init/Kconfig index ec8d438..3b9a47f 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -33,6 +33,9 @@ config THREAD_INFO_IN_TASK make this work, an arch will need to remove all thread_info fields except flags and fix any runtime bugs. + One subtle change that will be needed is to use try_get_task_stack() + and put_task_stack() in save_thread_stack_tsk() and get_wchan(). + menu "General setup" config BROKEN -- cgit v0.10.2 From 23196f2e5f5d810578a772785807dcdc2b9fdce9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 15 Sep 2016 22:45:44 -0700 Subject: kthread: Pin the stack via try_get_task_stack()/put_task_stack() in to_live_kthread() function get_task_struct(tsk) no longer pins tsk->stack so all users of to_live_kthread() should do try_get_task_stack/put_task_stack to protect "struct kthread" which lives on kthread's stack. TODO: Kill to_live_kthread(), perhaps we can even kill "struct kthread" too, and rework kthread_stop(), it can use task_work_add() to sync with the exiting kernel thread. 
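To make the conversion pattern concrete, here is a minimal, hypothetical sketch of how a caller that inspects another task's stack uses the new interface. The inspect_task() helper below is illustrative only and not part of this series; try_get_task_stack() and put_task_stack() are the helpers introduced above:

    #include <linux/sched.h>

    /* Hypothetical caller: safely look at another task's stack. */
    static void inspect_task(struct task_struct *tsk)
    {
            void *stack = try_get_task_stack(tsk);

            /*
             * Once early stack freeing is enabled later in this series
             * (CONFIG_THREAD_INFO_IN_TASK), try_get_task_stack() returns
             * NULL if the task has exited and its stack is already gone.
             */
            if (!stack)
                    return;

            /* It is now safe to dereference memory on tsk's stack. */

            /* Drop the pin; the final reference frees the stack. */
            put_task_stack(tsk);
    }
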
Message-Id: <20160629180357.GA7178@redhat.com> Signed-off-by: Oleg Nesterov Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cb9b16bbc19d4aea4507ab0552e4644c1211d130.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/kthread.c b/kernel/kthread.c index 9ff173d..4ab4c37 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -64,7 +64,7 @@ static inline struct kthread *to_kthread(struct task_struct *k) static struct kthread *to_live_kthread(struct task_struct *k) { struct completion *vfork = ACCESS_ONCE(k->vfork_done); - if (likely(vfork)) + if (likely(vfork) && try_get_task_stack(k)) return __to_kthread(vfork); return NULL; } @@ -425,8 +425,10 @@ void kthread_unpark(struct task_struct *k) { struct kthread *kthread = to_live_kthread(k); - if (kthread) + if (kthread) { __kthread_unpark(k, kthread); + put_task_stack(k); + } } EXPORT_SYMBOL_GPL(kthread_unpark); @@ -455,6 +457,7 @@ int kthread_park(struct task_struct *k) wait_for_completion(&kthread->parked); } } + put_task_stack(k); ret = 0; } return ret; @@ -490,6 +493,7 @@ int kthread_stop(struct task_struct *k) __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); + put_task_stack(k); } ret = k->exit_code; put_task_struct(k); -- cgit v0.10.2 From 1959a60182f48879635812a03a99c02231ea8677 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:45 -0700 Subject: x86/dumpstack: Pin the target stack when dumping it Specifically, pin the stack in save_stack_trace_tsk() and show_trace_log_lvl(). This will prevent a crash if the target task dies before or while dumping its stack once we start freeing task stacks early. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/cf0082cde65d1941a996d026f2b2cdbfaca17bfa.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 2d65cfa..122f37d7 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -163,6 +163,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack; int i; + if (!try_get_task_stack(task)) + return; + sp = sp ? 
: get_stack_pointer(task, regs); stack = sp; @@ -179,6 +182,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); + + put_task_stack(task); } diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 8cb6004..16c0d5f 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -218,6 +218,9 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack; int i; + if (!try_get_task_stack(task)) + return; + irq_stack_end = (unsigned long *)this_cpu_read(irq_stack_ptr); irq_stack = irq_stack_end - (IRQ_STACK_SIZE / sizeof(long)); @@ -253,6 +256,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, pr_cont("\n"); show_trace_log_lvl(task, regs, sp, bp, log_lvl); + + put_task_stack(task); } void show_regs(struct pt_regs *regs) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 785aef1..23fa81e 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -79,9 +79,14 @@ void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) { + if (!try_get_task_stack(tsk)) + return; + dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); if (trace->nr_entries < trace->max_entries) trace->entries[trace->nr_entries++] = ULONG_MAX; + + put_task_stack(tsk); } EXPORT_SYMBOL_GPL(save_stack_trace_tsk); -- cgit v0.10.2 From 74327a3e884a0ff895ba7b51d3488e6a177407b2 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:46 -0700 Subject: x86/process: Pin the target stack in get_wchan() This will prevent a crash if get_wchan() runs after the task stack is freed. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/337aeca8614024aa4d8d9c81053bbf8fcffbe4ad.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0b9ed8e..4002b47 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -532,15 +532,18 @@ unsigned long thread_saved_pc(struct task_struct *tsk) */ unsigned long get_wchan(struct task_struct *p) { - unsigned long start, bottom, top, sp, fp, ip; + unsigned long start, bottom, top, sp, fp, ip, ret = 0; int count = 0; if (!p || p == current || p->state == TASK_RUNNING) return 0; + if (!try_get_task_stack(p)) + return 0; + start = (unsigned long)task_stack_page(p); if (!start) - return 0; + goto out; /* * Layout of the stack page: @@ -564,16 +567,21 @@ unsigned long get_wchan(struct task_struct *p) sp = READ_ONCE(p->thread.sp); if (sp < bottom || sp > top) - return 0; + goto out; fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp); do { if (fp < bottom || fp > top) - return 0; + goto out; ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long))); - if (!in_sched_functions(ip)) - return ip; + if (!in_sched_functions(ip)) { + ret = ip; + goto out; + } fp = READ_ONCE_NOCHECK(*(unsigned long *)fp); } while (count++ < 16 && p->state != TASK_RUNNING); - return 0; + +out: + put_task_stack(p); + return ret; } -- cgit v0.10.2 From aa1f1a639621672b68f654dc815a7d8298ff396f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:47 -0700 Subject: lib/syscall: Pin the task stack in collect_syscall() This will avoid a potential read-after-free if collect_syscall() (e.g. /proc/PID/syscall) is called on an exiting task. Reported-by: Jann Horn Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/0bfd8e6d4729c97745d3781a29610a33d0a8091d.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/lib/syscall.c b/lib/syscall.c index e30e039..63239e0 100644 --- a/lib/syscall.c +++ b/lib/syscall.c @@ -7,9 +7,19 @@ static int collect_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc) { - struct pt_regs *regs = task_pt_regs(target); - if (unlikely(!regs)) + struct pt_regs *regs; + + if (!try_get_task_stack(target)) { + /* Task has no stack, so the task isn't in a syscall. */ + *callno = -1; + return 0; + } + + regs = task_pt_regs(target); + if (unlikely(!regs)) { + put_task_stack(target); return -EAGAIN; + } *sp = user_stack_pointer(regs); *pc = instruction_pointer(regs); @@ -18,6 +28,7 @@ static int collect_syscall(struct task_struct *target, long *callno, if (*callno != -1L && maxargs > 0) syscall_get_arguments(target, regs, 0, maxargs, args); + put_task_stack(target); return 0; } -- cgit v0.10.2 From 68f24b08ee892d47bdef925d676e1ae1ccc316f8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:48 -0700 Subject: sched/core: Free the stack early if CONFIG_THREAD_INFO_IN_TASK We currently keep every task's stack around until the task_struct itself is freed. This means that we keep the stack allocation alive for longer than necessary and that, under load, we free stacks in big batches whenever RCU drops the last task reference. 
Neither of these is good for reuse of cache-hot memory, and freeing in batches prevents us from usefully caching small numbers of vmalloced stacks. On architectures that have thread_info on the stack, we can't easily change this, but on architectures that set THREAD_INFO_IN_TASK, we can free it as soon as the task is dead. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/08ca06cde00ebed0046c5d26cbbf3fbb7ef5b812.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 9c04d44..325f649 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -186,7 +186,9 @@ extern struct task_group root_task_group; #endif #ifdef CONFIG_THREAD_INFO_IN_TASK -# define INIT_TASK_TI(tsk) .thread_info = INIT_THREAD_INFO(tsk), +# define INIT_TASK_TI(tsk) \ + .thread_info = INIT_THREAD_INFO(tsk), \ + .stack_refcount = ATOMIC_INIT(1), #else # define INIT_TASK_TI(tsk) #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index a958672..abb795a 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1936,6 +1936,10 @@ struct task_struct { #ifdef CONFIG_VMAP_STACK struct vm_struct *stack_vm_area; #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + /* A live task holds one reference. */ + atomic_t stack_refcount; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* @@ -3143,12 +3147,22 @@ static inline unsigned long *end_of_stack(struct task_struct *p) #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK +static inline void *try_get_task_stack(struct task_struct *tsk) +{ + return atomic_inc_not_zero(&tsk->stack_refcount) ? + task_stack_page(tsk) : NULL; +} + +extern void put_task_stack(struct task_struct *tsk); +#else static inline void *try_get_task_stack(struct task_struct *tsk) { return task_stack_page(tsk); } static inline void put_task_stack(struct task_struct *tsk) {} +#endif #define task_stack_end_corrupted(task) \ (*(end_of_stack(task)) != STACK_END_MAGIC) diff --git a/kernel/fork.c b/kernel/fork.c index 0c240fd..5dd0a51 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -269,11 +269,40 @@ static void account_kernel_stack(struct task_struct *tsk, int account) } } -void free_task(struct task_struct *tsk) +static void release_task_stack(struct task_struct *tsk) { account_kernel_stack(tsk, -1); arch_release_thread_stack(tsk->stack); free_thread_stack(tsk); + tsk->stack = NULL; +#ifdef CONFIG_VMAP_STACK + tsk->stack_vm_area = NULL; +#endif +} + +#ifdef CONFIG_THREAD_INFO_IN_TASK +void put_task_stack(struct task_struct *tsk) +{ + if (atomic_dec_and_test(&tsk->stack_refcount)) + release_task_stack(tsk); +} +#endif + +void free_task(struct task_struct *tsk) +{ +#ifndef CONFIG_THREAD_INFO_IN_TASK + /* + * The task is finally done with both the stack and thread_info, + * so free both. + */ + release_task_stack(tsk); +#else + /* + * If the task had a separate stack allocation, it should be gone + * by now. 
+ */ + WARN_ON_ONCE(atomic_read(&tsk->stack_refcount) != 0); +#endif rt_mutex_debug_task_free(tsk); ftrace_graph_exit_task(tsk); put_seccomp_filter(tsk); @@ -411,6 +440,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) #ifdef CONFIG_VMAP_STACK tsk->stack_vm_area = stack_vm_area; #endif +#ifdef CONFIG_THREAD_INFO_IN_TASK + atomic_set(&tsk->stack_refcount, 1); +#endif if (err) goto free_stack; @@ -1771,6 +1803,7 @@ bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); exit_creds(p); bad_fork_free: + put_task_stack(p); free_task(p); fork_out: return ERR_PTR(retval); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0b6238f..23c6037 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2772,6 +2772,10 @@ static struct rq *finish_task_switch(struct task_struct *prev) * task and put them back on the free list. */ kprobe_flush_task(prev); + + /* Task is done with its stack. */ + put_task_stack(prev); + put_task_struct(prev); } -- cgit v0.10.2 From ac496bf48d97f2503eaa353996a4dd5e4383eaf0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 15 Sep 2016 22:45:49 -0700 Subject: fork: Optimize task creation by caching two thread stacks per CPU if CONFIG_VMAP_STACK=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit vmalloc() is a bit slow, and pounding vmalloc()/vfree() will eventually force a global TLB flush. To reduce pressure on them, if CONFIG_VMAP_STACK=y, cache two thread stacks per CPU. This will let us quickly allocate a hopefully cache-hot, TLB-hot stack under heavy forking workloads (shell script style). On my silly pthread_create() benchmark, it saves about 2 µs per pthread_create()+join() with CONFIG_VMAP_STACK=y. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jann Horn Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/94811d8e3994b2e962f88866290017d498eb069c.1474003868.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/fork.c b/kernel/fork.c index 5dd0a51..c060c7e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -159,15 +159,41 @@ void __weak arch_release_thread_stack(unsigned long *stack) * kmemcache based allocator. */ # if THREAD_SIZE >= PAGE_SIZE || defined(CONFIG_VMAP_STACK) + +#ifdef CONFIG_VMAP_STACK +/* + * vmalloc() is a bit slow, and calling vfree() enough times will force a TLB + * flush. Try to minimize the number of calls by caching stacks. 
+ */ +#define NR_CACHED_STACKS 2 +static DEFINE_PER_CPU(struct vm_struct *, cached_stacks[NR_CACHED_STACKS]); +#endif + static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { #ifdef CONFIG_VMAP_STACK - void *stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, - VMALLOC_START, VMALLOC_END, - THREADINFO_GFP | __GFP_HIGHMEM, - PAGE_KERNEL, - 0, node, - __builtin_return_address(0)); + void *stack; + int i; + + local_irq_disable(); + for (i = 0; i < NR_CACHED_STACKS; i++) { + struct vm_struct *s = this_cpu_read(cached_stacks[i]); + + if (!s) + continue; + this_cpu_write(cached_stacks[i], NULL); + + tsk->stack_vm_area = s; + local_irq_enable(); + return s->addr; + } + local_irq_enable(); + + stack = __vmalloc_node_range(THREAD_SIZE, THREAD_SIZE, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP | __GFP_HIGHMEM, + PAGE_KERNEL, + 0, node, __builtin_return_address(0)); /* * We can't call find_vm_area() in interrupt context, and @@ -187,10 +213,28 @@ static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) static inline void free_thread_stack(struct task_struct *tsk) { - if (task_stack_vm_area(tsk)) +#ifdef CONFIG_VMAP_STACK + if (task_stack_vm_area(tsk)) { + unsigned long flags; + int i; + + local_irq_save(flags); + for (i = 0; i < NR_CACHED_STACKS; i++) { + if (this_cpu_read(cached_stacks[i])) + continue; + + this_cpu_write(cached_stacks[i], tsk->stack_vm_area); + local_irq_restore(flags); + return; + } + local_irq_restore(flags); + vfree(tsk->stack); - else - __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); + return; + } +#endif + + __free_pages(virt_to_page(tsk->stack), THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_stack_cache; -- cgit v0.10.2 From 81539169f283329fd8bc58457cc15754f683ba69 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 08:05:20 -0500 Subject: x86/dumpstack: Remove NULL task pointer convention show_stack_log_lvl() and friends allow a NULL pointer for the task_struct to indicate the current task. This creates confusion and can cause sneaky bugs. Instead require the caller to pass 'current' directly. This only changes the internal workings of the dumpstack code. The dump_trace() and show_stack() interfaces still allow a NULL task pointer. Those interfaces should also probably be fixed as well. Signed-off-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index 780a83e..ed2be1b 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -94,7 +94,7 @@ get_frame_pointer(struct task_struct *task, struct pt_regs *regs) if (regs) return (unsigned long *)regs->bp; - if (!task || task == current) + if (task == current) return __builtin_frame_address(0); return (unsigned long *)((struct inactive_task_frame *)task->thread.sp)->bp; @@ -113,7 +113,7 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) if (regs) return (unsigned long *)kernel_stack_pointer(regs); - if (!task || task == current) + if (task == current) return __builtin_frame_address(0); return (unsigned long *)task->thread.sp; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index aa208e5..e0648f7 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -175,11 +175,13 @@ void show_stack(struct task_struct *task, unsigned long *sp) { unsigned long bp = 0; + task = task ? 
: current; + /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. */ - if (!sp && (!task || task == current)) { + if (!sp && task == current) { sp = get_stack_pointer(current, NULL); bp = (unsigned long)get_frame_pointer(current, NULL); } diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 122f37d7..4ff0008 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -205,7 +205,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(NULL, regs, NULL, 0, KERN_EMERG); + show_stack_log_lvl(current, regs, NULL, 0, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 16c0d5f..008a298 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -278,7 +278,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(NULL, regs, NULL, 0, KERN_DEFAULT); + show_stack_log_lvl(current, regs, NULL, 0, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); -- cgit v0.10.2 From 7c7900f89770d7fba96100d8a9e18043a1af3973 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:12 -0500 Subject: x86/unwind: Add new unwind interface and implementations The x86 stack dump code is a bit of a mess. dump_trace() uses callbacks, and each user of it seems to have slightly different requirements, so there are several slightly different callbacks floating around. Also there are some upcoming features which will need more changes to the stack dump code, including the printing of stack pt_regs, reliable stack detection for live patching, and a DWARF unwinder. Each of those features would at least need more callbacks and/or callback interfaces, resulting in a much bigger mess than what we have today. Before doing all that, we should try to clean things up and replace dump_trace() with something cleaner and more flexible. The new unwinder is a simple state machine which was heavily inspired by a suggestion from Andy Lutomirski: https://lkml.kernel.org/r/CALCETrUbNTqaM2LRyXGRx=kVLRPeY5A3Pc6k4TtQxF320rUT=w@mail.gmail.com It's also similar to the libunwind API: http://www.nongnu.org/libunwind/man/libunwind(3).html Some of its advantages: - Simplicity: no more callback sprawl and less code duplication. - Flexibility: it allows the caller to stop and inspect the stack state at each step in the unwinding process. - Modularity: the unwinder code, console stack dump code, and stack metadata analysis code are all better separated so that changing one of them shouldn't have much of an impact on any of the others. Two implementations are added which conform to the new unwind interface: - The frame pointer unwinder which is used for CONFIG_FRAME_POINTER=y. - The "guess" unwinder which is used for CONFIG_FRAME_POINTER=n. This isn't an "unwinder" per se. All it does is scan the stack for kernel text addresses. But with no frame pointers, guesses are better than nothing in most cases. Suggested-by: Andy Lutomirski Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H.
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6dc2f909c47533d213d0505f0a113e64585bec82.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h new file mode 100644 index 0000000..c4b6d1c --- /dev/null +++ b/arch/x86/include/asm/unwind.h @@ -0,0 +1,73 @@ +#ifndef _ASM_X86_UNWIND_H +#define _ASM_X86_UNWIND_H + +#include +#include +#include +#include + +struct unwind_state { + struct stack_info stack_info; + unsigned long stack_mask; + struct task_struct *task; + int graph_idx; +#ifdef CONFIG_FRAME_POINTER + unsigned long *bp; +#else + unsigned long *sp; +#endif +}; + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame); + +bool unwind_next_frame(struct unwind_state *state); + +static inline bool unwind_done(struct unwind_state *state) +{ + return state->stack_info.type == STACK_TYPE_UNKNOWN; +} + +static inline +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + first_frame = first_frame ? : get_stack_pointer(task, regs); + + __unwind_start(state, task, regs, first_frame); +} + +#ifdef CONFIG_FRAME_POINTER + +static inline +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + if (unwind_done(state)) + return NULL; + + return state->bp + 1; +} + +unsigned long unwind_get_return_address(struct unwind_state *state); + +#else /* !CONFIG_FRAME_POINTER */ + +static inline +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +{ + return NULL; +} + +static inline +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + if (unwind_done(state)) + return 0; + + return ftrace_graph_ret_addr(state->task, &state->graph_idx, + *state->sp, state->sp); +} + +#endif /* CONFIG_FRAME_POINTER */ + +#endif /* _ASM_X86_UNWIND_H */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0503f5b..45257cf 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -125,6 +125,12 @@ obj-$(CONFIG_EFI) += sysfb_efi.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o obj-$(CONFIG_TRACING) += tracepoint.o +ifdef CONFIG_FRAME_POINTER +obj-y += unwind_frame.o +else +obj-y += unwind_guess.o +endif + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c new file mode 100644 index 0000000..a2456d4 --- /dev/null +++ b/arch/x86/kernel/unwind_frame.c @@ -0,0 +1,93 @@ +#include +#include +#include +#include +#include + +#define FRAME_HEADER_SIZE (sizeof(long) * 2) + +unsigned long unwind_get_return_address(struct unwind_state *state) +{ + unsigned long addr; + unsigned long *addr_p = unwind_get_return_address_ptr(state); + + if (unwind_done(state)) + return 0; + + addr = ftrace_graph_ret_addr(state->task, &state->graph_idx, *addr_p, + addr_p); + + return __kernel_text_address(addr) ? addr : 0; +} +EXPORT_SYMBOL_GPL(unwind_get_return_address); + +static bool update_stack_state(struct unwind_state *state, void *addr, + size_t len) +{ + struct stack_info *info = &state->stack_info; + + /* + * If addr isn't on the current stack, switch to the next one. + * + * We may have to traverse multiple stacks to deal with the possibility + * that 'info->next_sp' could point to an empty stack and 'addr' could + * be on a subsequent stack. 
+ */ + while (!on_stack(info, addr, len)) + if (get_stack_info(info->next_sp, state->task, info, + &state->stack_mask)) + return false; + + return true; +} + +bool unwind_next_frame(struct unwind_state *state) +{ + unsigned long *next_bp; + + if (unwind_done(state)) + return false; + + next_bp = (unsigned long *)*state->bp; + + /* make sure the next frame's data is accessible */ + if (!update_stack_state(state, next_bp, FRAME_HEADER_SIZE)) + return false; + + /* move to the next frame */ + state->bp = next_bp; + return true; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + state->task = task; + + /* don't even attempt to start from user mode regs */ + if (regs && user_mode(regs)) { + state->stack_info.type = STACK_TYPE_UNKNOWN; + return; + } + + /* set up the starting stack frame */ + state->bp = get_frame_pointer(task, regs); + + /* initialize stack info and make sure the frame data is accessible */ + get_stack_info(state->bp, state->task, &state->stack_info, + &state->stack_mask); + update_stack_state(state, state->bp, FRAME_HEADER_SIZE); + + /* + * The caller can provide the address of the first frame directly + * (first_frame) or indirectly (regs->sp) to indicate which stack frame + * to start unwinding at. Skip ahead until we reach it. + */ + while (!unwind_done(state) && + (!on_stack(&state->stack_info, first_frame, sizeof(long)) || + state->bp < first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); diff --git a/arch/x86/kernel/unwind_guess.c b/arch/x86/kernel/unwind_guess.c new file mode 100644 index 0000000..b5a834c --- /dev/null +++ b/arch/x86/kernel/unwind_guess.c @@ -0,0 +1,43 @@ +#include +#include +#include +#include +#include +#include + +bool unwind_next_frame(struct unwind_state *state) +{ + struct stack_info *info = &state->stack_info; + + if (unwind_done(state)) + return false; + + do { + for (state->sp++; state->sp < info->end; state->sp++) + if (__kernel_text_address(*state->sp)) + return true; + + state->sp = info->next_sp; + + } while (!get_stack_info(state->sp, state->task, info, + &state->stack_mask)); + + return false; +} +EXPORT_SYMBOL_GPL(unwind_next_frame); + +void __unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) +{ + memset(state, 0, sizeof(*state)); + + state->task = task; + state->sp = first_frame; + + get_stack_info(first_frame, state->task, &state->stack_info, + &state->stack_mask); + + if (!__kernel_text_address(*first_frame)) + unwind_next_frame(state); +} +EXPORT_SYMBOL_GPL(__unwind_start); -- cgit v0.10.2 From 35f4d9b32527c08c3da3982aedae5198dc663ce8 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:13 -0500 Subject: perf/x86: Convert perf_callchain_kernel() to use the new unwinder Convert perf_callchain_kernel() to use the new unwinder. dump_trace() has been deprecated. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a2df0c4f09b3d438e11b41681f10b0775a819a7f.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index 477dc38..0a8bd7f 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -37,6 +37,7 @@ #include #include #include +#include #include "perf_event.h" @@ -2247,31 +2248,12 @@ void arch_perf_update_userpage(struct perf_event *event, cyc2ns_read_end(data); } -/* - * callchain support - */ - -static int backtrace_stack(void *data, const char *name) -{ - return 0; -} - -static int backtrace_address(void *data, unsigned long addr, int reliable) -{ - struct perf_callchain_entry_ctx *entry = data; - - return perf_callchain_store(entry, addr); -} - -static const struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack_bp, -}; - void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *regs) { + struct unwind_state state; + unsigned long addr; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { /* TODO: We don't support guest os callchain now */ return; @@ -2280,7 +2262,12 @@ perf_callchain_kernel(struct perf_callchain_entry_ctx *entry, struct pt_regs *re if (perf_callchain_store(entry, regs->ip)) return; - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); + for (unwind_start(&state, current, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || perf_callchain_store(entry, addr)) + return; + } } static inline int -- cgit v0.10.2 From 49a612c6b06defbd6e6d334c683fea28006728e3 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:14 -0500 Subject: x86/stacktrace: Convert save_stack_trace_*() to use the new unwinder Convert save_stack_trace_*() to use the new unwinder. dump_trace() has been deprecated. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/815494c627d89887db0ce56ceffd58ad16ee6c21.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 23fa81e..0653788 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -8,73 +8,59 @@ #include #include #include +#include -static int save_stack_stack(void *data, const char *name) +static int save_stack_address(struct stack_trace *trace, unsigned long addr, + bool nosched) { - return 0; -} - -static int -__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) -{ - struct stack_trace *trace = data; -#ifdef CONFIG_FRAME_POINTER - if (!reliable) - return 0; -#endif if (nosched && in_sched_functions(addr)) return 0; + if (trace->skip > 0) { trace->skip--; return 0; } - if (trace->nr_entries < trace->max_entries) { - trace->entries[trace->nr_entries++] = addr; - return 0; - } else { - return -1; /* no more room, stop walking the stack */ - } -} -static int save_stack_address(void *data, unsigned long addr, int reliable) -{ - return __save_stack_address(data, addr, reliable, false); + if (trace->nr_entries >= trace->max_entries) + return -1; + + trace->entries[trace->nr_entries++] = addr; + return 0; } -static int -save_stack_address_nosched(void *data, unsigned long addr, int reliable) +static void __save_stack_trace(struct stack_trace *trace, + struct task_struct *task, struct pt_regs *regs, + bool nosched) { - return __save_stack_address(data, addr, reliable, true); -} + struct unwind_state state; + unsigned long addr; -static const struct stacktrace_ops save_stack_ops = { - .stack = save_stack_stack, - .address = save_stack_address, - .walk_stack = print_context_stack, -}; + if (regs) + save_stack_address(trace, regs->ip, nosched); -static const struct stacktrace_ops save_stack_ops_nosched = { - .stack = save_stack_stack, - .address = save_stack_address_nosched, - .walk_stack = print_context_stack, -}; + for (unwind_start(&state, task, regs, NULL); !unwind_done(&state); + unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr || save_stack_address(trace, addr, nosched)) + break; + } + + if (trace->nr_entries < trace->max_entries) + trace->entries[trace->nr_entries++] = ULONG_MAX; +} /* * Save stack-backtrace addresses into a stack_trace buffer. 
*/ void save_stack_trace(struct stack_trace *trace) { - dump_trace(current, NULL, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, NULL, false); } EXPORT_SYMBOL_GPL(save_stack_trace); void save_stack_trace_regs(struct pt_regs *regs, struct stack_trace *trace) { - dump_trace(current, regs, NULL, 0, &save_stack_ops, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, current, regs, false); } void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) @@ -82,9 +68,7 @@ void save_stack_trace_tsk(struct task_struct *tsk, struct stack_trace *trace) if (!try_get_task_stack(tsk)) return; - dump_trace(tsk, NULL, NULL, 0, &save_stack_ops_nosched, trace); - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = ULONG_MAX; + __save_stack_trace(trace, tsk, NULL, true); put_task_stack(tsk); } -- cgit v0.10.2 From ec2ad9ccf12dc965cad2d367a4063f68d6561a6b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:15 -0500 Subject: oprofile/x86: Convert x86_backtrace() to use the new unwinder Convert oprofile's x86_backtrace() to use the new unwinder. dump_trace() has been deprecated. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Robert Richter Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/412df8927705795e8ea60cffcf89a79e010713b1.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c index 7539148..a2488b6 100644 --- a/arch/x86/oprofile/backtrace.c +++ b/arch/x86/oprofile/backtrace.c @@ -16,27 +16,7 @@ #include #include - -static int backtrace_stack(void *data, const char *name) -{ - /* Yes, we want all stacks */ - return 0; -} - -static int backtrace_address(void *data, unsigned long addr, int reliable) -{ - unsigned int *depth = data; - - if ((*depth)--) - oprofile_add_trace(addr); - return 0; -} - -static struct stacktrace_ops backtrace_ops = { - .stack = backtrace_stack, - .address = backtrace_address, - .walk_stack = print_context_stack, -}; +#include #ifdef CONFIG_COMPAT static struct stack_frame_ia32 * @@ -113,14 +93,29 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth) struct stack_frame *head = (struct stack_frame *)frame_pointer(regs); if (!user_mode(regs)) { + struct unwind_state state; + unsigned long addr; + if (!depth) return; oprofile_add_trace(regs->ip); + if (!--depth) return; - dump_trace(NULL, regs, NULL, 0, &backtrace_ops, &depth); + for (unwind_start(&state, current, regs, NULL); + !unwind_done(&state); unwind_next_frame(&state)) { + addr = unwind_get_return_address(&state); + if (!addr) + break; + + oprofile_add_trace(addr); + + if (!--depth) + break; + } + return; } -- cgit v0.10.2 From e18bcccd1a4ecb41e99678e002ef833586185bf1 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:16 -0500 Subject: x86/dumpstack: Convert show_trace_log_lvl() to use the new unwinder Convert show_trace_log_lvl() to use the new unwinder. dump_trace() has been deprecated. show_trace_log_lvl() is special compared to other users of the unwinder. 
It's the only place where both reliable *and* unreliable addresses are needed. With frame pointers enabled, most callers of the unwinder don't want to know about unreliable addresses. But in this case, when we're dumping the stack to the console because something presumably went wrong, the unreliable addresses are useful: - They show stale data on the stack which can provide useful clues. - If something goes wrong with the unwinder, or if frame pointers are corrupt or missing, all the stack addresses still get shown. So in order to show all addresses on the stack, and at the same time figure out which addresses are reliable, we have to do the scanning and the unwinding in parallel. The scanning is done with the help of get_stack_info() to traverse the stacks. The unwinding is done separately by the new unwinder. In theory we could simplify show_trace_log_lvl() by instead pushing some of this logic into the unwind code. But then we would need some kind of "fake" frame logic in the unwinder which would add a lot of complexity and wouldn't be worth it in order to support only one user. Another benefit of this approach is that once we have a DWARF unwinder, we should be able to just plug it in with minimal impact to this code. Another change here is that callers of show_trace_log_lvl() don't need to provide the 'bp' argument. The unwinder already finds the relevant frame pointer by unwinding until it reaches the first frame after the provided stack pointer. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/703b5998604c712a1f801874b43f35d6dac52ede.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index ed2be1b..c9ccf06 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -119,13 +119,11 @@ get_stack_pointer(struct task_struct *task, struct pt_regs *regs) return (unsigned long *)task->thread.sp; } -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl); -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl); extern unsigned int code_bytes; diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index e0648f7..c08f32a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -17,7 +17,7 @@ #include #include - +#include int panic_on_unrecovered_nmi; int panic_on_io_nmi; @@ -142,56 +142,120 @@ print_context_stack_bp(struct task_struct *task, } EXPORT_SYMBOL_GPL(print_context_stack_bp); -static int print_trace_stack(void *data, const char *name) +void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *stack, char *log_lvl) { - printk("%s <%s> ", (char *)data, name); - return 0; -} + struct unwind_state state; + struct stack_info stack_info = {0}; + unsigned long visit_mask = 0; + int graph_idx = 0; -/* - * Print one address/symbol entries per line. 
- */ -static int print_trace_address(void *data, unsigned long addr, int reliable) -{ - printk_stack_address(addr, reliable, data); - return 0; -} + printk("%sCall Trace:\n", log_lvl); -static const struct stacktrace_ops print_trace_ops = { - .stack = print_trace_stack, - .address = print_trace_address, - .walk_stack = print_context_stack, -}; + unwind_start(&state, task, regs, stack); -void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl) -{ - printk("%sCall Trace:\n", log_lvl); - dump_trace(task, regs, stack, bp, &print_trace_ops, log_lvl); + /* + * Iterate through the stacks, starting with the current stack pointer. + * Each stack has a pointer to the next one. + * + * x86-64 can have several stacks: + * - task stack + * - interrupt stack + * - HW exception stacks (double fault, nmi, debug, mce) + * + * x86-32 can have up to three stacks: + * - task stack + * - softirq stack + * - hardirq stack + */ + for (; stack; stack = stack_info.next_sp) { + const char *str_begin, *str_end; + + /* + * If we overflowed the task stack into a guard page, jump back + * to the bottom of the usable stack. + */ + if (task_stack_page(task) - (void *)stack < PAGE_SIZE) + stack = task_stack_page(task); + + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + + stack_type_str(stack_info.type, &str_begin, &str_end); + if (str_begin) + printk("%s <%s> ", log_lvl, str_begin); + + /* + * Scan the stack, printing any text addresses we find. At the + * same time, follow proper stack frames with the unwinder. + * + * Addresses found during the scan which are not reported by + * the unwinder are considered to be additional clues which are + * sometimes useful for debugging and are prefixed with '?'. + * This also serves as a failsafe option in case the unwinder + * goes off in the weeds. + */ + for (; stack < stack_info.end; stack++) { + unsigned long real_addr; + int reliable = 0; + unsigned long addr = *stack; + unsigned long *ret_addr_p = + unwind_get_return_address_ptr(&state); + + if (!__kernel_text_address(addr)) + continue; + + if (stack == ret_addr_p) + reliable = 1; + + /* + * When function graph tracing is enabled for a + * function, its return address on the stack is + * replaced with the address of an ftrace handler + * (return_to_handler). In that case, before printing + * the "real" address, we want to print the handler + * address as an "unreliable" hint that function graph + * tracing was involved. + */ + real_addr = ftrace_graph_ret_addr(task, &graph_idx, + addr, stack); + if (real_addr != addr) + printk_stack_address(addr, 0, log_lvl); + printk_stack_address(real_addr, reliable, log_lvl); + + if (!reliable) + continue; + + /* + * Get the next frame from the unwinder. No need to + * check for an error: if anything goes wrong, the rest + * of the addresses will just be printed as unreliable. + */ + unwind_next_frame(&state); + } + + if (str_end) + printk("%s <%s> ", log_lvl, str_end); + } } void show_stack(struct task_struct *task, unsigned long *sp) { - unsigned long bp = 0; - task = task ? : current; /* * Stack frames below this one aren't interesting. Don't show them * if we're printing for %current. 
*/ - if (!sp && task == current) { + if (!sp && task == current) sp = get_stack_pointer(current, NULL); - bp = (unsigned long)get_frame_pointer(current, NULL); - } - show_stack_log_lvl(task, NULL, sp, bp, ""); + show_stack_log_lvl(current, NULL, sp, ""); } void show_stack_regs(struct pt_regs *regs) { - show_stack_log_lvl(current, regs, NULL, 0, ""); + show_stack_log_lvl(current, regs, NULL, ""); } static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED; diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 4ff0008..e476eb7 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -156,9 +156,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *stack; int i; @@ -181,7 +180,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, touch_nmi_watchdog(); } pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); put_task_stack(task); } @@ -205,7 +204,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; pr_emerg("Stack:\n"); - show_stack_log_lvl(current, regs, NULL, 0, KERN_EMERG); + show_stack_log_lvl(current, regs, NULL, KERN_EMERG); pr_emerg("Code:"); diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 008a298..4e9f2cf 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -209,9 +209,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, } EXPORT_SYMBOL(dump_trace); -void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl) +void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, + unsigned long *sp, char *log_lvl) { unsigned long *irq_stack_end; unsigned long *irq_stack; @@ -255,7 +254,7 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, } pr_cont("\n"); - show_trace_log_lvl(task, regs, sp, bp, log_lvl); + show_trace_log_lvl(task, regs, sp, log_lvl); put_task_stack(task); } @@ -278,7 +277,7 @@ void show_regs(struct pt_regs *regs) u8 *ip; printk(KERN_DEFAULT "Stack:\n"); - show_stack_log_lvl(current, regs, NULL, 0, KERN_DEFAULT); + show_stack_log_lvl(current, regs, NULL, KERN_DEFAULT); printk(KERN_DEFAULT "Code: "); -- cgit v0.10.2 From c8fe4609827aedc9c4b45de80e7cdc8ccfa8541b Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 16 Sep 2016 14:18:17 -0500 Subject: x86/dumpstack: Remove dump_trace() and related callbacks All previous users of dump_trace() have been converted to use the new unwind interfaces, so we can remove it and the related print_context_stack() and print_context_stack_bp() callback functions. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Byungchul Park Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Nilay Vaish Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/5b97da3572b40b5a4d8e185cf2429308d0987a13.1474045023.git.jpoimboe@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h index c9ccf06..37f2e0b 100644 --- a/arch/x86/include/asm/stacktrace.h +++ b/arch/x86/include/asm/stacktrace.h @@ -45,42 +45,6 @@ static inline bool on_stack(struct stack_info *info, void *addr, size_t len) extern int kstack_depth_to_print; -struct thread_info; -struct stacktrace_ops; - -typedef unsigned long (*walk_stack_t)(struct task_struct *task, - unsigned long *stack, - unsigned long bp, - const struct stacktrace_ops *ops, - void *data, - struct stack_info *info, - int *graph); - -extern unsigned long -print_context_stack(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - struct stack_info *info, int *graph); - -extern unsigned long -print_context_stack_bp(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - struct stack_info *info, int *graph); - -/* Generic stack tracer with callbacks */ - -struct stacktrace_ops { - int (*address)(void *data, unsigned long address, int reliable); - /* On negative return stop dumping */ - int (*stack)(void *data, const char *name); - walk_stack_t walk_stack; -}; - -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data); - #ifdef CONFIG_X86_32 #define STACKSLOTS_PER_LINE 8 #else diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c08f32a..999de3b 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -56,92 +56,6 @@ void printk_address(unsigned long address) pr_cont(" [<%p>] %pS\n", (void *)address, (void *)address); } -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -unsigned long -print_context_stack(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - struct stack_info *info, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - - /* - * If we overflowed the stack into a guard page, jump back to the - * bottom of the usable stack. - */ - if ((unsigned long)task_stack_page(task) - (unsigned long)stack < - PAGE_SIZE) - stack = (unsigned long *)task_stack_page(task); - - while (on_stack(info, stack, sizeof(*stack))) { - unsigned long addr = *stack; - - if (__kernel_text_address(addr)) { - unsigned long real_addr; - int reliable = 0; - - if ((unsigned long) stack == bp + sizeof(long)) { - reliable = 1; - frame = frame->next_frame; - bp = (unsigned long) frame; - } - - /* - * When function graph tracing is enabled for a - * function, its return address on the stack is - * replaced with the address of an ftrace handler - * (return_to_handler). In that case, before printing - * the "real" address, we want to print the handler - * address as an "unreliable" hint that function graph - * tracing was involved. 
- */ - real_addr = ftrace_graph_ret_addr(task, graph, addr, - stack); - if (real_addr != addr) - ops->address(data, addr, 0); - - ops->address(data, real_addr, reliable); - } - stack++; - } - return bp; -} -EXPORT_SYMBOL_GPL(print_context_stack); - -unsigned long -print_context_stack_bp(struct task_struct *task, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data, - struct stack_info *info, int *graph) -{ - struct stack_frame *frame = (struct stack_frame *)bp; - unsigned long *retp = &frame->return_address; - - while (on_stack(info, stack, sizeof(*stack) * 2)) { - unsigned long addr = *retp; - unsigned long real_addr; - - if (!__kernel_text_address(addr)) - break; - - real_addr = ftrace_graph_ret_addr(task, graph, addr, retp); - if (ops->address(data, real_addr, 1)) - break; - - frame = frame->next_frame; - retp = &frame->return_address; - } - - return (unsigned long)frame; -} -EXPORT_SYMBOL_GPL(print_context_stack_bp); - void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index e476eb7..06eb322 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -121,41 +121,6 @@ unknown: return -EINVAL; } -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - unsigned long visit_mask = 0; - int graph = 0; - - task = task ? : current; - stack = stack ? : get_stack_pointer(task, regs); - bp = bp ? : (unsigned long)get_frame_pointer(task, regs); - - for (;;) { - const char *begin_str, *end_str; - struct stack_info info; - - if (get_stack_info(stack, task, &info, &visit_mask)) - break; - - stack_type_str(info.type, &begin_str, &end_str); - - if (begin_str && ops->stack(data, begin_str) < 0) - break; - - bp = ops->walk_stack(task, stack, bp, ops, data, &info, &graph); - - if (end_str && ops->stack(data, end_str) < 0) - break; - - stack = info.next_sp; - - touch_nmi_watchdog(); - } -} -EXPORT_SYMBOL(dump_trace); - void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, char *log_lvl) { diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 4e9f2cf..36cf1a4 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -140,75 +140,6 @@ unknown: return -EINVAL; } -/* - * x86-64 can have up to three kernel stacks: - * process stack - * interrupt stack - * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack - */ - -void dump_trace(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, - const struct stacktrace_ops *ops, void *data) -{ - unsigned long visit_mask = 0; - struct stack_info info; - int graph = 0; - int done = 0; - - task = task ? : current; - stack = stack ? : get_stack_pointer(task, regs); - bp = bp ? : (unsigned long)get_frame_pointer(task, regs); - - /* - * Print function call entries in all stacks, starting at the - * current stack address. If the stacks consist of nested - * exceptions - */ - while (!done) { - const char *begin_str, *end_str; - - get_stack_info(stack, task, &info, &visit_mask); - - /* Default finish unless specified to continue */ - done = 1; - - switch (info.type) { - - /* Break out early if we are on the thread stack */ - case STACK_TYPE_TASK: - break; - - case STACK_TYPE_IRQ: - case STACK_TYPE_EXCEPTION ... 
STACK_TYPE_EXCEPTION_LAST: - - stack_type_str(info.type, &begin_str, &end_str); - - if (ops->stack(data, begin_str) < 0) - break; - - bp = ops->walk_stack(task, stack, bp, ops, - data, &info, &graph); - - ops->stack(data, end_str); - - stack = info.next_sp; - done = 0; - break; - - default: - ops->stack(data, "UNK"); - break; - } - } - - /* - * This handles the process stack: - */ - bp = ops->walk_stack(task, stack, bp, ops, data, &info, &graph); -} -EXPORT_SYMBOL(dump_trace); - void show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, char *log_lvl) { -- cgit v0.10.2 From 71f5443ebb1227c22e8decbcd28a1ea6deaf8257 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Tue, 20 Sep 2016 10:53:40 -0500 Subject: x86/dumpstack: Fix show_stack() task pointer regression With the following commit: e18bcccd1a4e ("x86/dumpstack: Convert show_trace_log_lvl() to use the new unwinder") The task pointer argument to show_stack_log_lvl() in show_stack() was inadvertently changed to 'current'. Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: byungchul.park@lge.com Cc: fweisbec@gmail.com Cc: keescook@chromium.org Cc: linux-tip-commits@vger.kernel.org Cc: luto@amacapital.net Cc: nilayvaish@gmail.com Cc: rostedt@goodmis.org Cc: tip-bot for Josh Poimboeuf Fixes: e18bcccd1a4e ("x86/dumpstack: Convert show_trace_log_lvl() to use the new unwinder") Link: http://lkml.kernel.org/r/20160920155340.yhewlx7vmgmov5fb@treble Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 999de3b..9b7cf5c 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -164,7 +164,7 @@ void show_stack(struct task_struct *task, unsigned long *sp) if (!sp && task == current) sp = get_stack_pointer(current, NULL); - show_stack_log_lvl(current, NULL, sp, ""); + show_stack_log_lvl(task, NULL, sp, ""); } void show_stack_regs(struct pt_regs *regs) -- cgit v0.10.2 From 317c2ce77d8ab73c24f4fb9c75e5bb441fbe3e30 Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Fri, 23 Sep 2016 16:49:39 -0500 Subject: x86/alternatives: Add stack frame dependency to alternative_call_2() Linus reported the following objtool warning: kernel/signal.o: warning: objtool: .altinstr_replacement+0x54: call without frame pointer save/setup The warning is valid. It's caused by the fact that gcc placed the call instruction in alternative_call_2()'s inline asm before the frame pointer setup, which breaks frame pointer convention and can result in a bad stack trace. Force a stack frame to be created before the call instruction by listing the stack pointer as an output operand in the inline asm statement. Reported-and-tested-by: Linus Torvalds Signed-off-by: Josh Poimboeuf Cc: Andy Lutomirski Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160923214939.j5o7c67nhepzmh3t@treble Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index e77a644..1b02038 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -217,10 +217,14 @@ static inline int alternatives_text_reserved(void *start, void *end) */ #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) 
\ +{ \ + register void *__sp asm(_ASM_SP); \ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ - : output : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ - [new2] "i" (newfunc2), ## input) + : output, "+r" (__sp) \ + : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ + [new2] "i" (newfunc2), ## input); \ +} /* * use this macro(s) if you need more than one output parameter -- cgit v0.10.2 From 907241dccb4ce5d9413cf3c030b32b0cfc184914 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Fri, 23 Sep 2016 18:24:07 +0100 Subject: thread_info: Use unsigned long for flags The generic THREAD_INFO_IN_TASK definition of thread_info::flags is a u32, matching x86 prior to the introduction of THREAD_INFO_IN_TASK. However, common helpers like test_ti_thread_flag() implicitly assume that thread_info::flags has at least the size and alignment of unsigned long, and relying on padding and alignment provided by other elements of task_struct is somewhat fragile. Additionally, some architectures use more than 32 bits for thread_info::flags, and others may need to in future. With THREAD_INFO_IN_TASK, task struct follows thread_info with a long field, and thus we no longer save any space as we did back in commit: affa219b60a11b32 ("x86: change thread_info's flag field back to 32 bits") Given all this, it makes more sense for the generic thread_info::flags to be an unsigned long. In fact, given that <linux/thread_info.h> contains/uses the helpers mentioned above, BE arches *must* use unsigned long (or something of the same size) today, or they wouldn't work. Make it so. Signed-off-by: Mark Rutland Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1474651447-30447-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e2d0fd8..45f004e 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -15,7 +15,7 @@ struct compat_timespec; #ifdef CONFIG_THREAD_INFO_IN_TASK struct thread_info { - u32 flags; /* low level flags */ + unsigned long flags; /* low level flags */ }; #define INIT_THREAD_INFO(tsk) \ -- cgit v0.10.2 From 1ef55be16ed69538f89e0a6508be5e62fdc9851c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 29 Sep 2016 12:48:12 -0700 Subject: x86/asm: Get rid of __read_cr4_safe() We use __read_cr4() vs __read_cr4_safe() inconsistently. On CR4-less CPUs, all CR4 bits are effectively clear, so we can make the code simpler and more robust by making __read_cr4() always fix up faults on 32-bit kernels. This may fix some bugs on old 486-like CPUs, but I don't have any easy way to test that.
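[Editor's note: a condensed sketch of the fault-tolerant read pattern used by the special_insns.h hunk below. It is illustrative only: it omits the "=m" (__force_order) ordering operand the real kernel code keeps, and read_cr4_or_zero() is a hypothetical name, not an identifier from this patch.]

#include <asm/asm.h>	/* _ASM_EXTABLE */

static inline unsigned long read_cr4_or_zero(void)
{
	unsigned long val;

	/*
	 * If the mov faults because CR4 does not exist (very old 32-bit
	 * CPUs), the exception table entry resumes execution at label 2
	 * and val keeps the 0 preloaded via the "0" (0) input constraint,
	 * so a missing CR4 reads as CR4 == 0.
	 */
	asm volatile("1: mov %%cr4, %0\n"
		     "2:\n"
		     _ASM_EXTABLE(1b, 2b)
		     : "=r" (val)
		     : "0" (0));
	return val;
}
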
Signed-off-by: Andy Lutomirski Cc: Brian Gerst Cc: Borislav Petkov Cc: david@saggiorato.net Link: http://lkml.kernel.org/r/ea647033d357d9ce2ad2bbde5a631045f5052fb6.1475178370.git.luto@kernel.org Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2970d22..91b6f4e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -80,10 +80,6 @@ static inline unsigned long __read_cr4(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); } -static inline unsigned long __read_cr4_safe(void) -{ - return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4_safe); -} static inline void __write_cr4(unsigned long x) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7fa9e77..fcf243f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -108,7 +108,6 @@ struct pv_cpu_ops { unsigned long (*read_cr0)(void); void (*write_cr0)(unsigned long); - unsigned long (*read_cr4_safe)(void); unsigned long (*read_cr4)(void); void (*write_cr4)(unsigned long); diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 587d791..19a2224 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -59,22 +59,19 @@ static inline void native_write_cr3(unsigned long val) static inline unsigned long native_read_cr4(void) { unsigned long val; - asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist. In x86_64, a cr4 always - * exists, so it will never fail. */ #ifdef CONFIG_X86_32 + /* + * This could fault if CR4 does not exist. Non-existent CR4 + * is functionally equivalent to CR4 == 0. Keep it simple and pretend + * that CR4 == 0 on CPUs that don't have CR4. + */ asm volatile("1: mov %%cr4, %0\n" "2:\n" _ASM_EXTABLE(1b, 2b) : "=r" (val), "=m" (__force_order) : "0" (0)); #else - val = native_read_cr4(); + /* CR4 always exists on x86_64. */ + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); #endif return val; } @@ -182,11 +179,6 @@ static inline unsigned long __read_cr4(void) return native_read_cr4(); } -static inline unsigned long __read_cr4_safe(void) -{ - return native_read_cr4_safe(); -} - static inline void __write_cr4(unsigned long x) { native_write_cr4(x); diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index dee8a70..6fa8594 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -81,7 +81,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); /* Initialize cr4 shadow for this CPU. */ static inline void cr4_init_shadow(void) { - this_cpu_write(cpu_tlbstate.cr4, __read_cr4_safe()); + this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); } /* Set in this cpu's CR4. 
*/ diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bef3400..bbf3d59 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -332,7 +332,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, .read_cr4 = native_read_cr4, - .read_cr4_safe = native_read_cr4_safe, .write_cr4 = native_write_cr4, #ifdef CONFIG_X86_64 .read_cr8 = native_read_cr8, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 404efdf..bd7be8e 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -90,7 +90,7 @@ void __show_regs(struct pt_regs *regs, int all) cr0 = read_cr0(); cr2 = read_cr2(); cr3 = read_cr3(); - cr4 = __read_cr4_safe(); + cr4 = __read_cr4(); printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", cr0, cr2, cr3, cr4); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 87f2330..3aabfdc 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1137,7 +1137,7 @@ void __init setup_arch(char **cmdline_p) * auditing all the early-boot CR4 manipulation would be needed to * rule it out. */ - mmu_cr4_features = __read_cr4_safe(); + mmu_cr4_features = __read_cr4(); memblock_set_current_limit(get_max_mapped()); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index b12c26e..53cace2 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -130,7 +130,7 @@ static void __save_processor_state(struct saved_context *ctxt) ctxt->cr0 = read_cr0(); ctxt->cr2 = read_cr2(); ctxt->cr3 = read_cr3(); - ctxt->cr4 = __read_cr4_safe(); + ctxt->cr4 = __read_cr4(); #ifdef CONFIG_X86_64 ctxt->cr8 = read_cr8(); #endif diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b86ebb1..e2cf8fc 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1237,7 +1237,6 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, - .read_cr4_safe = native_read_cr4_safe, .write_cr4 = xen_write_cr4, #ifdef CONFIG_X86_64 -- cgit v0.10.2
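[Editor's note: taken together, the unwind conversions above all reduce to the same caller idiom. A self-contained sketch of that loop follows; walk_kernel_stack() and consume() are illustrative names invented here, not identifiers from the patches.]

#include <linux/sched.h>
#include <asm/unwind.h>

/*
 * Hand every return address found on the current task's kernel stack
 * to 'consume', in unwind order. This mirrors the loops added to
 * perf_callchain_kernel(), __save_stack_trace() and x86_backtrace().
 */
static void walk_kernel_stack(struct pt_regs *regs,
			      void (*consume)(unsigned long addr))
{
	struct unwind_state state;
	unsigned long addr;

	for (unwind_start(&state, current, regs, NULL);
	     !unwind_done(&state);
	     unwind_next_frame(&state)) {
		addr = unwind_get_return_address(&state);
		if (!addr)	/* not a kernel text address; stop */
			break;
		consume(addr);
	}
}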