From c73e36b775a777abd67a1e15481923fcbd2040e1 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Fri, 3 Jul 2015 22:19:02 +0200 Subject: x86/asm/entry/32: Replace RESTORE_RSI_RDI with open-coded 32-bit reads This doesn't change much, but uses shorter 32-bit insns: -48 8b 74 24 68 mov 0x68(%rsp),%rsi -48 8b 7c 24 70 mov 0x70(%rsp),%rdi -48 8b 54 24 60 mov 0x60(%rsp),%rdx +8b 54 24 60 mov 0x60(%rsp),%edx +8b 74 24 68 mov 0x68(%rsp),%esi +8b 7c 24 70 mov 0x70(%rsp),%edi and does the loads in pt_regs order. Since these are the only uses of RESTORE_RSI_RDI[_RDX], drop these macros. Signed-off-by: Denys Vlasenko Cc: Alexei Starovoitov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Link: http://lkml.kernel.org/r/1435954742-2545-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index f4e6308..519207f 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -193,12 +193,6 @@ For 32-bit we have the following conventions - kernel is built with .macro RESTORE_C_REGS_EXCEPT_RCX_R11 RESTORE_C_REGS_HELPER 1,0,0,1,1 .endm - .macro RESTORE_RSI_RDI - RESTORE_C_REGS_HELPER 0,0,0,0,0 - .endm - .macro RESTORE_RSI_RDI_RDX - RESTORE_C_REGS_HELPER 0,0,0,0,1 - .endm .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 subq $-(15*8+\addskip), %rsp diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bb187a6..b868cfc 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -140,7 +140,8 @@ sysexit_from_sys_call: */ andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) movl RIP(%rsp), %ecx /* User %eip */ - RESTORE_RSI_RDI + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi xorl %edx, %edx /* Do not leak kernel information */ xorq %r8, %r8 xorq %r9, %r9 @@ -366,7 +367,9 @@ cstar_dispatch: sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - RESTORE_RSI_RDI_RDX + movl RDX(%rsp), %edx + movl RSI(%rsp), %esi + movl RDI(%rsp), %edi movl RIP(%rsp), %ecx movl EFLAGS(%rsp), %r11d xorq %r10, %r10 -- cgit v0.10.2 From c6e5ca35c4685cd920b1d5279dbc9f4483d7dfd4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:43:55 +0200 Subject: x86/asm/tsc: Inline native_read_tsc() and remove __native_read_tsc() In the following commit: cdc7957d1954 ("x86: move native_read_tsc() offline") ... native_read_tsc() was moved out of line, presumably for some now-obsolete vDSO-related reason. Undo it. The entire rdtsc, shl, or sequence is only 11 bytes, and calls via rdtscl() and similar helpers were already inlined. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/d05ffe2aaf8468ca475ebc00efad7b2fa174af19.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 9793322..972b488 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -186,7 +186,7 @@ notrace static cycle_t vread_tsc(void) * but no one has ever seen it happen. */ rdtsc_barrier(); - ret = (cycle_t)__native_read_tsc(); + ret = (cycle_t)native_read_tsc(); last = gtod->cycle_last; diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index e6a707e..8871147 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -106,12 +106,10 @@ notrace static inline int native_write_msr_safe(unsigned int msr, return err; } -extern unsigned long long native_read_tsc(void); - extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); -static __always_inline unsigned long long __native_read_tsc(void) +static __always_inline unsigned long long native_read_tsc(void) { DECLARE_ARGS(val, low, high); @@ -181,10 +179,10 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) } #define rdtscl(low) \ - ((low) = (u32)__native_read_tsc()) + ((low) = (u32)native_read_tsc()) #define rdtscll(val) \ - ((val) = __native_read_tsc()) + ((val) = native_read_tsc()) #define rdpmc(counter, low, high) \ do { \ diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 628954c..2bd69d6 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) static __always_inline u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) { - u64 delta = __native_read_tsc() - src->tsc_timestamp; + u64 delta = native_read_tsc() - src->tsc_timestamp; return pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); } diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index c2e00bb..bc5fa2a 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void) * on during the bootup the random pool has true entropy too. */ get_random_bytes(&canary, sizeof(canary)); - tsc = __native_read_tsc(); + tsc = native_read_tsc(); canary += tsc + (tsc << 32UL); current->stack_canary = canary; diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 94605c0..fd11128 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -42,7 +42,7 @@ static __always_inline cycles_t vget_cycles(void) if (!cpu_has_tsc) return 0; #endif - return (cycles_t)__native_read_tsc(); + return (cycles_t)native_read_tsc(); } extern void tsc_init(void); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index ede92c3..9fe111c 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void) old = dw_apb_clocksource_read(clocksource_apbt); old += loop; - t1 = __native_read_tsc(); + t1 = native_read_tsc(); do { new = dw_apb_clocksource_read(clocksource_apbt); } while (new < old); - t2 = __native_read_tsc(); + t2 = native_read_tsc(); shift = 5; if (unlikely(loop >> shift == 0)) { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 5054497..e7710cd 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -308,12 +308,6 @@ unsigned long long sched_clock(void) __attribute__((alias("native_sched_clock"))); #endif -unsigned long long native_read_tsc(void) -{ - return __native_read_tsc(); -} -EXPORT_SYMBOL(native_read_tsc); - int check_tsc_unstable(void) { return tsc_unstable; -- cgit v0.10.2 From 881d7bf843d7139c6dfbffdec4903b3354423c49 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:43:56 +0200 Subject: x86/asm/tsc, kvm: Remove vget_cycles() The only caller was KVM's read_tsc(). The only difference between vget_cycles() and native_read_tsc() was that vget_cycles() returned zero instead of crashing on TSC-less systems. KVM already checks vclock_mode() before calling that function, so the extra check is unnecessary. Also, KVM (host-side) requires the TSC to exist. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Acked-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/20615df14ae2eb713ea7a5f5123c1dc4c7ca993d.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index fd11128..3da1cc1 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -32,19 +32,6 @@ static inline cycles_t get_cycles(void) return ret; } -static __always_inline cycles_t vget_cycles(void) -{ - /* - * We only do VDSOs on TSC capable CPUs, so this shouldn't - * access boot_cpu_data (which is not VDSO-safe): - */ -#ifndef CONFIG_X86_TSC - if (!cpu_has_tsc) - return 0; -#endif - return (cycles_t)native_read_tsc(); -} - extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bbaf44e..f771058 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1455,7 +1455,7 @@ static cycle_t read_tsc(void) * but no one has ever seen it happen. */ rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); + ret = (cycle_t)native_read_tsc(); last = pvclock_gtod_data.clock.cycle_last; -- cgit v0.10.2 From 9261e050b686c9fe229cd9918d997b3caaf20e34 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:43:57 +0200 Subject: x86/asm/tsc, x86/paravirt: Remove read_tsc() and read_tscp() paravirt hooks We've had ->read_tsc() and ->read_tscp() paravirt hooks since the very beginning of paravirt, i.e., d3561b7fa0fb ("[PATCH] paravirt: header and stubs for paravirtualisation"). AFAICT, the only paravirt guest implementation that ever replaced these calls was vmware, and it's gone. Arguably even vmware shouldn't have hooked RDTSC -- we fully support systems that don't have a TSC at all, so there's no point for a paravirt implementation to pretend that we have a TSC but to replace it. I also doubt that these hooks actually worked. Calls to rdtscl() and rdtscll(), which respected the hooks, were used seemingly interchangeably with native_read_tsc(), which did not. Just remove them. If anyone ever needs them again, they can try to make a case for why they need them. Before, on a paravirt config: text data bss dec hex filename 12618257 1816384 1093632 15528273 ecf151 vmlinux After: text data bss dec hex filename 12617207 1816384 1093632 15527223 eced37 vmlinux Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Cc: virtualization@lists.linux-foundation.org Link: http://lkml.kernel.org/r/d08a2600fb298af163681e5efd8e599d889a5b97.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 8871147..d1afac7 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -178,12 +178,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -#define rdtscl(low) \ - ((low) = (u32)native_read_tsc()) - -#define rdtscll(val) \ - ((val) = native_read_tsc()) - #define rdpmc(counter, low, high) \ do { \ u64 _l = native_read_pmc((counter)); \ @@ -193,6 +187,14 @@ do { \ #define rdpmcl(counter, val) ((val) = native_read_pmc(counter)) +#endif /* !CONFIG_PARAVIRT */ + +#define rdtscl(low) \ + ((low) = (u32)native_read_tsc()) + +#define rdtscll(val) \ + ((val) = native_read_tsc()) + #define rdtscp(low, high, aux) \ do { \ unsigned long long _val = native_read_tscp(&(aux)); \ @@ -202,8 +204,6 @@ do { \ #define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) -#endif /* !CONFIG_PARAVIRT */ - /* * 64-bit version of wrmsr_safe(): */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index d143bfa..c2be037 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -174,19 +174,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p) return err; } -static inline u64 paravirt_read_tsc(void) -{ - return PVOP_CALL0(u64, pv_cpu_ops.read_tsc); -} - -#define rdtscl(low) \ -do { \ - u64 _l = paravirt_read_tsc(); \ - low = (int)_l; \ -} while (0) - -#define rdtscll(val) (val = paravirt_read_tsc()) - static inline unsigned long long paravirt_sched_clock(void) { return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock); @@ -215,27 +202,6 @@ do { \ #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter)) -static inline unsigned long long paravirt_rdtscp(unsigned int *aux) -{ - return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux); -} - -#define rdtscp(low, high, aux) \ -do { \ - int __aux; \ - unsigned long __val = paravirt_rdtscp(&__aux); \ - (low) = (u32)__val; \ - (high) = (u32)(__val >> 32); \ - (aux) = __aux; \ -} while (0) - -#define rdtscpll(val, aux) \ -do { \ - unsigned long __aux; \ - val = paravirt_rdtscp(&__aux); \ - (aux) = __aux; \ -} while (0) - static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) { PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries); diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index a6b8f9f..ce029e4 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -156,9 +156,7 @@ struct pv_cpu_ops { u64 (*read_msr)(unsigned int msr, int *err); int (*write_msr)(unsigned int msr, unsigned low, unsigned high); - u64 (*read_tsc)(void); u64 (*read_pmc)(int counter); - unsigned long long (*read_tscp)(unsigned int *aux); #ifdef CONFIG_X86_32 /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 58bcfb6..f68e48f 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -351,9 +351,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .wbinvd = native_wbinvd, .read_msr = native_read_msr_safe, .write_msr = native_write_msr_safe, - .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .read_tscp = native_read_tscp, .load_tr_desc = native_load_tr_desc, .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index e1b0136..c89f50a 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); -DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); @@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); - PATCH_SITE(pv_cpu_ops, read_tsc); #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 0b95c9b..32136bf 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1175,11 +1175,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .read_msr = xen_read_msr_safe, .write_msr = xen_write_msr_safe, - .read_tsc = native_read_tsc, .read_pmc = native_read_pmc, - .read_tscp = native_read_tscp, - .iret = xen_iret, #ifdef CONFIG_X86_64 .usergs_sysret32 = xen_sysret32, -- cgit v0.10.2 From 87be28aaf1458445d5f648688c2eec0f13b8f3b9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:43:58 +0200 Subject: x86/asm/tsc: Replace rdtscll() with native_read_tsc() Now that the ->read_tsc() paravirt hook is gone, rdtscll() is just a wrapper around native_read_tsc(). Unwrap it. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/d2449ae62c1b1fb90195bcfb19ef4a35883a04dc.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index d7b1f65..ea33236 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -82,7 +82,7 @@ static unsigned long get_random_long(void) if (has_cpuflag(X86_FEATURE_TSC)) { debug_putstr(" RDTSC"); - rdtscll(raw); + raw = native_read_tsc(); random ^= raw; use_i8254 = false; diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index d1afac7..7273b74 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -192,9 +192,6 @@ do { \ #define rdtscl(low) \ ((low) = (u32)native_read_tsc()) -#define rdtscll(val) \ - ((val) = native_read_tsc()) - #define rdtscp(low, high, aux) \ do { \ unsigned long long _val = native_read_tscp(&(aux)); \ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 3da1cc1..b488390 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -21,15 +21,12 @@ extern void disable_TSC(void); static inline cycles_t get_cycles(void) { - unsigned long long ret = 0; - #ifndef CONFIG_X86_TSC if (!cpu_has_tsc) return 0; #endif - rdtscll(ret); - return ret; + return native_read_tsc(); } extern void tsc_init(void); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 9fe111c..25efa53 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -263,7 +263,7 @@ static int apbt_clocksource_register(void) /* Verify whether apbt counter works */ t1 = dw_apb_clocksource_read(clocksource_apbt); - rdtscll(start); + start = native_read_tsc(); /* * We don't know the TSC frequency yet, but waiting for @@ -273,7 +273,7 @@ static int apbt_clocksource_register(void) */ do { rep_nop(); - rdtscll(now); + now = native_read_tsc(); } while ((now - start) < 200000UL); /* APBT is the only always on clocksource, it has to work! */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index dcb5285..51af1ed 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; - rdtscll(tsc); + tsc = native_read_tsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; } @@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) unsigned long pm = acpi_pm_read_early(); if (cpu_has_tsc) - rdtscll(tsc); + tsc = native_read_tsc(); switch (lapic_cal_loops++) { case 0: @@ -1209,7 +1209,7 @@ void setup_local_APIC(void) long long max_loops = cpu_khz ? cpu_khz : 1000000; if (cpu_has_tsc) - rdtscll(tsc); + tsc = native_read_tsc(); if (disable_apic) { disable_ioapic_support(); @@ -1293,7 +1293,7 @@ void setup_local_APIC(void) } if (queued) { if (cpu_has_tsc && cpu_khz) { - rdtscll(ntsc); + ntsc = native_read_tsc(); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else max_loops--; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index df919ff..a5283d2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -125,7 +125,7 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - rdtscll(m->tsc); + m->tsc = native_read_tsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -1784,7 +1784,7 @@ static void collect_tscs(void *data) { unsigned long *cpu_tsc = (unsigned long *)data; - rdtscll(cpu_tsc[smp_processor_id()]); + cpu_tsc[smp_processor_id()] = native_read_tsc(); } static int mce_apei_read_done; diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index f5d0730..334a2a9 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -110,7 +110,7 @@ static void init_espfix_random(void) */ if (!arch_get_random_long(&rand)) { /* The constant is an arbitrary large prime */ - rdtscll(rand); + rand = native_read_tsc(); rand *= 0xc345c6b72fd16123UL; } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 10757d0..cc390fe 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -735,7 +735,7 @@ static int hpet_clocksource_register(void) /* Verify whether hpet counter works */ t1 = hpet_readl(HPET_COUNTER); - rdtscll(start); + start = native_read_tsc(); /* * We don't know the TSC frequency yet, but waiting for @@ -745,7 +745,7 @@ static int hpet_clocksource_register(void) */ do { rep_nop(); - rdtscll(now); + now = native_read_tsc(); } while ((now - start) < 200000UL); if (t1 == hpet_readl(HPET_COUNTER)) { diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c index 25b9937..bd8f4d4 100644 --- a/arch/x86/kernel/trace_clock.c +++ b/arch/x86/kernel/trace_clock.c @@ -15,7 +15,7 @@ u64 notrace trace_clock_x86_tsc(void) u64 ret; rdtsc_barrier(); - rdtscll(ret); + ret = native_read_tsc(); return ret; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e7710cd..e66f5dc 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) data = cyc2ns_write_begin(cpu); - rdtscll(tsc_now); + tsc_now = native_read_tsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -290,7 +290,7 @@ u64 native_sched_clock(void) } /* read the Time Stamp Counter: */ - rdtscll(tsc_now); + tsc_now = native_read_tsc(); /* return the value in ns */ return cycles_2_ns(tsc_now); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e856dd5..4fa1cca 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void) { u64 host_tsc, tsc_offset; - rdtscll(host_tsc); + host_tsc = native_read_tsc(); tsc_offset = vmcs_read64(TSC_OFFSET); return host_tsc + tsc_offset; } diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 39d6a3d..9a52ad0 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -100,7 +100,7 @@ void use_tsc_delay(void) int read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { - rdtscll(*timer_val); + *timer_val = native_read_tsc(); return 0; } return -1; diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index 5820e85..ab13448 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, /* check result for the last window */ msr_now = pkg_state_counter(); - rdtscll(tsc_now); + tsc_now = native_read_tsc(); /* calculate pkg cstate vs tsc ratio */ if (!msr_last || !tsc_last) @@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy) u64 val64; msr_now = pkg_state_counter(); - rdtscll(tsc_now); + tsc_now = native_read_tsc(); jiffies_now = jiffies; /* calculate pkg cstate vs tsc ratio */ diff --git a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c index 5224ee5..f02b0c0 100644 --- a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c +++ b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c @@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void) printk(KERN_DEBUG "start--> \n"); then = read_pmtmr(); - rdtscll(then_tsc); + then_tsc = native_read_tsc(); for (i=0;i<20;i++) { mdelay(100); now = read_pmtmr(); - rdtscll(now_tsc); + now_tsc = native_read_tsc(); diff = (now - then) & 0xFFFFFF; diff_tsc = now_tsc - then_tsc; printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc); -- cgit v0.10.2 From ec69de52c648b1d9416a810943e68dbe9fe519f4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:43:59 +0200 Subject: x86/asm/tsc: Remove the rdtscp() and rdtscpll() macros They have no users. Leave native_read_tscp() which seems potentially useful despite also having no callers. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/6abfa3ef80534b5d73898a48c4d25e069303cbe5.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 7273b74..626f781 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -192,15 +192,6 @@ do { \ #define rdtscl(low) \ ((low) = (u32)native_read_tsc()) -#define rdtscp(low, high, aux) \ -do { \ - unsigned long long _val = native_read_tscp(&(aux)); \ - (low) = (u32)_val; \ - (high) = (u32)(_val >> 32); \ -} while (0) - -#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux)) - /* * 64-bit version of wrmsr_safe(): */ -- cgit v0.10.2 From 9cfa1a0279e22063a727fd204a75cf3672860d83 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:00 +0200 Subject: x86/asm/tsc: Use the full 64-bit TSC in delay_tsc() As a very minor optimization, delay_tsc() was only using the low 32 bits of the TSC. It's a delay function, so just use the whole thing. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/bd1a277c71321b67c4794970cb5ace05efe21ab6.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 9a52ad0..35115f3 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -49,16 +49,16 @@ static void delay_loop(unsigned long loops) /* TSC based delay: */ static void delay_tsc(unsigned long __loops) { - u32 bclock, now, loops = __loops; + u64 bclock, now, loops = __loops; int cpu; preempt_disable(); cpu = smp_processor_id(); rdtsc_barrier(); - rdtscl(bclock); + bclock = native_read_tsc(); for (;;) { rdtsc_barrier(); - rdtscl(now); + now = native_read_tsc(); if ((now - bclock) >= loops) break; @@ -80,7 +80,7 @@ static void delay_tsc(unsigned long __loops) loops -= (now - bclock); cpu = smp_processor_id(); rdtsc_barrier(); - rdtscl(bclock); + bclock = native_read_tsc(); } } preempt_enable(); -- cgit v0.10.2 From 3796366614598e48edf0561b86f18c230a7debc8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:01 +0200 Subject: x86/asm/tsc, x86/cpu/amd: Use the full 64-bit TSC to detect the 2.6.2 bug This code is timing 100k indirect calls, so the added overhead of counting the number of cycles elapsed as a 64-bit number should be insignificant. Drop the optimization of using a 32-bit count. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/d58f339a9c0dd8352b50d2f7a216f67ec2844f20.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index dd3a4ba..a69710d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -114,7 +114,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c) const int K6_BUG_LOOP = 1000000; int n; void (*f_vide)(void); - unsigned long d, d2; + u64 d, d2; printk(KERN_INFO "AMD K6 stepping B detected - "); @@ -125,10 +125,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c) n = K6_BUG_LOOP; f_vide = vide; - rdtscl(d); + d = native_read_tsc(); while (n--) f_vide(); - rdtscl(d2); + d2 = native_read_tsc(); d = d2-d; if (d > 20*K6_BUG_LOOP) -- cgit v0.10.2 From e18d1f8df176527332761ac29ee3097f8584c478 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:02 +0200 Subject: x86/asm/tsc, drivers/net/hamradio/baycom_epp: Replace rdtscl() with native_read_tsc() This is only used if BAYCOM_DEBUG is defined. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Acked-by: Thomas Sailer Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Cc: linux-hams@vger.kernel.org Link: http://lkml.kernel.org/r/1195ce0c7f34169ff3006341b77806184a46b9bf.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 83c7cce..44e5c3b 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -638,7 +638,7 @@ static int receive(struct net_device *dev, int cnt) #define GETTICK(x) \ ({ \ if (cpu_has_tsc) \ - rdtscl(x); \ + x = (unsigned int)native_read_tsc(); \ }) #else /* __i386__ */ #define GETTICK(x) -- cgit v0.10.2 From 3a2c16c8489d967de10b3b7f5cc0f7cab4337770 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:03 +0200 Subject: x86/asm/tsc, staging/lirc_serial: Remove TSC-based timing It wasn't compiled in by default. I suspect that the driver was and still is broken, though -- it's calling udelay with a parameter that's derived from loops_per_jiffy. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Greg Kroah-Hartman Cc: H. Peter Anvin Cc: Huang Rui Cc: Jarod Wilson Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: devel@driverdev.osuosl.org Cc: kvm ML Link: http://lkml.kernel.org/r/c95df47c5405b494d19d20b2852a9378c9f661f3.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/drivers/staging/media/lirc/lirc_serial.c b/drivers/staging/media/lirc/lirc_serial.c index dc79844..465796a 100644 --- a/drivers/staging/media/lirc/lirc_serial.c +++ b/drivers/staging/media/lirc/lirc_serial.c @@ -327,9 +327,6 @@ static void safe_udelay(unsigned long usecs) * time */ -/* So send_pulse can quickly convert microseconds to clocks */ -static unsigned long conv_us_to_clocks; - static int init_timing_params(unsigned int new_duty_cycle, unsigned int new_freq) { @@ -344,7 +341,6 @@ static int init_timing_params(unsigned int new_duty_cycle, /* How many clocks in a microsecond?, avoiding long long divide */ work = loops_per_sec; work *= 4295; /* 4295 = 2^32 / 1e6 */ - conv_us_to_clocks = work >> 32; /* * Carrier period in clocks, approach good up to 32GHz clock, @@ -357,10 +353,9 @@ static int init_timing_params(unsigned int new_duty_cycle, pulse_width = period * duty_cycle / 100; space_width = period - pulse_width; dprintk("in init_timing_params, freq=%d, duty_cycle=%d, " - "clk/jiffy=%ld, pulse=%ld, space=%ld, " - "conv_us_to_clocks=%ld\n", + "clk/jiffy=%ld, pulse=%ld, space=%ld\n", freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy), - pulse_width, space_width, conv_us_to_clocks); + pulse_width, space_width); return 0; } #else /* ! USE_RDTSC */ @@ -431,63 +426,14 @@ static long send_pulse_irdeo(unsigned long length) return ret; } -#ifdef USE_RDTSC -/* Version that uses Pentium rdtsc instruction to measure clocks */ - -/* - * This version does sub-microsecond timing using rdtsc instruction, - * and does away with the fudged LIRC_SERIAL_TRANSMITTER_LATENCY - * Implicitly i586 architecture... - Steve - */ - -static long send_pulse_homebrew_softcarrier(unsigned long length) -{ - int flag; - unsigned long target, start, now; - - /* Get going quick as we can */ - rdtscl(start); - on(); - /* Convert length from microseconds to clocks */ - length *= conv_us_to_clocks; - /* And loop till time is up - flipping at right intervals */ - now = start; - target = pulse_width; - flag = 1; - /* - * FIXME: This looks like a hard busy wait, without even an occasional, - * polite, cpu_relax() call. There's got to be a better way? - * - * The i2c code has the result of a lot of bit-banging work, I wonder if - * there's something there which could be helpful here. - */ - while ((now - start) < length) { - /* Delay till flip time */ - do { - rdtscl(now); - } while ((now - start) < target); - - /* flip */ - if (flag) { - rdtscl(now); - off(); - target += space_width; - } else { - rdtscl(now); on(); - target += pulse_width; - } - flag = !flag; - } - rdtscl(now); - return ((now - start) - length) / conv_us_to_clocks; -} -#else /* ! USE_RDTSC */ /* Version using udelay() */ /* * here we use fixed point arithmetic, with 8 * fractional bits. that gets us within 0.1% or so of the right average * frequency, albeit with some jitter in pulse length - Steve + * + * This should use ndelay instead. */ /* To match 8 fractional bits used for pulse/space length */ @@ -520,7 +466,6 @@ static long send_pulse_homebrew_softcarrier(unsigned long length) } return (actual-length) >> 8; } -#endif /* USE_RDTSC */ static long send_pulse_homebrew(unsigned long length) { -- cgit v0.10.2 From 016bfc449a88c833e949414a41748b359843dbb1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:04 +0200 Subject: x86/asm/tsc, input/joystick/analog: Switch from rdtscl() to native_read_tsc() This timing code is hideous, and this doesn't help. It gets rid of one of the last users of rdtscl(), though. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Acked-by: Dmitry Torokhov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Cc: linux-input@vger.kernel.org Link: http://lkml.kernel.org/r/90d19b3cea0e05ca6f333d1598daa38afb993260.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index 4284080..f871b4f 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -143,7 +143,7 @@ struct analog_port { #include -#define GET_TIME(x) do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0) +#define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)native_read_tsc(); else x = get_time_pit(); } while (0) #define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) #define TIME_NAME (cpu_has_tsc?"TSC":"PIT") static unsigned int get_time_pit(void) @@ -160,7 +160,7 @@ static unsigned int get_time_pit(void) return count; } #elif defined(__x86_64__) -#define GET_TIME(x) rdtscl(x) +#define GET_TIME(x) do { x = (unsigned int)native_read_tsc(); } while (0) #define DELTA(x,y) ((y)-(x)) #define TIME_NAME "TSC" #elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE) -- cgit v0.10.2 From 732f374ba50b64150bf954c2d4e9f6fae583cccf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:05 +0200 Subject: x86/asm/tsc, drivers/input/gameport: Replace rdtscl() with native_read_tsc() It's unclear to me why this code exists in the first place. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Acked-by: Dmitry Torokhov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Cc: linux-input@vger.kernel.org Link: http://lkml.kernel.org/r/9e058e72f4cf1f13c6483c1360b39c3d188a2c2a.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index e853a21..abc0cb2 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -149,9 +149,9 @@ static int old_gameport_measure_speed(struct gameport *gameport) for(i = 0; i < 50; i++) { local_irq_save(flags); - rdtscl(t1); + t1 = native_read_tsc(); for (t = 0; t < 50; t++) gameport_read(gameport); - rdtscl(t2); + t2 = native_read_tsc(); local_irq_restore(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; -- cgit v0.10.2 From fe47ae6e1a5005b2e82f7eab57b5c3820453293a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:06 +0200 Subject: x86/asm/tsc: Remove rdtscl() It has no more callers, and it was never a very sensible interface to begin with. Users of the TSC should either read all 64 bits or explicitly throw out the high bits. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/250105f7cee519be9d7fc4464b5784caafc8f4fe.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 626f781..c89ed6c 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -189,9 +189,6 @@ do { \ #endif /* !CONFIG_PARAVIRT */ -#define rdtscl(low) \ - ((low) = (u32)native_read_tsc()) - /* * 64-bit version of wrmsr_safe(): */ -- cgit v0.10.2 From 4ea1636b04dbd66536fa387bae2eea463efc705b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:07 +0200 Subject: x86/asm/tsc: Rename native_read_tsc() to rdtsc() Now that there is no paravirt TSC, the "native" is inappropriate. The function does RDTSC, so give it the obvious name: rdtsc(). Suggested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/fd43e16281991f096c1e4d21574d9e1402c62d39.1434501121.git.luto@kernel.org [ Ported it to v4.2-rc1. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c index ea33236..6a9b96b 100644 --- a/arch/x86/boot/compressed/aslr.c +++ b/arch/x86/boot/compressed/aslr.c @@ -82,7 +82,7 @@ static unsigned long get_random_long(void) if (has_cpuflag(X86_FEATURE_TSC)) { debug_putstr(" RDTSC"); - raw = native_read_tsc(); + raw = rdtsc(); random ^= raw; use_i8254 = false; diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 972b488..0340d93 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -186,7 +186,7 @@ notrace static cycle_t vread_tsc(void) * but no one has ever seen it happen. */ rdtsc_barrier(); - ret = (cycle_t)native_read_tsc(); + ret = (cycle_t)rdtsc(); last = gtod->cycle_last; diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index c89ed6c..ff0c120 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -109,7 +109,16 @@ notrace static inline int native_write_msr_safe(unsigned int msr, extern int rdmsr_safe_regs(u32 regs[8]); extern int wrmsr_safe_regs(u32 regs[8]); -static __always_inline unsigned long long native_read_tsc(void) +/** + * rdtsc() - returns the current TSC without ordering constraints + * + * rdtsc() returns the result of RDTSC as a 64-bit integer. The + * only ordering constraint it supplies is the ordering implied by + * "asm volatile": it will put the RDTSC in the place you expect. The + * CPU can and will speculatively execute that RDTSC, though, so the + * results can be non-monotonic if compared on different CPUs. + */ +static __always_inline unsigned long long rdtsc(void) { DECLARE_ARGS(val, low, high); diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 2bd69d6..5c490db 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) static __always_inline u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) { - u64 delta = native_read_tsc() - src->tsc_timestamp; + u64 delta = rdtsc() - src->tsc_timestamp; return pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); } diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index bc5fa2a..58505f0 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void) * on during the bootup the random pool has true entropy too. */ get_random_bytes(&canary, sizeof(canary)); - tsc = native_read_tsc(); + tsc = rdtsc(); canary += tsc + (tsc << 32UL); current->stack_canary = canary; diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index b488390..3df7675 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -26,7 +26,7 @@ static inline cycles_t get_cycles(void) return 0; #endif - return native_read_tsc(); + return rdtsc(); } extern void tsc_init(void); diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index 25efa53..222a570 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -263,7 +263,7 @@ static int apbt_clocksource_register(void) /* Verify whether apbt counter works */ t1 = dw_apb_clocksource_read(clocksource_apbt); - start = native_read_tsc(); + start = rdtsc(); /* * We don't know the TSC frequency yet, but waiting for @@ -273,7 +273,7 @@ static int apbt_clocksource_register(void) */ do { rep_nop(); - now = native_read_tsc(); + now = rdtsc(); } while ((now - start) < 200000UL); /* APBT is the only always on clocksource, it has to work! */ @@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void) old = dw_apb_clocksource_read(clocksource_apbt); old += loop; - t1 = native_read_tsc(); + t1 = rdtsc(); do { new = dw_apb_clocksource_read(clocksource_apbt); } while (new < old); - t2 = native_read_tsc(); + t2 = rdtsc(); shift = 5; if (unlikely(loop >> shift == 0)) { diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 51af1ed..0d71cd9 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta, { u64 tsc; - tsc = native_read_tsc(); + tsc = rdtsc(); wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; } @@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev) unsigned long pm = acpi_pm_read_early(); if (cpu_has_tsc) - tsc = native_read_tsc(); + tsc = rdtsc(); switch (lapic_cal_loops++) { case 0: @@ -1209,7 +1209,7 @@ void setup_local_APIC(void) long long max_loops = cpu_khz ? cpu_khz : 1000000; if (cpu_has_tsc) - tsc = native_read_tsc(); + tsc = rdtsc(); if (disable_apic) { disable_ioapic_support(); @@ -1293,7 +1293,7 @@ void setup_local_APIC(void) } if (queued) { if (cpu_has_tsc && cpu_khz) { - ntsc = native_read_tsc(); + ntsc = rdtsc(); max_loops = (cpu_khz << 10) - (ntsc - tsc); } else max_loops--; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a69710d..51ad2af 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -125,10 +125,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c) n = K6_BUG_LOOP; f_vide = vide; - d = native_read_tsc(); + d = rdtsc(); while (n--) f_vide(); - d2 = native_read_tsc(); + d2 = rdtsc(); d = d2-d; if (d > 20*K6_BUG_LOOP) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index a5283d2..96ccecc 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -125,7 +125,7 @@ void mce_setup(struct mce *m) { memset(m, 0, sizeof(struct mce)); m->cpu = m->extcpu = smp_processor_id(); - m->tsc = native_read_tsc(); + m->tsc = rdtsc(); /* We hope get_seconds stays lockless */ m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; @@ -1784,7 +1784,7 @@ static void collect_tscs(void *data) { unsigned long *cpu_tsc = (unsigned long *)data; - cpu_tsc[smp_processor_id()] = native_read_tsc(); + cpu_tsc[smp_processor_id()] = rdtsc(); } static int mce_apei_read_done; diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 334a2a9..67315cd 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -110,7 +110,7 @@ static void init_espfix_random(void) */ if (!arch_get_random_long(&rand)) { /* The constant is an arbitrary large prime */ - rand = native_read_tsc(); + rand = rdtsc(); rand *= 0xc345c6b72fd16123UL; } diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index cc390fe..f75c590 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -735,7 +735,7 @@ static int hpet_clocksource_register(void) /* Verify whether hpet counter works */ t1 = hpet_readl(HPET_COUNTER); - start = native_read_tsc(); + start = rdtsc(); /* * We don't know the TSC frequency yet, but waiting for @@ -745,7 +745,7 @@ static int hpet_clocksource_register(void) */ do { rep_nop(); - now = native_read_tsc(); + now = rdtsc(); } while ((now - start) < 200000UL); if (t1 == hpet_readl(HPET_COUNTER)) { diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c index bd8f4d4..67efb8c 100644 --- a/arch/x86/kernel/trace_clock.c +++ b/arch/x86/kernel/trace_clock.c @@ -15,7 +15,7 @@ u64 notrace trace_clock_x86_tsc(void) u64 ret; rdtsc_barrier(); - ret = native_read_tsc(); + ret = rdtsc(); return ret; } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index e66f5dc..21d6e04 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) data = cyc2ns_write_begin(cpu); - tsc_now = native_read_tsc(); + tsc_now = rdtsc(); ns_now = cycles_2_ns(tsc_now); /* @@ -290,7 +290,7 @@ u64 native_sched_clock(void) } /* read the Time Stamp Counter: */ - tsc_now = native_read_tsc(); + tsc_now = rdtsc(); /* return the value in ns */ return cycles_2_ns(tsc_now); diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 954e98a..2f0ade4 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1172,7 +1172,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); /* __delay is delay_tsc whenever the hardware has TSC, thus always. */ @@ -1240,7 +1240,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc()); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 602b974..8dfbad7 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1080,7 +1080,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { u64 tsc; - tsc = svm_scale_tsc(vcpu, native_read_tsc()); + tsc = svm_scale_tsc(vcpu, rdtsc()); return target_tsc - tsc; } @@ -3079,7 +3079,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_TSC: { msr_info->data = svm->vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, native_read_tsc()); + svm_scale_tsc(vcpu, rdtsc()); break; } diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 4fa1cca..10d69a6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void) { u64 host_tsc, tsc_offset; - host_tsc = native_read_tsc(); + host_tsc = rdtsc(); tsc_offset = vmcs_read64(TSC_OFFSET); return host_tsc + tsc_offset; } @@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) { - return target_tsc - native_read_tsc(); + return target_tsc - rdtsc(); } static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f771058..dfa9713 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1455,7 +1455,7 @@ static cycle_t read_tsc(void) * but no one has ever seen it happen. */ rdtsc_barrier(); - ret = (cycle_t)native_read_tsc(); + ret = (cycle_t)rdtsc(); last = pvclock_gtod_data.clock.cycle_last; @@ -1646,7 +1646,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 1; } if (!use_master_clock) { - host_tsc = native_read_tsc(); + host_tsc = rdtsc(); kernel_ns = get_kernel_ns(); } @@ -2810,7 +2810,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : - native_read_tsc() - vcpu->arch.last_host_tsc; + rdtsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { @@ -2838,7 +2838,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = native_read_tsc(); + vcpu->arch.last_host_tsc = rdtsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -6623,7 +6623,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) hw_breakpoint_restore(); vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, - native_read_tsc()); + rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -7437,7 +7437,7 @@ int kvm_arch_hardware_enable(void) if (ret != 0) return ret; - local_tsc = native_read_tsc(); + local_tsc = rdtsc(); stable = !check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 35115f3..f24bc59 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -55,10 +55,10 @@ static void delay_tsc(unsigned long __loops) preempt_disable(); cpu = smp_processor_id(); rdtsc_barrier(); - bclock = native_read_tsc(); + bclock = rdtsc(); for (;;) { rdtsc_barrier(); - now = native_read_tsc(); + now = rdtsc(); if ((now - bclock) >= loops) break; @@ -80,7 +80,7 @@ static void delay_tsc(unsigned long __loops) loops -= (now - bclock); cpu = smp_processor_id(); rdtsc_barrier(); - bclock = native_read_tsc(); + bclock = rdtsc(); } } preempt_enable(); @@ -100,7 +100,7 @@ void use_tsc_delay(void) int read_current_timer(unsigned long *timer_val) { if (delay_fn == delay_tsc) { - *timer_val = native_read_tsc(); + *timer_val = rdtsc(); return 0; } return -1; diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index 15ada47..7c56d7e 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -765,7 +765,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu) local_irq_save(flags); rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); - tsc = native_read_tsc(); + tsc = rdtsc(); local_irq_restore(flags); cpu->last_sample_time = cpu->sample.time; diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c index abc0cb2..4a2a9e3 100644 --- a/drivers/input/gameport/gameport.c +++ b/drivers/input/gameport/gameport.c @@ -149,9 +149,9 @@ static int old_gameport_measure_speed(struct gameport *gameport) for(i = 0; i < 50; i++) { local_irq_save(flags); - t1 = native_read_tsc(); + t1 = rdtsc(); for (t = 0; t < 50; t++) gameport_read(gameport); - t2 = native_read_tsc(); + t2 = rdtsc(); local_irq_restore(flags); udelay(i * 10); if (t2 - t1 < tx) tx = t2 - t1; diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c index f871b4f..6f8b084 100644 --- a/drivers/input/joystick/analog.c +++ b/drivers/input/joystick/analog.c @@ -143,7 +143,7 @@ struct analog_port { #include -#define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)native_read_tsc(); else x = get_time_pit(); } while (0) +#define GET_TIME(x) do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0) #define DELTA(x,y) (cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0))) #define TIME_NAME (cpu_has_tsc?"TSC":"PIT") static unsigned int get_time_pit(void) @@ -160,7 +160,7 @@ static unsigned int get_time_pit(void) return count; } #elif defined(__x86_64__) -#define GET_TIME(x) do { x = (unsigned int)native_read_tsc(); } while (0) +#define GET_TIME(x) do { x = (unsigned int)rdtsc(); } while (0) #define DELTA(x,y) ((y)-(x)) #define TIME_NAME "TSC" #elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE) diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c index 44e5c3b..72c9f1f 100644 --- a/drivers/net/hamradio/baycom_epp.c +++ b/drivers/net/hamradio/baycom_epp.c @@ -638,7 +638,7 @@ static int receive(struct net_device *dev, int cnt) #define GETTICK(x) \ ({ \ if (cpu_has_tsc) \ - x = (unsigned int)native_read_tsc(); \ + x = (unsigned int)rdtsc(); \ }) #else /* __i386__ */ #define GETTICK(x) diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c index ab13448..2ac0c70 100644 --- a/drivers/thermal/intel_powerclamp.c +++ b/drivers/thermal/intel_powerclamp.c @@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio, /* check result for the last window */ msr_now = pkg_state_counter(); - tsc_now = native_read_tsc(); + tsc_now = rdtsc(); /* calculate pkg cstate vs tsc ratio */ if (!msr_last || !tsc_last) @@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy) u64 val64; msr_now = pkg_state_counter(); - tsc_now = native_read_tsc(); + tsc_now = rdtsc(); jiffies_now = jiffies; /* calculate pkg cstate vs tsc ratio */ diff --git a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c index f02b0c0..6ff8383 100644 --- a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c +++ b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c @@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void) printk(KERN_DEBUG "start--> \n"); then = read_pmtmr(); - then_tsc = native_read_tsc(); + then_tsc = rdtsc(); for (i=0;i<20;i++) { mdelay(100); now = read_pmtmr(); - now_tsc = native_read_tsc(); + now_tsc = rdtsc(); diff = (now - then) & 0xFFFFFF; diff_tsc = now_tsc - then_tsc; printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc); -- cgit v0.10.2 From 03b9730b769fc4d87e40f6104f4c5b2e43889f19 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:08 +0200 Subject: x86/asm/tsc: Add rdtsc_ordered() and use it in trivial call sites rdtsc_barrier(); rdtsc() is an unnecessary mouthful and requires more thought than should be necessary. Add an rdtsc_ordered() helper and replace the trivial call sites with it. This should not change generated code. The duplication of the fence asm is temporary. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/dddbf98a2af53312e9aa73a5a2b1622fe5d6f52b.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 0340d93..ca94fa6 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode) notrace static cycle_t vread_tsc(void) { - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)rdtsc(); - - last = gtod->cycle_last; + cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 last = gtod->cycle_last; if (likely(ret >= last)) return ret; diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index ff0c120..02bdd6c 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -127,6 +127,32 @@ static __always_inline unsigned long long rdtsc(void) return EAX_EDX_VAL(val, low, high); } +/** + * rdtsc_ordered() - read the current TSC in program order + * + * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer. + * It is ordered like a load to a global in-memory counter. It should + * be impossible to observe non-monotonic rdtsc_unordered() behavior + * across multiple CPUs as long as the TSC is synced. + */ +static __always_inline unsigned long long rdtsc_ordered(void) +{ + /* + * The RDTSC instruction is not ordered relative to memory + * access. The Intel SDM and the AMD APM are both vague on this + * point, but empirically an RDTSC instruction can be + * speculatively executed before prior loads. An RDTSC + * immediately after an appropriate barrier appears to be + * ordered as a normal load, that is, it provides the same + * ordering guarantees as reading from a global memory location + * that some other imaginary CPU is updating continuously with a + * time stamp. + */ + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); + return rdtsc(); +} + static inline unsigned long long native_read_pmc(int counter) { DECLARE_ARGS(val, low, high); diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c index 67efb8c..80bb24d 100644 --- a/arch/x86/kernel/trace_clock.c +++ b/arch/x86/kernel/trace_clock.c @@ -12,10 +12,5 @@ */ u64 notrace trace_clock_x86_tsc(void) { - u64 ret; - - rdtsc_barrier(); - ret = rdtsc(); - - return ret; + return rdtsc_ordered(); } diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index dfa9713..8d73ec8 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1444,20 +1444,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc); static cycle_t read_tsc(void) { - cycle_t ret; - u64 last; - - /* - * Empirically, a fence (of type that depends on the CPU) - * before rdtsc is enough to ensure that rdtsc is ordered - * with respect to loads. The various CPU manuals are unclear - * as to whether rdtsc can be reordered with later loads, - * but no one has ever seen it happen. - */ - rdtsc_barrier(); - ret = (cycle_t)rdtsc(); - - last = pvclock_gtod_data.clock.cycle_last; + cycle_t ret = (cycle_t)rdtsc_ordered(); + u64 last = pvclock_gtod_data.clock.cycle_last; if (likely(ret >= last)) return ret; diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index f24bc59..4453d52 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -54,11 +54,9 @@ static void delay_tsc(unsigned long __loops) preempt_disable(); cpu = smp_processor_id(); - rdtsc_barrier(); - bclock = rdtsc(); + bclock = rdtsc_ordered(); for (;;) { - rdtsc_barrier(); - now = rdtsc(); + now = rdtsc_ordered(); if ((now - bclock) >= loops) break; @@ -79,8 +77,7 @@ static void delay_tsc(unsigned long __loops) if (unlikely(cpu != smp_processor_id())) { loops -= (now - bclock); cpu = smp_processor_id(); - rdtsc_barrier(); - bclock = rdtsc(); + bclock = rdtsc_ordered(); } } preempt_enable(); -- cgit v0.10.2 From eee6946e44510b61c35cf754f5505537c7a8eb77 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:09 +0200 Subject: x86/asm/tsc/sync: Use rdtsc_ordered() in check_tsc_warp() and drop extra barriers Using get_cycles was unnecessary: check_tsc_warp() is not called on TSC-less systems. Replace rdtsc_barrier(); get_cycles() with rdtsc_ordered(). While we're at it, make the somewhat more dangerous change of removing barrier_before_rdtsc after RDTSC in the TSC warp check code. This should be okay, though -- the vDSO TSC code doesn't have that barrier, so, if removing the barrier from the warp check would cause us to detect a warp that we otherwise wouldn't detect, then we have a genuine bug. Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/387c4c3a75f875bcde6cd68cee013273a744f364.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index dd8d079..78083bf 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -39,16 +39,15 @@ static cycles_t max_warp; static int nr_warps; /* - * TSC-warp measurement loop running on both CPUs: + * TSC-warp measurement loop running on both CPUs. This is not called + * if there is no TSC. */ static void check_tsc_warp(unsigned int timeout) { cycles_t start, now, prev, end; int i; - rdtsc_barrier(); - start = get_cycles(); - rdtsc_barrier(); + start = rdtsc_ordered(); /* * The measurement runs for 'timeout' msecs: */ @@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout) */ arch_spin_lock(&sync_lock); prev = last_tsc; - rdtsc_barrier(); - now = get_cycles(); - rdtsc_barrier(); + now = rdtsc_ordered(); last_tsc = now; arch_spin_unlock(&sync_lock); @@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu) /* * No need to check if we already know that the TSC is not - * synchronized: + * synchronized or if we have no TSC. */ if (unsynchronized_tsc()) return; @@ -190,6 +187,7 @@ void check_tsc_sync_target(void) { int cpus = 2; + /* Also aborts if there is no TSC. */ if (unsynchronized_tsc() || tsc_clocksource_reliable) return; -- cgit v0.10.2 From 27c634054a3155e1d9a02f0e362e4f4ff8d28ee7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:10 +0200 Subject: x86/asm/tsc: Use rdtsc_ordered() in read_tsc() instead of get_cycles() There are two logical changes here. First, this removes a check for cpu_has_tsc. That check is unnecessary, as we don't register the TSC as a clocksource on systems that have no TSC. Second, it adds a barrier, thus preventing observable non-monotonicity. I suspect that the missing barrier was never a problem in practice because system calls themselves were heavy enough barriers to prevent user code from observing time warps due to speculation. (Without the corresponding barrier in the vDSO, however, non-monotonicity is easy to detect.) Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/c6ff621a053127a65b70f175443578db7a0711be.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 21d6e04..451bade0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -961,7 +961,7 @@ static struct clocksource clocksource_tsc; */ static cycle_t read_tsc(struct clocksource *cs) { - return (cycle_t)get_cycles(); + return (cycle_t)rdtsc_ordered(); } /* -- cgit v0.10.2 From 502dfeff239e8313bfbe906ca0a1a6827ac8481b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:11 +0200 Subject: x86/asm/tsc, x86/kvm: Drop open-coded barrier and use rdtsc_ordered() in kvmclock __pvclock_read_cycles() used to have two barriers, one of which was unnecessary, which got removed after an initial version of this patch was sent. But the barrier is still open-coded unnecessarily - get rid of that barrier and clean up the code by just using rdtsc_ordered(). Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Marcelo Tosatti Cc: Paolo Bonzini Cc: Peter Zijlstra Cc: Radim Krcmar Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/678981cc4761fb38a793c217c9cac42503cf3719.1434501121.git.luto@kernel.org [ Ported it to v4.2-rc1. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 5c490db..7a6bed5 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift) static __always_inline u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src) { - u64 delta = rdtsc() - src->tsc_timestamp; + u64 delta = rdtsc_ordered() - src->tsc_timestamp; return pvclock_scale_delta(delta, src->tsc_to_system_mul, src->tsc_shift); } @@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u8 ret_flags; version = src->version; - /* Note: emulated platforms which do not advertise SSE2 support - * result in kvmclock not using the necessary RDTSC barriers. - * Without barriers, it is possible that RDTSC instruction reads from - * the time stamp counter outside rdtsc_barrier protected section - * below, resulting in violation of monotonicity. - */ - rdtsc_barrier(); + offset = pvclock_get_nsec_offset(src); ret = src->system_time + offset; ret_flags = src->flags; -- cgit v0.10.2 From bb8dd96032fc63babfc8b378a37dd7681eeec326 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 25 Jun 2015 18:44:12 +0200 Subject: x86/asm/tsc: Remove rdtsc_barrier() All callers have been converted to rdtsc_ordered(). Signed-off-by: Andy Lutomirski Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Richard Weinberger Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/9baa4ae9a1e7c7c282f9cb2f15bb6bf5c2004032.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index e51a8f8..818cb87 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -91,15 +91,4 @@ do { \ #define smp_mb__before_atomic() barrier() #define smp_mb__after_atomic() barrier() -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - */ -static __always_inline void rdtsc_barrier(void) -{ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); -} - #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index b9531d3..755481f 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -45,17 +45,4 @@ #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) - */ -static inline void rdtsc_barrier(void) -{ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); -} - #endif -- cgit v0.10.2 From 5a33fcb8d991209bac0a266ab499e4b53d116cdd Mon Sep 17 00:00:00 2001 From: George Spelvin Date: Thu, 25 Jun 2015 18:44:13 +0200 Subject: x86/asm/tsc: Save an instruction in DECLARE_ARGS users Before, the code to do RDTSC looked like: rdtsc shl $0x20, %rdx mov %eax, %eax or %rdx, %rax The "mov %eax, %eax" is required to clear the high 32 bits of RAX. By declaring low and high as 64-bit variables, the code is simplified to: rdtsc shl $0x20,%rdx or %rdx,%rax Yes, it's a 2-byte instruction that's not on a critical path, but there are principles to be upheld. Every user of EAX_EDX_RET has been checked. I tried to check users of EAX_EDX_ARGS, but there weren't any, so I deleted it to be safe. ( There's no benefit to making "high" 64 bits, but it was the simplest way to proceed. ) Signed-off-by: George Spelvin Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: jacob.jun.pan@linux.intel.com Link: http://lkml.kernel.org/r/20150618075906.4615.qmail@ns.horizon.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 02bdd6c..131eec2 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -47,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux) * it means rax *or* rdx. */ #ifdef CONFIG_X86_64 -#define DECLARE_ARGS(val, low, high) unsigned low, high -#define EAX_EDX_VAL(val, low, high) ((low) | ((u64)(high) << 32)) -#define EAX_EDX_ARGS(val, low, high) "a" (low), "d" (high) +/* Using 64-bit values saves one instruction clearing the high half of low */ +#define DECLARE_ARGS(val, low, high) unsigned long low, high +#define EAX_EDX_VAL(val, low, high) ((low) | (high) << 32) #define EAX_EDX_RET(val, low, high) "=a" (low), "=d" (high) #else #define DECLARE_ARGS(val, low, high) unsigned long long val #define EAX_EDX_VAL(val, low, high) (val) -#define EAX_EDX_ARGS(val, low, high) "A" (val) #define EAX_EDX_RET(val, low, high) "=A" (val) #endif -- cgit v0.10.2 From c0bfd26e136cafc2b23c16225b4d7b1e14de81c1 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:10 -0400 Subject: x86/compat: Move copy_siginfo_*_user32() to signal_compat.c copy_siginfo_to_user32() and copy_siginfo_from_user32() are used by both the 32-bit compat and x32 ABIs. Move them to signal_compat.c. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index ae3a29a..a0a19b7 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -34,99 +34,6 @@ #include #include -int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) -{ - int err = 0; - bool ia32 = test_thread_flag(TIF_IA32); - - if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) - return -EFAULT; - - put_user_try { - /* If you change siginfo_t structure, please make sure that - this code is fixed accordingly. - It should never copy any pad contained in the structure - to avoid security leaks, but must copy the generic - 3 ints plus the relevant union member. */ - put_user_ex(from->si_signo, &to->si_signo); - put_user_ex(from->si_errno, &to->si_errno); - put_user_ex((short)from->si_code, &to->si_code); - - if (from->si_code < 0) { - put_user_ex(from->si_pid, &to->si_pid); - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); - } else { - /* - * First 32bits of unions are always present: - * si_pid === si_band === si_tid === si_addr(LS half) - */ - put_user_ex(from->_sifields._pad[0], - &to->_sifields._pad[0]); - switch (from->si_code >> 16) { - case __SI_FAULT >> 16: - break; - case __SI_SYS >> 16: - put_user_ex(from->si_syscall, &to->si_syscall); - put_user_ex(from->si_arch, &to->si_arch); - break; - case __SI_CHLD >> 16: - if (ia32) { - put_user_ex(from->si_utime, &to->si_utime); - put_user_ex(from->si_stime, &to->si_stime); - } else { - put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); - put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); - } - put_user_ex(from->si_status, &to->si_status); - /* FALL THROUGH */ - default: - case __SI_KILL >> 16: - put_user_ex(from->si_uid, &to->si_uid); - break; - case __SI_POLL >> 16: - put_user_ex(from->si_fd, &to->si_fd); - break; - case __SI_TIMER >> 16: - put_user_ex(from->si_overrun, &to->si_overrun); - put_user_ex(ptr_to_compat(from->si_ptr), - &to->si_ptr); - break; - /* This is not generated by the kernel as of now. */ - case __SI_RT >> 16: - case __SI_MESGQ >> 16: - put_user_ex(from->si_uid, &to->si_uid); - put_user_ex(from->si_int, &to->si_int); - break; - } - } - } put_user_catch(err); - - return err; -} - -int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) -{ - int err = 0; - u32 ptr32; - - if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) - return -EFAULT; - - get_user_try { - get_user_ex(to->si_signo, &from->si_signo); - get_user_ex(to->si_errno, &from->si_errno); - get_user_ex(to->si_code, &from->si_code); - - get_user_ex(to->si_pid, &from->si_pid); - get_user_ex(to->si_uid, &from->si_uid); - get_user_ex(ptr32, &from->si_ptr); - to->si_ptr = compat_ptr(ptr32); - } get_user_catch(err); - - return err; -} - /* * Do a signal return; undo the signal stack. */ diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 0f15af4..dc19730 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -23,6 +23,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o +obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c new file mode 100644 index 0000000..dc3c0b1 --- /dev/null +++ b/arch/x86/kernel/signal_compat.c @@ -0,0 +1,95 @@ +#include +#include + +int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) +{ + int err = 0; + bool ia32 = test_thread_flag(TIF_IA32); + + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) + return -EFAULT; + + put_user_try { + /* If you change siginfo_t structure, please make sure that + this code is fixed accordingly. + It should never copy any pad contained in the structure + to avoid security leaks, but must copy the generic + 3 ints plus the relevant union member. */ + put_user_ex(from->si_signo, &to->si_signo); + put_user_ex(from->si_errno, &to->si_errno); + put_user_ex((short)from->si_code, &to->si_code); + + if (from->si_code < 0) { + put_user_ex(from->si_pid, &to->si_pid); + put_user_ex(from->si_uid, &to->si_uid); + put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr); + } else { + /* + * First 32bits of unions are always present: + * si_pid === si_band === si_tid === si_addr(LS half) + */ + put_user_ex(from->_sifields._pad[0], + &to->_sifields._pad[0]); + switch (from->si_code >> 16) { + case __SI_FAULT >> 16: + break; + case __SI_SYS >> 16: + put_user_ex(from->si_syscall, &to->si_syscall); + put_user_ex(from->si_arch, &to->si_arch); + break; + case __SI_CHLD >> 16: + if (ia32) { + put_user_ex(from->si_utime, &to->si_utime); + put_user_ex(from->si_stime, &to->si_stime); + } else { + put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime); + put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime); + } + put_user_ex(from->si_status, &to->si_status); + /* FALL THROUGH */ + default: + case __SI_KILL >> 16: + put_user_ex(from->si_uid, &to->si_uid); + break; + case __SI_POLL >> 16: + put_user_ex(from->si_fd, &to->si_fd); + break; + case __SI_TIMER >> 16: + put_user_ex(from->si_overrun, &to->si_overrun); + put_user_ex(ptr_to_compat(from->si_ptr), + &to->si_ptr); + break; + /* This is not generated by the kernel as of now. */ + case __SI_RT >> 16: + case __SI_MESGQ >> 16: + put_user_ex(from->si_uid, &to->si_uid); + put_user_ex(from->si_int, &to->si_int); + break; + } + } + } put_user_catch(err); + + return err; +} + +int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) +{ + int err = 0; + u32 ptr32; + + if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t))) + return -EFAULT; + + get_user_try { + get_user_ex(to->si_signo, &from->si_signo); + get_user_ex(to->si_errno, &from->si_errno); + get_user_ex(to->si_code, &from->si_code); + + get_user_ex(to->si_pid, &from->si_pid); + get_user_ex(to->si_uid, &from->si_uid); + get_user_ex(ptr32, &from->si_ptr); + to->si_ptr = compat_ptr(ptr32); + } get_user_catch(err); + + return err; +} -- cgit v0.10.2 From b2e02b820d5b42479195b89d3d73f31bcedb264e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:11 -0400 Subject: x86/compat: Make mmap_is_ia32() common compat TIF_ADDR32 is set for both ia32 and x32 tasks, so change from CONFIG_IA32_EMULATION to CONFIG_COMPAT. Use config_enabled() to make the function more readable. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-3-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index f161c18..180b6fe 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm, */ static inline int mmap_is_ia32(void) { -#ifdef CONFIG_X86_32 - return 1; -#endif -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_ADDR32)) - return 1; -#endif - return 0; + return config_enabled(CONFIG_X86_32) || + (config_enabled(CONFIG_COMPAT) && + test_thread_flag(TIF_ADDR32)); } /* Do not change the values. See get_align_mask() */ -- cgit v0.10.2 From b829d1be20ab51a3b76ec003118c9260d1fa424e Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:12 -0400 Subject: x86/compat: Move ucontext_x32 to sigframe.h ia32.h should only contain the code for 32-bit compatability. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h index d0e8e01..2801976 100644 --- a/arch/x86/include/asm/ia32.h +++ b/arch/x86/include/asm/ia32.h @@ -22,15 +22,6 @@ struct ucontext_ia32 { compat_sigset_t uc_sigmask; /* mask last for extensibility */ }; -struct ucontext_x32 { - unsigned int uc_flags; - unsigned int uc_link; - compat_stack_t uc_stack; - unsigned int uc__pad0; /* needed for alignment */ - struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ - compat_sigset_t uc_sigmask; /* mask last for extensibility */ -}; - /* This matches struct stat64 in glibc2.2, hence the absolutely * insane amounts of padding around dev_t's. */ diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h index 7c7c27c..1f3175b 100644 --- a/arch/x86/include/asm/sigframe.h +++ b/arch/x86/include/asm/sigframe.h @@ -4,6 +4,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 #define sigframe_ia32 sigframe @@ -69,6 +70,15 @@ struct rt_sigframe { #ifdef CONFIG_X86_X32_ABI +struct ucontext_x32 { + unsigned int uc_flags; + unsigned int uc_link; + compat_stack_t uc_stack; + unsigned int uc__pad0; /* needed for alignment */ + struct sigcontext uc_mcontext; /* the 64-bit sigcontext type */ + compat_sigset_t uc_sigmask; /* mask last for extensibility */ +}; + struct rt_sigframe_x32 { u64 pretcode; struct ucontext_x32 uc; -- cgit v0.10.2 From 7da770785f9740af1cb24b8fd63075543bd00711 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:13 -0400 Subject: x86/compat: Rename 'start_thread_ia32' to 'compat_start_thread' This function is shared between the 32-bit compat and x32 ABIs. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-5-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 180b6fe..2bf67c0 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t, #define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ elf_common_init(¤t->thread, regs, __USER_DS) -void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp); -#define compat_start_thread start_thread_ia32 +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp); +#define compat_start_thread compat_start_thread void set_personality_ia32(bool); #define COMPAT_SET_PERSONALITY(ex) \ diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 71d7849..0831ba3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -248,8 +248,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) __USER_CS, __USER_DS, 0); } -#ifdef CONFIG_IA32_EMULATION -void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp) +#ifdef CONFIG_COMPAT +void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp) { start_thread_common(regs, new_ip, new_sp, test_thread_flag(TIF_X32) -- cgit v0.10.2 From 601275c3e04c43b3b34237ab36c27fc1cfb8a189 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:14 -0400 Subject: x86/compat: Factor out ia32 compat code from compat_arch_ptrace() Move the ia32-specific code in compat_arch_ptrace() into its own function. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-6-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 9be72bc..7155957 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1123,6 +1123,73 @@ static int genregs32_set(struct task_struct *target, return ret; } +static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request, + compat_ulong_t caddr, compat_ulong_t cdata) +{ + unsigned long addr = caddr; + unsigned long data = cdata; + void __user *datap = compat_ptr(data); + int ret; + __u32 val; + + switch (request) { + case PTRACE_PEEKUSR: + ret = getreg32(child, addr, &val); + if (ret == 0) + ret = put_user(val, (__u32 __user *)datap); + break; + + case PTRACE_POKEUSR: + ret = putreg32(child, addr, data); + break; + + case PTRACE_GETREGS: /* Get all gp regs from the child. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_GENERAL, + 0, sizeof(struct user_regs_struct32), + datap); + + case PTRACE_SETREGS: /* Set all gp regs in the child. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_GENERAL, 0, + sizeof(struct user_regs_struct32), + datap); + + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_FP, 0, + sizeof(struct user_i387_ia32_struct), + datap); + + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + return copy_regset_from_user( + child, &user_x86_32_view, REGSET_FP, + 0, sizeof(struct user_i387_ia32_struct), datap); + + case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ + return copy_regset_to_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ + return copy_regset_from_user(child, &user_x86_32_view, + REGSET_XFP, 0, + sizeof(struct user32_fxsr_struct), + datap); + + case PTRACE_GET_THREAD_AREA: + case PTRACE_SET_THREAD_AREA: + return arch_ptrace(child, request, addr, data); + + default: + return compat_ptrace_request(child, request, addr, data); + } + + return ret; +} +#endif /* CONFIG_IA32_EMULATION */ + #ifdef CONFIG_X86_X32_ABI static long x32_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, @@ -1211,78 +1278,21 @@ static long x32_arch_ptrace(struct task_struct *child, } #endif +#ifdef CONFIG_COMPAT long compat_arch_ptrace(struct task_struct *child, compat_long_t request, compat_ulong_t caddr, compat_ulong_t cdata) { - unsigned long addr = caddr; - unsigned long data = cdata; - void __user *datap = compat_ptr(data); - int ret; - __u32 val; - #ifdef CONFIG_X86_X32_ABI if (!is_ia32_task()) return x32_arch_ptrace(child, request, caddr, cdata); #endif - - switch (request) { - case PTRACE_PEEKUSR: - ret = getreg32(child, addr, &val); - if (ret == 0) - ret = put_user(val, (__u32 __user *)datap); - break; - - case PTRACE_POKEUSR: - ret = putreg32(child, addr, data); - break; - - case PTRACE_GETREGS: /* Get all gp regs from the child. */ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_GENERAL, - 0, sizeof(struct user_regs_struct32), - datap); - - case PTRACE_SETREGS: /* Set all gp regs in the child. */ - return copy_regset_from_user(child, &user_x86_32_view, - REGSET_GENERAL, 0, - sizeof(struct user_regs_struct32), - datap); - - case PTRACE_GETFPREGS: /* Get the child FPU state. */ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_FP, 0, - sizeof(struct user_i387_ia32_struct), - datap); - - case PTRACE_SETFPREGS: /* Set the child FPU state. */ - return copy_regset_from_user( - child, &user_x86_32_view, REGSET_FP, - 0, sizeof(struct user_i387_ia32_struct), datap); - - case PTRACE_GETFPXREGS: /* Get the child extended FPU state. */ - return copy_regset_to_user(child, &user_x86_32_view, - REGSET_XFP, 0, - sizeof(struct user32_fxsr_struct), - datap); - - case PTRACE_SETFPXREGS: /* Set the child extended FPU state. */ - return copy_regset_from_user(child, &user_x86_32_view, - REGSET_XFP, 0, - sizeof(struct user32_fxsr_struct), - datap); - - case PTRACE_GET_THREAD_AREA: - case PTRACE_SET_THREAD_AREA: - return arch_ptrace(child, request, addr, data); - - default: - return compat_ptrace_request(child, request, addr, data); - } - - return ret; +#ifdef CONFIG_IA32_EMULATION + return ia32_arch_ptrace(child, request, caddr, cdata); +#else + return 0; +#endif } - -#endif /* CONFIG_IA32_EMULATION */ +#endif /* CONFIG_COMPAT */ #ifdef CONFIG_X86_64 -- cgit v0.10.2 From ab8b82ee6dad7c9c257f450d14719a0e3f327244 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:15 -0400 Subject: x86/compat: Don't build the 32-bit VDSO if not needed Build the 32-bit vdso only for native 32-bit or 32-bit compat is enabled. x32 should not force it to build. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-7-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index e970320..96c0617 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -8,7 +8,7 @@ KASAN_SANITIZE := n VDSO64-$(CONFIG_X86_64) := y VDSOX32-$(CONFIG_X86_X32_ABI) := y VDSO32-$(CONFIG_X86_32) := y -VDSO32-$(CONFIG_COMPAT) := y +VDSO32-$(CONFIG_IA32_EMULATION) := y # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o @@ -20,7 +20,7 @@ obj-y += vma.o vdso_img-$(VDSO64-y) += 64 vdso_img-$(VDSOX32-y) += x32 vdso_img-$(VDSO32-y) += 32-int80 -vdso_img-$(CONFIG_COMPAT) += 32-syscall +vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall vdso_img-$(VDSO32-y) += 32-sysenter obj-$(VDSO32-y) += vdso32-setup.o @@ -126,7 +126,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE # Build multiple 32-bit vDSO images to choose from at boot time. # vdso32.so-$(VDSO32-y) += int80 -vdso32.so-$(CONFIG_COMPAT) += syscall +vdso32.so-$(CONFIG_IA32_EMULATION) += syscall vdso32.so-$(VDSO32-y) += sysenter vdso32-images = $(vdso32.so-y:%=vdso32-%.so) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1c9f750..4345431 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -177,7 +177,7 @@ up_fail: return ret; } -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { int ret; @@ -219,8 +219,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm, return map_vdso(&vdso_image_x32, true); } #endif - +#ifdef CONFIG_IA32_EMULATION return load_vdso32(); +#else + return 0; +#endif } #endif #else diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 2bf67c0..141c561 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t; #ifdef CONFIG_X86_64 extern unsigned int vdso64_enabled; #endif -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) extern unsigned int vdso32_enabled; #endif -- cgit v0.10.2 From c338867d0e4224771c68d0a7727289b86c23eccd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:16 -0400 Subject: x86/compat: Check for both 32-bit compat and x32 in get_gate_vma() Change this to CONFIG_COMPAT so both 32-bit compat and x32 will do the check. Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-8-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 2dcc6ff..26a46f4 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -290,7 +290,7 @@ static struct vm_area_struct gate_vma = { struct vm_area_struct *get_gate_vma(struct mm_struct *mm) { -#ifdef CONFIG_IA32_EMULATION +#ifdef CONFIG_COMPAT if (!mm || mm->context.ia32_compat) return NULL; #endif -- cgit v0.10.2 From 10ed34935e7e828ce4ce566647a2d6b8240e4dee Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:17 -0400 Subject: x86/compat, x86/perf: Don't build perf_callchain_user32() on x32 perf_callchain_user32() is not needed for x32. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-9-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 3658de4..641413d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2196,7 +2196,7 @@ static unsigned long get_segment_base(unsigned int segment) return get_desc_base(desc + idx); } -#ifdef CONFIG_COMPAT +#ifdef CONFIG_IA32_EMULATION #include -- cgit v0.10.2 From 5e2aad2460bd38d0777052486893b32902efcdcd Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:18 -0400 Subject: x86/compat: Remove unneeded #include Including sys_ia32.h is not needed in signal.c. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-10-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 206996c..6c22aad 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -35,7 +35,6 @@ #ifdef CONFIG_X86_64 #include #include -#include #endif /* CONFIG_X86_64 */ #include -- cgit v0.10.2 From 3bead553ab657d482c3fd6559a1fd7f024414a63 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:19 -0400 Subject: x86/compat: Define ARCH_WANT_OLD_COMPAT_IPC only for 32-bit compat x32 does not need CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-11-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 55bced1..6e910ba 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2517,6 +2517,7 @@ config IA32_EMULATION select BINFMT_ELF select COMPAT_BINFMT_ELF select HAVE_UID16 + select ARCH_WANT_OLD_COMPAT_IPC ---help--- Include code to run legacy 32-bit programs under a 64-bit kernel. You should likely turn this on, unless you're @@ -2544,7 +2545,6 @@ config X86_X32 config COMPAT def_bool y depends on IA32_EMULATION || X86_X32 - select ARCH_WANT_OLD_COMPAT_IPC if COMPAT config COMPAT_FOR_U64_ALIGNMENT -- cgit v0.10.2 From 0c3619ea6756833e5c636c886cb55ca5b77f5d73 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:20 -0400 Subject: x86/compat: Clean up HAVE_UID16 config Merge the 32-bit compat config setting for HAVE_UID16 with the 32-bit native one. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-12-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 6e910ba..d823a33 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -132,7 +132,7 @@ config X86 select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_SYSCALL_TRACEPOINTS - select HAVE_UID16 if X86_32 + select HAVE_UID16 if X86_32 || IA32_EMULATION select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_USER_RETURN_NOTIFIER select IRQ_FORCED_THREADING @@ -2516,7 +2516,6 @@ config IA32_EMULATION depends on X86_64 select BINFMT_ELF select COMPAT_BINFMT_ELF - select HAVE_UID16 select ARCH_WANT_OLD_COMPAT_IPC ---help--- Include code to run legacy 32-bit programs under a -- cgit v0.10.2 From 9b54050bfe438d9e1108211d28cb0b995b1f347c Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Mon, 22 Jun 2015 07:55:21 -0400 Subject: x86/compat: Separate ia32 and x32 compat ABIs The x32 ABI is now independent of the ia32 compat ABI. Common code is now conditional on CONFIG_COMPAT, but unshared code like syscall entry, signal handling, and the VDSO are under separate config options. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1434974121-32575-13-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d823a33..aa94fd0 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2530,7 +2530,7 @@ config IA32_AOUT config X86_X32 bool "x32 ABI for 64-bit mode" - depends on X86_64 && IA32_EMULATION + depends on X86_64 ---help--- Include code to run binaries for the x32 native 32-bit ABI for 64-bit processors. An x32 process gets access to the -- cgit v0.10.2 From 5e5c684a2c78b98dcba3d6fce56773a375f63980 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:18 -0700 Subject: x86/entry, selftests/x86: Add a test for 32-bit fast syscall arg faults This test passes on 4.0 and fails on some newer kernels. Fortunately, the failure is likely not a big deal. This test will make sure that we don't break it further (e.g. OOPSing) as we clean up the entry code and that we eventually fix the regression. There's arguably no need to preserve the old ABI here -- anything that makes it into a fast (vDSO) syscall with a bad stack is about to crash no matter what we do. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/9cfcc51005168cb1b06b31991931214d770fc59a.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index caa60d5..e8df47e 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -5,7 +5,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs -TARGETS_C_32BIT_ONLY := entry_from_vm86 +TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c new file mode 100644 index 0000000..7db4fc9 --- /dev/null +++ b/tools/testing/selftests/x86/syscall_arg_fault.c @@ -0,0 +1,130 @@ +/* + * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args + * Copyright (c) 2015 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include + +/* Our sigaltstack scratch space. */ +static unsigned char altstack_data[SIGSTKSZ]; + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static volatile sig_atomic_t sig_traps; +static sigjmp_buf jmpbuf; + +static volatile sig_atomic_t n_errs; + +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) { + printf("[FAIL]\tAX had the wrong value: 0x%x\n", + ctx->uc_mcontext.gregs[REG_EAX]); + n_errs++; + } else { + printf("[OK]\tSeems okay\n"); + } + + siglongjmp(jmpbuf, 1); +} + +static void sigill(int sig, siginfo_t *info, void *ctx_void) +{ + printf("[SKIP]\tIllegal instruction\n"); + siglongjmp(jmpbuf, 1); +} + +int main() +{ + stack_t stack = { + .ss_sp = altstack_data, + .ss_size = SIGSTKSZ, + }; + if (sigaltstack(&stack, NULL) != 0) + err(1, "sigaltstack"); + + sethandler(SIGSEGV, sigsegv, SA_ONSTACK); + sethandler(SIGILL, sigill, SA_ONSTACK); + + /* + * Exercise another nasty special case. The 32-bit SYSCALL + * and SYSENTER instructions (even in compat mode) each + * clobber one register. A Linux system call has a syscall + * number and six arguments, and the user stack pointer + * needs to live in some register on return. That means + * that we need eight registers, but SYSCALL and SYSENTER + * only preserve seven registers. As a result, one argument + * ends up on the stack. The stack is user memory, which + * means that the kernel can fail to read it. + * + * The 32-bit fast system calls don't have a defined ABI: + * we're supposed to invoke them through the vDSO. So we'll + * fudge it: we set all regs to invalid pointer values and + * invoke the entry instruction. The return will fail no + * matter what, and we completely lose our program state, + * but we can fix it up with a signal handler. + */ + + printf("[RUN]\tSYSENTER with invalid state\n"); + if (sigsetjmp(jmpbuf, 1) == 0) { + asm volatile ( + "movl $-1, %%eax\n\t" + "movl $-1, %%ebx\n\t" + "movl $-1, %%ecx\n\t" + "movl $-1, %%edx\n\t" + "movl $-1, %%esi\n\t" + "movl $-1, %%edi\n\t" + "movl $-1, %%ebp\n\t" + "movl $-1, %%esp\n\t" + "sysenter" + : : : "memory", "flags"); + } + + printf("[RUN]\tSYSCALL with invalid state\n"); + if (sigsetjmp(jmpbuf, 1) == 0) { + asm volatile ( + "movl $-1, %%eax\n\t" + "movl $-1, %%ebx\n\t" + "movl $-1, %%ecx\n\t" + "movl $-1, %%edx\n\t" + "movl $-1, %%esi\n\t" + "movl $-1, %%edi\n\t" + "movl $-1, %%ebp\n\t" + "movl $-1, %%esp\n\t" + "syscall\n\t" + "pushl $0" /* make sure we segfault cleanly */ + : : : "memory", "flags"); + } + + return 0; +} -- cgit v0.10.2 From 5e99cb7c35ca0580da8e892f91c655d35ecf8798 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:19 -0700 Subject: x86/entry/64/compat: Fix bad fast syscall arg failure path If user code does SYSCALL32 or SYSENTER without a valid stack, then our attempt to determine the syscall args will result in a failed uaccess fault. Previously, we would try to recover by jumping to the syscall exit code, but we'd run the syscall exit work even though we never made it to the syscall entry work. Clean it up by treating the failure path as a non-syscall entry and exit pair. This fixes strace's output when running the syscall_arg_fault test. Without this fix, strace would get out of sync and would fail to associate syscall entries with syscall exits. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/903010762c07a3d67df914fea2da84b52b0f8f1d.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3bb2c43..141a5d4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -613,7 +613,7 @@ ret_from_intr: testb $3, CS(%rsp) jz retint_kernel /* Interrupt came from user space */ -retint_user: +GLOBAL(retint_user) GET_THREAD_INFO(%rcx) /* %rcx: thread info. Interrupts are off. */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index b868cfc..e5ebdd9 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -428,8 +428,39 @@ cstar_tracesys: END(entry_SYSCALL_compat) ia32_badarg: - ASM_CLAC - movq $-EFAULT, RAX(%rsp) + /* + * So far, we've entered kernel mode, set AC, turned on IRQs, and + * saved C regs except r8-r11. We haven't done any of the other + * standard entry work, though. We want to bail, but we shouldn't + * treat this as a syscall entry since we don't even know what the + * args are. Instead, treat this as a non-syscall entry, finish + * the entry work, and immediately exit after setting AX = -EFAULT. + * + * We're really just being polite here. Killing the task outright + * would be a reasonable action, too. Given that the only valid + * way to have gotten here is through the vDSO, and we already know + * that the stack pointer is bad, the task isn't going to survive + * for long no matter what we do. + */ + + ASM_CLAC /* undo STAC */ + movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */ + + /* Fill in the rest of pt_regs */ + xorl %eax, %eax + movq %rax, R11(%rsp) + movq %rax, R10(%rsp) + movq %rax, R9(%rsp) + movq %rax, R8(%rsp) + SAVE_EXTRA_REGS + + /* Turn IRQs back off. */ + DISABLE_INTERRUPTS(CLBR_NONE) + TRACE_IRQS_OFF + + /* And exit again. */ + jmp retint_user + ia32_ret_from_sys_call: xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) -- cgit v0.10.2 From ccaee5f851470dec6894a6835b6fadffc2bb7514 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 3 Jul 2015 12:44:20 -0700 Subject: um: Fix do_signal() prototype Once x86 exports its do_signal(), the prototypes will clash. Fix the clash and also improve the code a bit: remove the unnecessary kern_do_signal() indirection. This allows interrupt_end() to share the 'regs' parameter calculation. Also remove the unused return code to match x86. Minimally build and boot tested. Signed-off-by: Ingo Molnar Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Richard Weinberger Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/67c57eac09a589bac3c6c5ff22f9623ec55a184a.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h index 83a91f9..35ab97e 100644 --- a/arch/um/include/shared/kern_util.h +++ b/arch/um/include/shared/kern_util.h @@ -22,7 +22,8 @@ extern int kmalloc_ok; extern unsigned long alloc_stack(int order, int atomic); extern void free_stack(unsigned long stack, int order); -extern int do_signal(void); +struct pt_regs; +extern void do_signal(struct pt_regs *regs); extern void interrupt_end(void); extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs); diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 68b9119..a6d9226 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -90,12 +90,14 @@ void *__switch_to(struct task_struct *from, struct task_struct *to) void interrupt_end(void) { + struct pt_regs *regs = ¤t->thread.regs; + if (need_resched()) schedule(); if (test_thread_flag(TIF_SIGPENDING)) - do_signal(); + do_signal(regs); if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME)) - tracehook_notify_resume(¤t->thread.regs); + tracehook_notify_resume(regs); } void exit_thread(void) diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c index 4f60e4a..57acbd6 100644 --- a/arch/um/kernel/signal.c +++ b/arch/um/kernel/signal.c @@ -64,7 +64,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) signal_setup_done(err, ksig, singlestep); } -static int kern_do_signal(struct pt_regs *regs) +void do_signal(struct pt_regs *regs) { struct ksignal ksig; int handled_sig = 0; @@ -110,10 +110,4 @@ static int kern_do_signal(struct pt_regs *regs) */ if (!handled_sig) restore_saved_sigmask(); - return handled_sig; -} - -int do_signal(void) -{ - return kern_do_signal(¤t->thread.regs); } diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index f1b3eb1..2077248 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -291,7 +291,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr, /* We are under mmap_sem, release it such that current can terminate */ up_write(¤t->mm->mmap_sem); force_sig(SIGKILL, current); - do_signal(); + do_signal(¤t->thread.regs); } } diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c index 557232f..d8a9fce 100644 --- a/arch/um/kernel/trap.c +++ b/arch/um/kernel/trap.c @@ -173,7 +173,7 @@ static void bad_segv(struct faultinfo fi, unsigned long ip) void fatal_sigsegv(void) { force_sigsegv(SIGSEGV, current); - do_signal(); + do_signal(¤t->thread.regs); /* * This is to tell gcc that we're not returning - do_signal * can, in general, return, but in this case, it's not, since -- cgit v0.10.2 From f9281648ecd5081803bb2da84b9ccb0cf48436cd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:21 -0700 Subject: context_tracking: Add ct_state() and CT_WARN_ON() This will let us sprinkle sanity checks around the kernel without making too much of a mess. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/5da41fb2ceb29eac671f427c67040401ba2a1fa0.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index b96bd29..008fc67 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -49,13 +49,28 @@ static inline void exception_exit(enum ctx_state prev_ctx) } } + +/** + * ct_state() - return the current context tracking state if known + * + * Returns the current cpu's context tracking state if context tracking + * is enabled. If context tracking is disabled, returns + * CONTEXT_DISABLED. This should be used primarily for debugging. + */ +static inline enum ctx_state ct_state(void) +{ + return context_tracking_is_enabled() ? + this_cpu_read(context_tracking.state) : CONTEXT_DISABLED; +} #else static inline void user_enter(void) { } static inline void user_exit(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } +static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } #endif /* !CONFIG_CONTEXT_TRACKING */ +#define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond)) #ifdef CONFIG_CONTEXT_TRACKING_FORCE extern void context_tracking_init(void); diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 678ecdf..ee956c5 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -14,6 +14,7 @@ struct context_tracking { bool active; int recursion; enum ctx_state { + CONTEXT_DISABLED = -1, /* returned by ct_state() if unknown */ CONTEXT_KERNEL = 0, CONTEXT_USER, CONTEXT_GUEST, -- cgit v0.10.2 From e727c7d7a11e109849582e9165d54b254eb181d7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:22 -0700 Subject: notifiers, RCU: Assert that RCU is watching in notify_die() Low-level arch entries often call notify_die(), and it's easy for arch code to fail to exit an RCU quiescent state first. Assert that we're not quiescent in notify_die(). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Paul E. McKenney Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/1f5fe6c23d5b432a23267102f2d72b787d80fdd8.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/kernel/notifier.c b/kernel/notifier.c index ae9fc7c..980e433 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str, .signr = sig, }; + rcu_lockdep_assert(rcu_is_watching(), + "notify_die called but RCU thinks we're quiescent"); return atomic_notifier_call_chain(&die_chain, val, &args); } NOKPROBE_SYMBOL(notify_die); -- cgit v0.10.2 From 1f484aa6904697f390027c12fba130fa94b20831 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:23 -0700 Subject: x86/entry: Move C entry and exit code to arch/x86/entry/common.c The entry and exit C helpers were confusingly scattered between ptrace.c and signal.c, even though they aren't specific to ptrace or signal handling. Move them together in a new file. This change just moves code around. It doesn't change anything. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile index 7a14497..bd55ded 100644 --- a/arch/x86/entry/Makefile +++ b/arch/x86/entry/Makefile @@ -2,6 +2,7 @@ # Makefile for the x86 low level entry code # obj-y := entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o +obj-y += common.o obj-y += vdso/ obj-y += vsyscall/ diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c new file mode 100644 index 0000000..917d0c3 --- /dev/null +++ b/arch/x86/entry/common.c @@ -0,0 +1,253 @@ +/* + * common.c - C code for kernel entry and exit + * Copyright (c) 2015 Andrew Lutomirski + * GPL v2 + * + * Based on asm and ptrace code by many authors. The code here originated + * in ptrace.c and signal.c. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#define CREATE_TRACE_POINTS +#include + +static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) +{ +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + audit_syscall_entry(regs->orig_ax, regs->di, + regs->si, regs->dx, regs->r10); + } else +#endif + { + audit_syscall_entry(regs->orig_ax, regs->bx, + regs->cx, regs->dx, regs->si); + } +} + +/* + * We can return 0 to resume the syscall or anything else to go to phase + * 2. If we resume the syscall, we need to put something appropriate in + * regs->orig_ax. + * + * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax + * are fully functional. + * + * For phase 2's benefit, our return value is: + * 0: resume the syscall + * 1: go to phase 2; no seccomp phase 2 needed + * anything else: go to phase 2; pass return value to seccomp + */ +unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) +{ + unsigned long ret = 0; + u32 work; + + BUG_ON(regs != task_pt_regs(current)); + + work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + /* + * If TIF_NOHZ is set, we are required to call user_exit() before + * doing anything that could touch RCU. + */ + if (work & _TIF_NOHZ) { + user_exit(); + work &= ~_TIF_NOHZ; + } + +#ifdef CONFIG_SECCOMP + /* + * Do seccomp first -- it should minimize exposure of other + * code, and keeping seccomp fast is probably more valuable + * than the rest of this. + */ + if (work & _TIF_SECCOMP) { + struct seccomp_data sd; + + sd.arch = arch; + sd.nr = regs->orig_ax; + sd.instruction_pointer = regs->ip; +#ifdef CONFIG_X86_64 + if (arch == AUDIT_ARCH_X86_64) { + sd.args[0] = regs->di; + sd.args[1] = regs->si; + sd.args[2] = regs->dx; + sd.args[3] = regs->r10; + sd.args[4] = regs->r8; + sd.args[5] = regs->r9; + } else +#endif + { + sd.args[0] = regs->bx; + sd.args[1] = regs->cx; + sd.args[2] = regs->dx; + sd.args[3] = regs->si; + sd.args[4] = regs->di; + sd.args[5] = regs->bp; + } + + BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); + BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); + + ret = seccomp_phase1(&sd); + if (ret == SECCOMP_PHASE1_SKIP) { + regs->orig_ax = -1; + ret = 0; + } else if (ret != SECCOMP_PHASE1_OK) { + return ret; /* Go directly to phase 2 */ + } + + work &= ~_TIF_SECCOMP; + } +#endif + + /* Do our best to finish without phase 2. */ + if (work == 0) + return ret; /* seccomp and/or nohz only (ret == 0 here) */ + +#ifdef CONFIG_AUDITSYSCALL + if (work == _TIF_SYSCALL_AUDIT) { + /* + * If there is no more work to be done except auditing, + * then audit in phase 1. Phase 2 always audits, so, if + * we audit here, then we can't go on to phase 2. + */ + do_audit_syscall_entry(regs, arch); + return 0; + } +#endif + + return 1; /* Something is enabled that we can't handle in phase 1 */ +} + +/* Returns the syscall nr to run (which should match regs->orig_ax). */ +long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, + unsigned long phase1_result) +{ + long ret = 0; + u32 work = ACCESS_ONCE(current_thread_info()->flags) & + _TIF_WORK_SYSCALL_ENTRY; + + BUG_ON(regs != task_pt_regs(current)); + + /* + * If we stepped into a sysenter/syscall insn, it trapped in + * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. + * If user-mode had set TF itself, then it's still clear from + * do_debug() and we need to set it again to restore the user + * state. If we entered on the slow path, TF was already set. + */ + if (work & _TIF_SINGLESTEP) + regs->flags |= X86_EFLAGS_TF; + +#ifdef CONFIG_SECCOMP + /* + * Call seccomp_phase2 before running the other hooks so that + * they can see any changes made by a seccomp tracer. + */ + if (phase1_result > 1 && seccomp_phase2(phase1_result)) { + /* seccomp failures shouldn't expose any additional code. */ + return -1; + } +#endif + + if (unlikely(work & _TIF_SYSCALL_EMU)) + ret = -1L; + + if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && + tracehook_report_syscall_entry(regs)) + ret = -1L; + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_enter(regs, regs->orig_ax); + + do_audit_syscall_entry(regs, arch); + + return ret ?: regs->orig_ax; +} + +long syscall_trace_enter(struct pt_regs *regs) +{ + u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; + unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); + + if (phase1_result == 0) + return regs->orig_ax; + else + return syscall_trace_enter_phase2(regs, arch, phase1_result); +} + +void syscall_trace_leave(struct pt_regs *regs) +{ + bool step; + + /* + * We may come here right after calling schedule_user() + * or do_notify_resume(), in which case we can be in RCU + * user mode. + */ + user_exit(); + + audit_syscall_exit(regs); + + if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && + !test_thread_flag(TIF_SYSCALL_EMU); + if (step || test_thread_flag(TIF_SYSCALL_TRACE)) + tracehook_report_syscall_exit(regs, step); + + user_enter(); +} + +/* + * notification of userspace execution resumption + * - triggered by the TIF_WORK_MASK flags + */ +__visible void +do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) +{ + user_exit(); + + if (thread_info_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + /* deal with pending signal delivery */ + if (thread_info_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (thread_info_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + user_enter(); +} diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 31eab86..b42408b 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -30,6 +30,7 @@ typedef sigset_t compat_sigset_t; #endif /* __ASSEMBLY__ */ #include #ifndef __ASSEMBLY__ +extern void do_signal(struct pt_regs *regs); extern void do_notify_resume(struct pt_regs *, void *, __u32); #define __ARCH_HAS_SA_RESTORER diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7155957..558f50e 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -37,12 +37,10 @@ #include #include #include +#include #include "tls.h" -#define CREATE_TRACE_POINTS -#include - enum x86_regset { REGSET_GENERAL, REGSET_FP, @@ -1444,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, /* Send us the fake SIGTRAP */ force_sig_info(SIGTRAP, &info, tsk); } - -static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) -{ -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - audit_syscall_entry(regs->orig_ax, regs->di, - regs->si, regs->dx, regs->r10); - } else -#endif - { - audit_syscall_entry(regs->orig_ax, regs->bx, - regs->cx, regs->dx, regs->si); - } -} - -/* - * We can return 0 to resume the syscall or anything else to go to phase - * 2. If we resume the syscall, we need to put something appropriate in - * regs->orig_ax. - * - * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax - * are fully functional. - * - * For phase 2's benefit, our return value is: - * 0: resume the syscall - * 1: go to phase 2; no seccomp phase 2 needed - * anything else: go to phase 2; pass return value to seccomp - */ -unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) -{ - unsigned long ret = 0; - u32 work; - - BUG_ON(regs != task_pt_regs(current)); - - work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; - - /* - * If TIF_NOHZ is set, we are required to call user_exit() before - * doing anything that could touch RCU. - */ - if (work & _TIF_NOHZ) { - user_exit(); - work &= ~_TIF_NOHZ; - } - -#ifdef CONFIG_SECCOMP - /* - * Do seccomp first -- it should minimize exposure of other - * code, and keeping seccomp fast is probably more valuable - * than the rest of this. - */ - if (work & _TIF_SECCOMP) { - struct seccomp_data sd; - - sd.arch = arch; - sd.nr = regs->orig_ax; - sd.instruction_pointer = regs->ip; -#ifdef CONFIG_X86_64 - if (arch == AUDIT_ARCH_X86_64) { - sd.args[0] = regs->di; - sd.args[1] = regs->si; - sd.args[2] = regs->dx; - sd.args[3] = regs->r10; - sd.args[4] = regs->r8; - sd.args[5] = regs->r9; - } else -#endif - { - sd.args[0] = regs->bx; - sd.args[1] = regs->cx; - sd.args[2] = regs->dx; - sd.args[3] = regs->si; - sd.args[4] = regs->di; - sd.args[5] = regs->bp; - } - - BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0); - BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1); - - ret = seccomp_phase1(&sd); - if (ret == SECCOMP_PHASE1_SKIP) { - regs->orig_ax = -1; - ret = 0; - } else if (ret != SECCOMP_PHASE1_OK) { - return ret; /* Go directly to phase 2 */ - } - - work &= ~_TIF_SECCOMP; - } -#endif - - /* Do our best to finish without phase 2. */ - if (work == 0) - return ret; /* seccomp and/or nohz only (ret == 0 here) */ - -#ifdef CONFIG_AUDITSYSCALL - if (work == _TIF_SYSCALL_AUDIT) { - /* - * If there is no more work to be done except auditing, - * then audit in phase 1. Phase 2 always audits, so, if - * we audit here, then we can't go on to phase 2. - */ - do_audit_syscall_entry(regs, arch); - return 0; - } -#endif - - return 1; /* Something is enabled that we can't handle in phase 1 */ -} - -/* Returns the syscall nr to run (which should match regs->orig_ax). */ -long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, - unsigned long phase1_result) -{ - long ret = 0; - u32 work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; - - BUG_ON(regs != task_pt_regs(current)); - - /* - * If we stepped into a sysenter/syscall insn, it trapped in - * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP. - * If user-mode had set TF itself, then it's still clear from - * do_debug() and we need to set it again to restore the user - * state. If we entered on the slow path, TF was already set. - */ - if (work & _TIF_SINGLESTEP) - regs->flags |= X86_EFLAGS_TF; - -#ifdef CONFIG_SECCOMP - /* - * Call seccomp_phase2 before running the other hooks so that - * they can see any changes made by a seccomp tracer. - */ - if (phase1_result > 1 && seccomp_phase2(phase1_result)) { - /* seccomp failures shouldn't expose any additional code. */ - return -1; - } -#endif - - if (unlikely(work & _TIF_SYSCALL_EMU)) - ret = -1L; - - if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) && - tracehook_report_syscall_entry(regs)) - ret = -1L; - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_enter(regs, regs->orig_ax); - - do_audit_syscall_entry(regs, arch); - - return ret ?: regs->orig_ax; -} - -long syscall_trace_enter(struct pt_regs *regs) -{ - u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64; - unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch); - - if (phase1_result == 0) - return regs->orig_ax; - else - return syscall_trace_enter_phase2(regs, arch, phase1_result); -} - -void syscall_trace_leave(struct pt_regs *regs) -{ - bool step; - - /* - * We may come here right after calling schedule_user() - * or do_notify_resume(), in which case we can be in RCU - * user mode. - */ - user_exit(); - - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && - !test_thread_flag(TIF_SYSCALL_EMU); - if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); - - user_enter(); -} diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 6c22aad..7e88cc7 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -700,7 +700,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * want to handle. Thus you cannot kill init even with a SIGKILL even by * mistake. */ -static void do_signal(struct pt_regs *regs) +void do_signal(struct pt_regs *regs) { struct ksignal ksig; @@ -735,32 +735,6 @@ static void do_signal(struct pt_regs *regs) restore_saved_sigmask(); } -/* - * notification of userspace execution resumption - * - triggered by the TIF_WORK_MASK flags - */ -__visible void -do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) -{ - user_exit(); - - if (thread_info_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } - if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) - fire_user_return_notifiers(); - - user_enter(); -} - void signal_fault(struct pt_regs *regs, void __user *frame, char *where) { struct task_struct *me = current; -- cgit v0.10.2 From 02fdcd5eac9d653d1addbd69b0c58d73650e1c00 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:24 -0700 Subject: x86/traps, context_tracking: Assert that we're in CONTEXT_KERNEL in exception entries Other than the super-atomic exception entries, all exception entries are supposed to switch our context tracking state to CONTEXT_KERNEL. Assert that they do. These assertions appear trivial at this point, as exception_enter() is the function responsible for switching context, but I'm planning on reworking x86's exception context tracking, and these assertions will help make sure that all of this code keeps working. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/20fa1ee2d943233a184aaf96ff75394d3b34dfba.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index f579192..2a783c4 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -292,6 +292,8 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, enum ctx_state prev_state = exception_enter(); siginfo_t info; + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { conditional_sti(regs); @@ -376,6 +378,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) siginfo_t *info; prev_state = exception_enter(); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) goto exit; @@ -457,6 +460,7 @@ do_general_protection(struct pt_regs *regs, long error_code) enum ctx_state prev_state; prev_state = exception_enter(); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); conditional_sti(regs); if (v8086_mode(regs)) { @@ -514,6 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; prev_state = ist_enter(regs); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -750,6 +755,7 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) enum ctx_state prev_state; prev_state = exception_enter(); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); math_error(regs, error_code, X86_TRAP_MF); exception_exit(prev_state); } @@ -760,6 +766,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code) enum ctx_state prev_state; prev_state = exception_enter(); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); math_error(regs, error_code, X86_TRAP_XF); exception_exit(prev_state); } @@ -776,6 +783,7 @@ do_device_not_available(struct pt_regs *regs, long error_code) enum ctx_state prev_state; prev_state = exception_enter(); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION @@ -805,6 +813,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) enum ctx_state prev_state; prev_state = exception_enter(); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); local_irq_enable(); info.si_signo = SIGILL; -- cgit v0.10.2 From feed36cde0a10adb957445a37e48f957f30b2273 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:25 -0700 Subject: x86/entry: Add enter_from_user_mode() and use it in syscalls Changing the x86 context tracking hooks is dangerous because there are no good checks that we track our context correctly. Add a helper to check that we're actually in CONTEXT_USER when we enter from user mode and wire it up for syscall entries. Subsequent patches will wire this up for all non-NMI entries as well. NMIs are their own special beast and cannot currently switch overall context tracking state. Instead, they have their own special RCU hooks. This is a tiny speedup if !CONFIG_CONTEXT_TRACKING (removes a branch) and a tiny slowdown if CONFIG_CONTEXT_TRACING (adds a layer of indirection). Eventually, we should fix up the core context tracking code to supply a function that does what we want (and can be much simpler than user_exit), which will enable us to get rid of the extra call. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/853b42420066ec3fb856779cdc223a6dcb5d355b.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 917d0c3..9a327ee 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -28,6 +28,15 @@ #define CREATE_TRACE_POINTS #include +#ifdef CONFIG_CONTEXT_TRACKING +/* Called on entry from user mode with IRQs off. */ +__visible void enter_from_user_mode(void) +{ + CT_WARN_ON(ct_state() != CONTEXT_USER); + user_exit(); +} +#endif + static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) { #ifdef CONFIG_X86_64 @@ -65,14 +74,16 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) work = ACCESS_ONCE(current_thread_info()->flags) & _TIF_WORK_SYSCALL_ENTRY; +#ifdef CONFIG_CONTEXT_TRACKING /* * If TIF_NOHZ is set, we are required to call user_exit() before * doing anything that could touch RCU. */ if (work & _TIF_NOHZ) { - user_exit(); + enter_from_user_mode(); work &= ~_TIF_NOHZ; } +#endif #ifdef CONFIG_SECCOMP /* -- cgit v0.10.2 From c5c46f59e4e7c1ab244b8d38f2b61d317df90bba Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:26 -0700 Subject: x86/entry: Add new, comprehensible entry and exit handlers written in C The current x86 entry and exit code, written in a mixture of assembly and C code, is incomprehensible due to being open-coded in a lot of places without coherent documentation. It appears to work primary by luck and duct tape: i.e. obvious runtime failures were fixed on-demand, without re-thinking the design. Due to those reasons our confidence level in that code is low, and it is very difficult to incrementally improve. Add new code written in C, in preparation for simply deleting the old entry code. prepare_exit_to_usermode() is a new function that will handle all slow path exits to user mode. It is called with IRQs disabled and it leaves us in a state in which it is safe to immediately return to user mode. IRQs must not be re-enabled at any point after prepare_exit_to_usermode() returns and user mode is actually entered. (We can, of course, fail to enter user mode and treat that failure as a fresh entry to kernel mode.) All callers of do_notify_resume() will be migrated to call prepare_exit_to_usermode() instead; prepare_exit_to_usermode() needs to do everything that do_notify_resume() does today, but it also takes care of scheduling and context tracking. Unlike do_notify_resume(), it does not need to be called in a loop. syscall_return_slowpath() is exactly what it sounds like: it will be called on any syscall exit slow path. It will replace syscall_trace_leave() and it calls prepare_exit_to_usermode() on the way out. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/c57c8b87661a4152801d7d3786eac2d1a2f209dd.1435952415.git.luto@kernel.org [ Improved the changelog a bit. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 9a327ee..febc530 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -207,6 +207,7 @@ long syscall_trace_enter(struct pt_regs *regs) return syscall_trace_enter_phase2(regs, arch, phase1_result); } +/* Deprecated. */ void syscall_trace_leave(struct pt_regs *regs) { bool step; @@ -237,8 +238,117 @@ void syscall_trace_leave(struct pt_regs *regs) user_enter(); } +static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) +{ + unsigned long top_of_stack = + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; + return (struct thread_info *)(top_of_stack - THREAD_SIZE); +} + +/* Called with IRQs disabled. */ +__visible void prepare_exit_to_usermode(struct pt_regs *regs) +{ + if (WARN_ON(!irqs_disabled())) + local_irq_disable(); + + /* + * In order to return to user mode, we need to have IRQs off with + * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, + * _TIF_UPROBE, or _TIF_NEED_RESCHED set. Several of these flags + * can be set at any time on preemptable kernels if we have IRQs on, + * so we need to loop. Disabling preemption wouldn't help: doing the + * work to clear some of the flags can sleep. + */ + while (true) { + u32 cached_flags = + READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | + _TIF_UPROBE | _TIF_NEED_RESCHED))) + break; + + /* We have work to do. */ + local_irq_enable(); + + if (cached_flags & _TIF_NEED_RESCHED) + schedule(); + + if (cached_flags & _TIF_UPROBE) + uprobe_notify_resume(regs); + + /* deal with pending signal delivery */ + if (cached_flags & _TIF_SIGPENDING) + do_signal(regs); + + if (cached_flags & _TIF_NOTIFY_RESUME) { + clear_thread_flag(TIF_NOTIFY_RESUME); + tracehook_notify_resume(regs); + } + + if (cached_flags & _TIF_USER_RETURN_NOTIFY) + fire_user_return_notifiers(); + + /* Disable IRQs and retry */ + local_irq_disable(); + } + + user_enter(); +} + +/* + * Called with IRQs on and fully valid regs. Returns with IRQs off in a + * state such that we can immediately switch to user mode. + */ +__visible void syscall_return_slowpath(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + u32 cached_flags = READ_ONCE(ti->flags); + bool step; + + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + + if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled", + regs->orig_ax)) + local_irq_enable(); + + /* + * First do one-time work. If these work items are enabled, we + * want to run them exactly once per syscall exit with IRQs on. + */ + if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) { + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); + } + +#ifdef CONFIG_COMPAT + /* + * Compat syscalls set TS_COMPAT. Make sure we clear it before + * returning to user mode. + */ + ti->status &= ~TS_COMPAT; +#endif + + local_irq_disable(); + prepare_exit_to_usermode(regs); +} + /* - * notification of userspace execution resumption + * Deprecated notification of userspace execution resumption * - triggered by the TIF_WORK_MASK flags */ __visible void -- cgit v0.10.2 From cb6f64ed5a04036eef07e70b57dd5dd78f2fbcef Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:27 -0700 Subject: x86/entry/64: Really create an error-entry-from-usermode code path In 539f51136500 ("x86/asm/entry/64: Disentangle error_entry/exit gsbase/ebx/usermode code"), I arranged the code slightly wrong -- IRET faults would skip the code path that was intended to execute on all error entries from user mode. Fix it up. While we're at it, make all the labels in error_entry local. This does not fix a bug, but we'll need it, and it slightly shrinks the code. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/91e17891e49fa3d61357eadc451529ad48143ee1.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 141a5d4..ccfcba9 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1143,12 +1143,17 @@ ENTRY(error_entry) SAVE_EXTRA_REGS 8 xorl %ebx, %ebx testb $3, CS+8(%rsp) - jz error_kernelspace + jz .Lerror_kernelspace - /* We entered from user mode */ +.Lerror_entry_from_usermode_swapgs: + /* + * We entered from user mode or we're pretending to have entered + * from user mode due to an IRET fault. + */ SWAPGS -error_entry_done: +.Lerror_entry_from_usermode_after_swapgs: +.Lerror_entry_done: TRACE_IRQS_OFF ret @@ -1158,31 +1163,30 @@ error_entry_done: * truncated RIP for IRET exceptions returning to compat mode. Check * for these here too. */ -error_kernelspace: +.Lerror_kernelspace: incl %ebx leaq native_irq_return_iret(%rip), %rcx cmpq %rcx, RIP+8(%rsp) - je error_bad_iret + je .Lerror_bad_iret movl %ecx, %eax /* zero extend */ cmpq %rax, RIP+8(%rsp) - je bstep_iret + je .Lbstep_iret cmpq $gs_change, RIP+8(%rsp) - jne error_entry_done + jne .Lerror_entry_done /* * hack: gs_change can fail with user gsbase. If this happens, fix up * gsbase and proceed. We'll fix up the exception and land in * gs_change's error handler with kernel gsbase. */ - SWAPGS - jmp error_entry_done + jmp .Lerror_entry_from_usermode_swapgs -bstep_iret: +.Lbstep_iret: /* Fix truncated RIP */ movq %rcx, RIP+8(%rsp) /* fall through */ -error_bad_iret: +.Lerror_bad_iret: /* * We came from an IRET to user mode, so we have user gsbase. * Switch to kernel gsbase: @@ -1198,7 +1202,7 @@ error_bad_iret: call fixup_bad_iret mov %rax, %rsp decl %ebx - jmp error_entry_done + jmp .Lerror_entry_from_usermode_after_swapgs END(error_entry) -- cgit v0.10.2 From 29ea1b258b98a862e59d72556714b75051ae93fb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:28 -0700 Subject: x86/entry/64: Migrate 64-bit and compat syscalls to the new exit handlers and remove old assembly code These need to be migrated together, as the compat case used to jump into the middle of the 64-bit exit code. Remove the old assembly code. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/d4d1d70de08ac3640badf50048a9e8f18fe2497f.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index ccfcba9..4ca5b78 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -229,6 +229,11 @@ entry_SYSCALL_64_fastpath: */ USERGS_SYSRET64 +GLOBAL(int_ret_from_sys_call_irqs_off) + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + jmp int_ret_from_sys_call + /* Do syscall entry tracing */ tracesys: movq %rsp, %rdi @@ -272,69 +277,11 @@ tracesys_phase2: * Has correct iret frame. */ GLOBAL(int_ret_from_sys_call) - DISABLE_INTERRUPTS(CLBR_NONE) -int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ - TRACE_IRQS_OFF - movl $_TIF_ALLWORK_MASK, %edi - /* edi: mask to check */ -GLOBAL(int_with_check) - LOCKDEP_SYS_EXIT_IRQ - GET_THREAD_INFO(%rcx) - movl TI_flags(%rcx), %edx - andl %edi, %edx - jnz int_careful - andl $~TS_COMPAT, TI_status(%rcx) - jmp syscall_return - - /* - * Either reschedule or signal or syscall exit tracking needed. - * First do a reschedule test. - * edx: work, edi: workmask - */ -int_careful: - bt $TIF_NEED_RESCHED, %edx - jnc int_very_careful - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - - /* handle signals and tracing -- both require a full pt_regs */ -int_very_careful: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) SAVE_EXTRA_REGS - /* Check for syscall exit trace */ - testl $_TIF_WORK_SYSCALL_EXIT, %edx - jz int_signal - pushq %rdi - leaq 8(%rsp), %rdi /* &ptregs -> arg1 */ - call syscall_trace_leave - popq %rdi - andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi - jmp int_restore_rest - -int_signal: - testl $_TIF_DO_NOTIFY_MASK, %edx - jz 1f - movq %rsp, %rdi /* &ptregs -> arg1 */ - xorl %esi, %esi /* oldset -> arg2 */ - call do_notify_resume -1: movl $_TIF_WORK_MASK, %edi -int_restore_rest: + movq %rsp, %rdi + call syscall_return_slowpath /* returns with IRQs disabled */ RESTORE_EXTRA_REGS - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp int_with_check - -syscall_return: - /* The IRETQ could re-enable interrupts: */ - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_IRETQ + TRACE_IRQS_IRETQ /* we're about to change IF */ /* * Try to use SYSRET instead of IRET if we're returning to diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e5ebdd9..d9bbd31 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -210,10 +210,10 @@ sysexit_from_sys_call: .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz ia32_ret_from_sys_call movl %eax, %esi /* second arg, syscall return value */ cmpl $-MAX_ERRNO, %eax /* is it an error ? */ jbe 1f @@ -232,7 +232,7 @@ sysexit_from_sys_call: movq %rax, R10(%rsp) movq %rax, R9(%rsp) movq %rax, R8(%rsp) - jmp int_with_check + jmp int_ret_from_sys_call_irqs_off .endm sysenter_auditsys: -- cgit v0.10.2 From ff467594f2a4be01a0fa5e9ffc223fa930d232dd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:29 -0700 Subject: x86/asm/entry/64: Save all regs on interrupt entry To prepare for the big rewrite of the error and interrupt exit paths, we will need pt_regs completely filled in. It's already completely filled in when error_exit runs, so rearrange interrupt handling to match it. This will slow down interrupt handling very slightly (eight instructions), but the simplification it enables will be more than worth it. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/d8a766a7f558b30e6e01352854628a2d9943460c.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 519207f..3c71dd9 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) .endm - .macro SAVE_EXTRA_REGS_RBP offset=0 - movq %rbp, 4*8+\offset(%rsp) - .endm .macro RESTORE_EXTRA_REGS offset=0 movq 0*8+\offset(%rsp), %r15 diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 4ca5b78..65029f4 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -502,21 +502,13 @@ END(irq_entries_start) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld - /* - * Since nothing in interrupt handling code touches r12...r15 members - * of "struct pt_regs", and since interrupts can nest, we can save - * four stack slots and simultaneously provide - * an unwind-friendly stack layout by saving "truncated" pt_regs - * exactly up to rbp slot, without these members. - */ - ALLOC_PT_GPREGS_ON_STACK -RBP - SAVE_C_REGS -RBP - /* this goes to 0(%rsp) for unwinder, not for saving the value: */ - SAVE_EXTRA_REGS_RBP -RBP + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS - leaq -RBP(%rsp), %rdi /* arg1 for \func (pointer to pt_regs) */ + movq %rsp,%rdi /* arg1 for \func (pointer to pt_regs) */ - testb $3, CS-RBP(%rsp) + testb $3, CS(%rsp) jz 1f SWAPGS 1: @@ -553,9 +545,7 @@ ret_from_intr: decl PER_CPU_VAR(irq_count) /* Restore saved previous stack */ - popq %rsi - /* return code expects complete pt_regs - adjust rsp accordingly: */ - leaq -RBP(%rsi), %rsp + popq %rsp testb $3, CS(%rsp) jz retint_kernel @@ -580,7 +570,7 @@ retint_swapgs: /* return to user-space */ TRACE_IRQS_IRETQ SWAPGS - jmp restore_c_regs_and_iret + jmp restore_regs_and_iret /* Returning to kernel space */ retint_kernel: @@ -604,6 +594,8 @@ retint_kernel: * At this label, code paths which return to kernel and to user, * which come from interrupts/exception and from syscalls, merge. */ +restore_regs_and_iret: + RESTORE_EXTRA_REGS restore_c_regs_and_iret: RESTORE_C_REGS REMOVE_PT_GPREGS_FROM_STACK 8 @@ -674,12 +666,10 @@ retint_signal: jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_EXTRA_REGS movq $-1, ORIG_RAX(%rsp) xorl %esi, %esi /* oldset */ movq %rsp, %rdi /* &pt_regs */ call do_notify_resume - RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) @@ -1160,7 +1150,6 @@ END(error_entry) */ ENTRY(error_exit) movl %ebx, %eax - RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl %eax, %eax -- cgit v0.10.2 From a586f98e9767fb0dfdb989002866b4024f00ce08 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:30 -0700 Subject: x86/asm/entry/64: Simplify IRQ stack pt_regs handling There's no need for both RSI and RDI to point to the original stack. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/3a0481f809dd340c7d3f54ce3fd6d66ef2a578cd.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 65029f4..83eb63d 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -506,8 +506,6 @@ END(irq_entries_start) SAVE_C_REGS SAVE_EXTRA_REGS - movq %rsp,%rdi /* arg1 for \func (pointer to pt_regs) */ - testb $3, CS(%rsp) jz 1f SWAPGS @@ -519,14 +517,14 @@ END(irq_entries_start) * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ - movq %rsp, %rsi + movq %rsp, %rdi incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rsi + pushq %rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF - call \func + call \func /* rdi points to pt_regs */ .endm /* -- cgit v0.10.2 From 02bc7768fe447ae305e924b931fa629073a4a1b9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:31 -0700 Subject: x86/asm/entry/64: Migrate error and IRQ exit work to C and remove old assembly code Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/60e90901eee611e59e958bfdbbe39969b4f88fe5.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 83eb63d..168ee26 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -508,7 +508,16 @@ END(irq_entries_start) testb $3, CS(%rsp) jz 1f + + /* + * IRQ from user mode. Switch to kernel gsbase and inform context + * tracking that we're in kernel mode. + */ SWAPGS +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + 1: /* * Save previous stack pointer, optionally switch to interrupt stack. @@ -547,26 +556,13 @@ ret_from_intr: testb $3, CS(%rsp) jz retint_kernel - /* Interrupt came from user space */ -GLOBAL(retint_user) - GET_THREAD_INFO(%rcx) - /* %rcx: thread info. Interrupts are off. */ -retint_with_reschedule: - movl $_TIF_WORK_MASK, %edi -retint_check: + /* Interrupt came from user space */ LOCKDEP_SYS_EXIT_IRQ - movl TI_flags(%rcx), %edx - andl %edi, %edx - jnz retint_careful - -retint_swapgs: /* return to user-space */ - /* - * The iretq could re-enable interrupts: - */ - DISABLE_INTERRUPTS(CLBR_ANY) +GLOBAL(retint_user) + mov %rsp,%rdi + call prepare_exit_to_usermode TRACE_IRQS_IRETQ - SWAPGS jmp restore_regs_and_iret @@ -644,35 +640,6 @@ native_irq_return_ldt: popq %rax jmp native_irq_return_iret #endif - - /* edi: workmask, edx: work */ -retint_careful: - bt $TIF_NEED_RESCHED, %edx - jnc retint_signal - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - pushq %rdi - SCHEDULE_USER - popq %rdi - GET_THREAD_INFO(%rcx) - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - jmp retint_check - -retint_signal: - testl $_TIF_DO_NOTIFY_MASK, %edx - jz retint_swapgs - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movq $-1, ORIG_RAX(%rsp) - xorl %esi, %esi /* oldset */ - movq %rsp, %rdi /* &pt_regs */ - call do_notify_resume - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - GET_THREAD_INFO(%rcx) - jmp retint_with_reschedule - END(common_interrupt) /* @@ -1088,7 +1055,12 @@ ENTRY(error_entry) SWAPGS .Lerror_entry_from_usermode_after_swapgs: +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + .Lerror_entry_done: + TRACE_IRQS_OFF ret diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index d9bbd31..25aca51 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -458,6 +458,11 @@ ia32_badarg: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF + /* Now finish entering normal kernel mode. */ +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + /* And exit again. */ jmp retint_user -- cgit v0.10.2 From 8c84014f3bbb112d07e73f30a10ac8a3a72f8649 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:32 -0700 Subject: x86/entry: Remove exception_enter() from most trap handlers On 64-bit kernels, we don't need it any more: we handle context tracking directly on entry from user mode and exit to user mode. On 32-bit kernels, we don't support context tracking at all, so these callbacks had no effect. Note: this doesn't change do_page_fault(). Before we do that, we need to make sure that there is no code that can page fault from kernel mode with CONTEXT_USER. The 32-bit fast system call stack argument code is the only offender I'm aware of right now. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/ae22f4dfebd799c916574089964592be218151f9.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index c5380be..c3496619 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void); asmlinkage void smp_deferred_error_interrupt(void); #endif -extern enum ctx_state ist_enter(struct pt_regs *regs); -extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state); +extern void ist_enter(struct pt_regs *regs); +extern void ist_exit(struct pt_regs *regs); extern void ist_begin_non_atomic(struct pt_regs *regs); extern void ist_end_non_atomic(void); diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 96ccecc..99940d1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -1029,7 +1029,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) { struct mca_config *cfg = &mca_cfg; struct mce m, *final; - enum ctx_state prev_state; int i; int worst = 0; int severity; @@ -1055,7 +1054,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) int flags = MF_ACTION_REQUIRED; int lmce = 0; - prev_state = ist_enter(regs); + ist_enter(regs); this_cpu_inc(mce_exception_count); @@ -1227,7 +1226,7 @@ out: local_irq_disable(); ist_end_non_atomic(); done: - ist_exit(regs, prev_state); + ist_exit(regs); } EXPORT_SYMBOL_GPL(do_machine_check); diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 737b0ad..12402e1 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly; /* Machine check handler for Pentium class Intel CPUs: */ static void pentium_machine_check(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; u32 loaddr, hi, lotype; - prev_state = ist_enter(regs); + ist_enter(regs); rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi); rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi); @@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code) add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs, prev_state); + ist_exit(regs); } /* Set up machine check reporting for processors with Intel style MCE: */ diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 44f1382..01dd870 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -15,12 +15,12 @@ /* Machine check handler for WinChip C6: */ static void winchip_machine_check(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state = ist_enter(regs); + ist_enter(regs); printk(KERN_EMERG "CPU0: Machine Check Exception.\n"); add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - ist_exit(regs, prev_state); + ist_exit(regs); } /* Set up machine check reporting on the Winchip C6 series */ diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 2a783c4..8e65d8a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -108,13 +108,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_count_dec(); } -enum ctx_state ist_enter(struct pt_regs *regs) +void ist_enter(struct pt_regs *regs) { - enum ctx_state prev_state; - if (user_mode(regs)) { - /* Other than that, we're just an exception. */ - prev_state = exception_enter(); + CT_WARN_ON(ct_state() != CONTEXT_KERNEL); } else { /* * We might have interrupted pretty much anything. In @@ -123,32 +120,25 @@ enum ctx_state ist_enter(struct pt_regs *regs) * but we need to notify RCU. */ rcu_nmi_enter(); - prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ } /* - * We are atomic because we're on the IST stack (or we're on x86_32, - * in which case we still shouldn't schedule). - * - * This must be after exception_enter(), because exception_enter() - * won't do anything if in_interrupt() returns true. + * We are atomic because we're on the IST stack; or we're on + * x86_32, in which case we still shouldn't schedule; or we're + * on x86_64 and entered from user mode, in which case we're + * still atomic unless ist_begin_non_atomic is called. */ preempt_count_add(HARDIRQ_OFFSET); /* This code is a bit fragile. Test it. */ rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work"); - - return prev_state; } -void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) +void ist_exit(struct pt_regs *regs) { - /* Must be before exception_exit. */ preempt_count_sub(HARDIRQ_OFFSET); - if (user_mode(regs)) - return exception_exit(prev_state); - else + if (!user_mode(regs)) rcu_nmi_exit(); } @@ -162,7 +152,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) * a double fault, it can be safe to schedule. ist_begin_non_atomic() * begins a non-atomic section within an ist_enter()/ist_exit() region. * Callers are responsible for enabling interrupts themselves inside - * the non-atomic section, and callers must call is_end_non_atomic() + * the non-atomic section, and callers must call ist_end_non_atomic() * before ist_exit(). */ void ist_begin_non_atomic(struct pt_regs *regs) @@ -289,7 +279,6 @@ NOKPROBE_SYMBOL(do_trap); static void do_error_trap(struct pt_regs *regs, long error_code, char *str, unsigned long trapnr, int signr) { - enum ctx_state prev_state = exception_enter(); siginfo_t info; CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -300,8 +289,6 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, do_trap(trapnr, signr, str, regs, error_code, fill_trap_info(regs, signr, trapnr, &info)); } - - exception_exit(prev_state); } #define DO_ERROR(trapnr, signr, str, name) \ @@ -353,7 +340,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) } #endif - ist_enter(regs); /* Discard prev_state because we won't return. */ + ist_enter(regs); notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV); tsk->thread.error_code = error_code; @@ -373,15 +360,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code) dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; const struct bndcsr *bndcsr; siginfo_t *info; - prev_state = exception_enter(); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) - goto exit; + return; conditional_sti(regs); if (!user_mode(regs)) @@ -438,9 +423,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) die("bounds", regs, error_code); } -exit: - exception_exit(prev_state); return; + exit_trap: /* * This path out is for all the cases where we could not @@ -450,36 +434,33 @@ exit_trap: * time.. */ do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL); - exception_exit(prev_state); } dotraplinkage void do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; - enum ctx_state prev_state; - prev_state = exception_enter(); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); conditional_sti(regs); if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); - goto exit; + return; } tsk = current; if (!user_mode(regs)) { if (fixup_exception(regs)) - goto exit; + return; tsk->thread.error_code = error_code; tsk->thread.trap_nr = X86_TRAP_GP; if (notify_die(DIE_GPF, "general protection fault", regs, error_code, X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP) die("general protection fault", regs, error_code); - goto exit; + return; } tsk->thread.error_code = error_code; @@ -495,16 +476,12 @@ do_general_protection(struct pt_regs *regs, long error_code) } force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); -exit: - exception_exit(prev_state); } NOKPROBE_SYMBOL(do_general_protection); /* May run on IST stack. */ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - #ifdef CONFIG_DYNAMIC_FTRACE /* * ftrace must be first, everything else may cause a recursive crash. @@ -517,7 +494,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) if (poke_int3_handler(regs)) return; - prev_state = ist_enter(regs); + ist_enter(regs); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, @@ -544,7 +521,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) preempt_conditional_cli(regs); debug_stack_usage_dec(); exit: - ist_exit(regs, prev_state); + ist_exit(regs); } NOKPROBE_SYMBOL(do_int3); @@ -620,12 +597,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret); dotraplinkage void do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; - enum ctx_state prev_state; int user_icebp = 0; unsigned long dr6; int si_code; - prev_state = ist_enter(regs); + ist_enter(regs); get_debugreg(dr6, 6); @@ -700,7 +676,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) debug_stack_usage_dec(); exit: - ist_exit(regs, prev_state); + ist_exit(regs); } NOKPROBE_SYMBOL(do_debug); @@ -752,23 +728,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - - prev_state = exception_enter(); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); math_error(regs, error_code, X86_TRAP_MF); - exception_exit(prev_state); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - - prev_state = exception_enter(); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); math_error(regs, error_code, X86_TRAP_XF); - exception_exit(prev_state); } dotraplinkage void @@ -780,9 +748,6 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { - enum ctx_state prev_state; - - prev_state = exception_enter(); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); BUG_ON(use_eager_fpu()); @@ -794,7 +759,6 @@ do_device_not_available(struct pt_regs *regs, long error_code) info.regs = regs; math_emulate(&info); - exception_exit(prev_state); return; } #endif @@ -802,7 +766,6 @@ do_device_not_available(struct pt_regs *regs, long error_code) #ifdef CONFIG_X86_32 conditional_sti(regs); #endif - exception_exit(prev_state); } NOKPROBE_SYMBOL(do_device_not_available); @@ -810,9 +773,7 @@ NOKPROBE_SYMBOL(do_device_not_available); dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; - enum ctx_state prev_state; - prev_state = exception_enter(); CT_WARN_ON(ct_state() != CONTEXT_KERNEL); local_irq_enable(); @@ -825,7 +786,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code, &info); } - exception_exit(prev_state); } #endif -- cgit v0.10.2 From 06a7b36c7bd932e60997bedbae32b3d8e6722281 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:33 -0700 Subject: x86/entry: Remove SCHEDULE_USER and asm/context-tracking.h SCHEDULE_USER is no longer used, and asm/context-tracking.h contained nothing else. Remove the header entirely. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/854e9b45f69af20e26c47099eb236321563ebcee.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 168ee26..041a37a 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h deleted file mode 100644 index 1fe4970..0000000 --- a/arch/x86/include/asm/context_tracking.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef _ASM_X86_CONTEXT_TRACKING_H -#define _ASM_X86_CONTEXT_TRACKING_H - -#ifdef CONFIG_CONTEXT_TRACKING -# define SCHEDULE_USER call schedule_user -#else -# define SCHEDULE_USER call schedule -#endif - -#endif -- cgit v0.10.2 From 0333a209cbf600e980fc55c24878a56f25f48b65 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 3 Jul 2015 12:44:34 -0700 Subject: x86/irq, context_tracking: Document how IRQ context tracking works and add an RCU assertion Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: paulmck@linux.vnet.ibm.com Link: http://lkml.kernel.org/r/e8bdc4ed0193fb2fd130f3d6b7b8023e2ec1ab62.1435952415.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 88b36648..6233de0 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -216,8 +216,23 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; + /* + * NB: Unlike exception entries, IRQ entries do not reliably + * handle context tracking in the low-level entry code. This is + * because syscall entries execute briefly with IRQs on before + * updating context tracking state, so we can take an IRQ from + * kernel mode with CONTEXT_USER. The low-level entry code only + * updates the context if we came from user mode, so we won't + * switch to CONTEXT_KERNEL. We'll fix that once the syscall + * code is cleaned up enough that we can cleanly defer enabling + * IRQs. + */ + entering_irq(); + /* entering_irq() tells RCU that we're not quiescent. Check it. */ + rcu_lockdep_assert(rcu_is_watching(), "IRQ failed to wake up RCU"); + irq = __this_cpu_read(vector_irq[vector]); if (!handle_irq(irq, regs)) { -- cgit v0.10.2 From 8f7f06b87acd2e017d6c536f59e10045dd8d0578 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 7 Jul 2015 10:55:28 -0700 Subject: x86/entry/64: Fix IRQ state confusion and related warning on compat syscalls with CONFIG_AUDITSYSCALL=n int_ret_from_sys_call now expects IRQs to be enabled. I got this right in the real sysexit_audit and sysretl_audit asm paths, but I missed it in the #defined-away versions when CONFIG_AUDITSYSCALL=n. This is a straightforward fix for CONFIG_AUDITSYSCALL=n Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 29ea1b258b98 ("x86/entry/64: Migrate 64-bit and compat syscalls to the new exit handlers and remove old assembly code") Link: http://lkml.kernel.org/r/25cf0a01e01c6008118dd8f8d9f043020416700c.1436291493.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 25aca51..d757153 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -22,8 +22,8 @@ #define __AUDIT_ARCH_LE 0x40000000 #ifndef CONFIG_AUDITSYSCALL -# define sysexit_audit ia32_ret_from_sys_call -# define sysretl_audit ia32_ret_from_sys_call +# define sysexit_audit ia32_ret_from_sys_call_irqs_off +# define sysretl_audit ia32_ret_from_sys_call_irqs_off #endif .section .entry.text, "ax" @@ -466,6 +466,10 @@ ia32_badarg: /* And exit again. */ jmp retint_user +ia32_ret_from_sys_call_irqs_off: + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) + ia32_ret_from_sys_call: xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) -- cgit v0.10.2 From d132803e6c611d50c19baedc8ae520203a2baca7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 15 Jul 2015 14:25:16 -0700 Subject: x86/entry: Fix _TIF_USER_RETURN_NOTIFY check in prepare_exit_to_usermode Linus noticed that the early return check was missing _TIF_USER_RETURN_NOTIFY. If the only work flag was _TIF_USER_RETURN_NOTIFY, we'd skip user return notifiers. Fix it. (This is the only missing bit.) This fixes double faults on a KVM host. It's the same issue as last time, except that this time it's very easy to trigger. Apparently no one uses -next as a KVM host. ( I'm still not quite sure what it is that KVM does that blows up so badly if we miss a user return notifier. My best guess is that KVM lets KERNEL_GS_BASE (i.e. the user's gs base) be negative and fixes it up in a user return notifier. If we actually end up in user mode with a negative gs base, we blow up pretty badly. ) Reported-by: Linus Torvalds Signed-off-by: Andy Lutomirski Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: c5c46f59e4e7 ("x86/entry: Add new, comprehensible entry and exit handlers written in C") Link: http://lkml.kernel.org/r/3f801104d24ee7a6bb1446408d9950777aa63277.1436995419.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index febc530..a3e9c7f 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -264,7 +264,8 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) READ_ONCE(pt_regs_to_thread_info(regs)->flags); if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | - _TIF_UPROBE | _TIF_NEED_RESCHED))) + _TIF_UPROBE | _TIF_NEED_RESCHED | + _TIF_USER_RETURN_NOTIFY))) break; /* We have work to do. */ -- cgit v0.10.2 From bf9f2ee28d475ada0005c59382852cb70f1419ac Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 20 Jul 2015 11:52:23 -0700 Subject: x86/nmi: Remove the 'b2b' parameter from nmi_handle() It has never had any effect. Remove it for comprehensibility. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c91fa38507760d9e54a4b8737fa6409bde896b33.1437418322.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index c3e985d..f76d650 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w) a->handler, whole_msecs, decimal_msecs); } -static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b) +static int nmi_handle(unsigned int type, struct pt_regs *regs) { struct nmi_desc *desc = nmi_to_desc(type); struct nmiaction *a; @@ -213,7 +213,7 @@ static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { /* check to see if anyone registered against these types of errors */ - if (nmi_handle(NMI_SERR, regs, false)) + if (nmi_handle(NMI_SERR, regs)) return; pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n", @@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs) unsigned long i; /* check to see if anyone registered against these types of errors */ - if (nmi_handle(NMI_IO_CHECK, regs, false)) + if (nmi_handle(NMI_IO_CHECK, regs)) return; pr_emerg( @@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) * as only the first one is ever run (unless it can actually determine * if it caused the NMI) */ - handled = nmi_handle(NMI_UNKNOWN, regs, false); + handled = nmi_handle(NMI_UNKNOWN, regs); if (handled) { __this_cpu_add(nmi_stats.unknown, handled); return; @@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs) __this_cpu_write(last_nmi_rip, regs->ip); - handled = nmi_handle(NMI_LOCAL, regs, b2b); + handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { /* -- cgit v0.10.2 From 0233606ce5cf12c1a0e27cb197066ea5bc2bb488 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 19 Jul 2015 21:09:04 -0400 Subject: x86/entry/vm86: Clean up saved_fs/gs There is no need to save FS and non-lazy GS outside the 32-bit regs. Lazy GS still needs to be saved because it wasn't saved on syscall entry. Save it in the gs slot of regs32, which is present but unused. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437354550-25858-2-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 43e6519..f4e4e3f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -410,8 +410,6 @@ struct thread_struct { unsigned long v86flags; unsigned long v86mask; unsigned long saved_sp0; - unsigned int saved_fs; - unsigned int saved_gs; #endif /* IO permissions: */ unsigned long *io_bitmap_ptr; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index fc9db6e..761a2f9 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -159,8 +159,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) ret = KVM86->regs32; - ret->fs = current->thread.saved_fs; - set_user_gs(ret, current->thread.saved_gs); + lazy_load_gs(ret->gs); return ret; } @@ -315,8 +314,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk */ info->regs32->ax = VM86_SIGNAL; tsk->thread.saved_sp0 = tsk->thread.sp0; - tsk->thread.saved_fs = info->regs32->fs; - tsk->thread.saved_gs = get_user_gs(info->regs32); + lazy_save_gs(info->regs32->gs); tss = &per_cpu(cpu_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; -- cgit v0.10.2 From df1ae9a5dc66d9fd57109240042372b1065d984a Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 19 Jul 2015 21:09:05 -0400 Subject: x86/entry/vm86: Preserve 'orig_ax' There is no legitimate reason for usermode to modify the 'orig_ax' field on entry to vm86 mode, so copy it from the 32-bit regs. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437354550-25858-3-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 761a2f9..9a2dc80 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -294,6 +294,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; info->regs.pt.flags |= X86_VM_MASK; + info->regs.pt.orig_ax = info->regs32->orig_ax; + switch (info->cpu_type) { case CPU_286: tsk->thread.v86mask = 0; -- cgit v0.10.2 From ed0b2edb61ba4e557de759093d965654186f28b2 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Sun, 19 Jul 2015 21:09:06 -0400 Subject: x86/entry/vm86: Move userspace accesses to do_sys_vm86() Move the userspace accesses down into the common function in preparation for the next set of patches. Also change to copying the fields explicitly instead of assuming a fixed order in pt_regs and the kernel data structures. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1437354550-25858-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f4e4e3f..35ad554 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -405,7 +405,7 @@ struct thread_struct { unsigned long error_code; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ - struct vm86_struct __user *vm86_info; + struct vm86plus_struct __user *vm86_info; unsigned long screen_bitmap; unsigned long v86flags; unsigned long v86mask; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 9a2dc80..e6c2b47 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -90,46 +90,13 @@ #define SAFE_MASK (0xDD5) #define RETURN_MASK (0xDFF) -/* convert kernel_vm86_regs to vm86_regs */ -static int copy_vm86_regs_to_user(struct vm86_regs __user *user, - const struct kernel_vm86_regs *regs) -{ - int ret = 0; - - /* - * kernel_vm86_regs is missing gs, so copy everything up to - * (but not including) orig_eax, and then rest including orig_eax. - */ - ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax)); - ret += copy_to_user(&user->orig_eax, ®s->pt.orig_ax, - sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_ax)); - - return ret; -} - -/* convert vm86_regs to kernel_vm86_regs */ -static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs, - const struct vm86_regs __user *user, - unsigned extra) -{ - int ret = 0; - - /* copy ax-fs inclusive */ - ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax)); - /* copy orig_ax-__gsh+extra */ - ret += copy_from_user(®s->pt.orig_ax, &user->orig_eax, - sizeof(struct kernel_vm86_regs) - - offsetof(struct kernel_vm86_regs, pt.orig_ax) + - extra); - return ret; -} - struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) { struct tss_struct *tss; struct pt_regs *ret; - unsigned long tmp; + struct task_struct *tsk = current; + struct vm86plus_struct __user *user; + long err = 0; /* * This gets called from entry.S with interrupts disabled, but @@ -138,23 +105,50 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) */ local_irq_enable(); - if (!current->thread.vm86_info) { + if (!tsk->thread.vm86_info) { pr_alert("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask); - tmp = copy_vm86_regs_to_user(¤t->thread.vm86_info->regs, regs); - tmp += put_user(current->thread.screen_bitmap, ¤t->thread.vm86_info->screen_bitmap); - if (tmp) { + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | tsk->thread.v86mask); + user = tsk->thread.vm86_info; + + if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ? + sizeof(struct vm86plus_struct) : + sizeof(struct vm86_struct))) { + pr_alert("could not access userspace vm86_info\n"); + do_exit(SIGSEGV); + } + + put_user_try { + put_user_ex(regs->pt.bx, &user->regs.ebx); + put_user_ex(regs->pt.cx, &user->regs.ecx); + put_user_ex(regs->pt.dx, &user->regs.edx); + put_user_ex(regs->pt.si, &user->regs.esi); + put_user_ex(regs->pt.di, &user->regs.edi); + put_user_ex(regs->pt.bp, &user->regs.ebp); + put_user_ex(regs->pt.ax, &user->regs.eax); + put_user_ex(regs->pt.ip, &user->regs.eip); + put_user_ex(regs->pt.cs, &user->regs.cs); + put_user_ex(regs->pt.flags, &user->regs.eflags); + put_user_ex(regs->pt.sp, &user->regs.esp); + put_user_ex(regs->pt.ss, &user->regs.ss); + put_user_ex(regs->es, &user->regs.es); + put_user_ex(regs->ds, &user->regs.ds); + put_user_ex(regs->fs, &user->regs.fs); + put_user_ex(regs->gs, &user->regs.gs); + + put_user_ex(tsk->thread.screen_bitmap, &user->screen_bitmap); + } put_user_catch(err); + if (err) { pr_alert("could not access userspace vm86_info\n"); do_exit(SIGSEGV); } tss = &per_cpu(cpu_tss, get_cpu()); - current->thread.sp0 = current->thread.saved_sp0; - current->thread.sysenter_cs = __KERNEL_CS; - load_sp0(tss, ¤t->thread); - current->thread.saved_sp0 = 0; + tsk->thread.sp0 = tsk->thread.saved_sp0; + tsk->thread.sysenter_cs = __KERNEL_CS; + load_sp0(tss, &tsk->thread); + tsk->thread.saved_sp0 = 0; put_cpu(); ret = KVM86->regs32; @@ -199,7 +193,8 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); -static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); +static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, + struct kernel_vm86_struct *info); SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) { @@ -208,21 +203,8 @@ SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) * This remains on the stack until we * return to 32 bit user space. */ - struct task_struct *tsk = current; - int tmp; - if (tsk->thread.saved_sp0) - return -EPERM; - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, - offsetof(struct kernel_vm86_struct, vm86plus) - - sizeof(info.regs)); - if (tmp) - return -EFAULT; - memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus); - info.regs32 = current_pt_regs(); - tsk->thread.vm86_info = v86; - do_sys_vm86(&info, tsk); - return 0; /* we never return here */ + return do_sys_vm86((struct vm86plus_struct __user *) v86, false, &info); } @@ -233,11 +215,7 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) * This remains on the stack until we * return to 32 bit user space. */ - struct task_struct *tsk; - int tmp; - struct vm86plus_struct __user *v86; - tsk = current; switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: @@ -255,34 +233,69 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) } /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ - if (tsk->thread.saved_sp0) - return -EPERM; - v86 = (struct vm86plus_struct __user *)arg; - tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, - offsetof(struct kernel_vm86_struct, regs32) - - sizeof(info.regs)); - if (tmp) - return -EFAULT; - info.regs32 = current_pt_regs(); - info.vm86plus.is_vm86pus = 1; - tsk->thread.vm86_info = (struct vm86_struct __user *)v86; - do_sys_vm86(&info, tsk); - return 0; /* we never return here */ + return do_sys_vm86((struct vm86plus_struct __user *) arg, true, &info); } -static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk) +static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, + struct kernel_vm86_struct *info) { struct tss_struct *tss; -/* - * make sure the vm86() system call doesn't try to do anything silly - */ - info->regs.pt.ds = 0; - info->regs.pt.es = 0; - info->regs.pt.fs = 0; -#ifndef CONFIG_X86_32_LAZY_GS - info->regs.pt.gs = 0; -#endif + struct task_struct *tsk = current; + unsigned long err = 0; + + if (tsk->thread.saved_sp0) + return -EPERM; + + if (!access_ok(VERIFY_READ, v86, plus ? + sizeof(struct vm86_struct) : + sizeof(struct vm86plus_struct))) + return -EFAULT; + + memset(info, 0, sizeof(*info)); + get_user_try { + unsigned short seg; + get_user_ex(info->regs.pt.bx, &v86->regs.ebx); + get_user_ex(info->regs.pt.cx, &v86->regs.ecx); + get_user_ex(info->regs.pt.dx, &v86->regs.edx); + get_user_ex(info->regs.pt.si, &v86->regs.esi); + get_user_ex(info->regs.pt.di, &v86->regs.edi); + get_user_ex(info->regs.pt.bp, &v86->regs.ebp); + get_user_ex(info->regs.pt.ax, &v86->regs.eax); + get_user_ex(info->regs.pt.ip, &v86->regs.eip); + get_user_ex(seg, &v86->regs.cs); + info->regs.pt.cs = seg; + get_user_ex(info->regs.pt.flags, &v86->regs.eflags); + get_user_ex(info->regs.pt.sp, &v86->regs.esp); + get_user_ex(seg, &v86->regs.ss); + info->regs.pt.ss = seg; + get_user_ex(info->regs.es, &v86->regs.es); + get_user_ex(info->regs.ds, &v86->regs.ds); + get_user_ex(info->regs.fs, &v86->regs.fs); + get_user_ex(info->regs.gs, &v86->regs.gs); + + get_user_ex(info->flags, &v86->flags); + get_user_ex(info->screen_bitmap, &v86->screen_bitmap); + get_user_ex(info->cpu_type, &v86->cpu_type); + } get_user_catch(err); + if (err) + return err; + + if (copy_from_user(&info->int_revectored, &v86->int_revectored, + sizeof(struct revectored_struct))) + return -EFAULT; + if (copy_from_user(&info->int21_revectored, &v86->int21_revectored, + sizeof(struct revectored_struct))) + return -EFAULT; + if (plus) { + if (copy_from_user(&info->vm86plus, &v86->vm86plus, + sizeof(struct vm86plus_info_struct))) + return -EFAULT; + info->vm86plus.is_vm86pus = 1; + } + + info->regs32 = current_pt_regs(); + tsk->thread.vm86_info = v86; /* * The flags register is also special: we cannot trust that the user @@ -344,7 +357,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk "jmp resume_userspace" : /* no outputs */ :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); - /* we never return here */ + unreachable(); /* we never return here */ } static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval) -- cgit v0.10.2 From 9dea5dc921b5f4045a18c63eb92e84dc274d17eb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 14 Jul 2015 15:24:24 -0700 Subject: x86/entry/syscalls: Wire up 32-bit direct socket calls On x86_64, there's no socketcall syscall; instead all of the socket calls are real syscalls. For 32-bit programs, we're stuck offering the socketcall syscall, but it would be nice to expose the direct calls as well. This will enable seccomp to filter socket calls (for new userspace only, but that's fine for some applications) and it will provide a tiny performance boost. Signed-off-by: Andy Lutomirski Cc: Alexander Larsson Cc: Andy Lutomirski Cc: Cosimo Cecchi Cc: Dan Nicholson Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rajalakshmi Srinivasaraghavan Cc: Thomas Gleixner Cc: Tulio Magno Quites Machado Filho Cc: libc-alpha Link: http://lkml.kernel.org/r/cb5138299d37d5800e2d135b01a7667fa6115854.1436912629.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index ef8187f..25e3cf1 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -365,3 +365,18 @@ 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf 358 i386 execveat sys_execveat stub32_execveat +359 i386 socket sys_socket +360 i386 socketpair sys_socketpair +361 i386 bind sys_bind +362 i386 connect sys_connect +363 i386 listen sys_listen +364 i386 accept4 sys_accept4 +365 i386 getsockopt sys_getsockopt compat_sys_getsockopt +366 i386 setsockopt sys_setsockopt compat_sys_setsockopt +367 i386 getsockname sys_getsockname +368 i386 getpeername sys_getpeername +369 i386 sendto sys_sendto +370 i386 sendmsg sys_sendmsg compat_sys_sendmsg +371 i386 recvfrom sys_recvfrom compat_sys_recvfrom +372 i386 recvmsg sys_recvmsg compat_sys_recvmsg +373 i386 shutdown sys_shutdown -- cgit v0.10.2 From 3490565b633c705d2fb1f6ede51228952664663d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 13 Jul 2015 20:31:03 +0200 Subject: locking/spinlocks: Force inlining of spinlock ops With both gcc 4.7.2 and 4.9.2, sometimes GCC mysteriously doesn't inline very small functions we expect to be inlined. See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 In particular, with this config: http://busybox.net/~vda/kernel_config there are more than a thousand copies of tiny spinlock-related functions: $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin' 473 000000000000000b t spin_unlock_irqrestore 292 000000000000000b t spin_unlock 215 000000000000000b t spin_lock 134 000000000000000b t spin_unlock_irq 130 000000000000000b t spin_unlock_bh 120 000000000000000b t spin_lock_irq 106 000000000000000b t spin_lock_bh Disassembly: ffffffff81004720 : ffffffff81004720: 55 push %rbp ffffffff81004721: 48 89 e5 mov %rsp,%rbp ffffffff81004724: e8 f8 4e e2 02 callq <_raw_spin_lock> ffffffff81004729: 5d pop %rbp ffffffff8100472a: c3 retq This patch fixes this via s/inline/__always_inline/ in spinlock.h. This decreases vmlinux by about 40k: text data bss dec hex filename 82375570 22255544 20627456 125258570 7774b4a vmlinux.before 82335059 22255416 20627456 125217931 776ac8b vmlinux Signed-off-by: Denys Vlasenko Cc: Andrew Morton Cc: Andy Lutomirski Cc: Bart Van Assche Cc: Borislav Petkov Cc: Brian Gerst Cc: David Rientjes Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Thomas Graf Link: http://lkml.kernel.org/r/1436812263-15243-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 0063b24..ffcd053 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -296,7 +296,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ -static inline raw_spinlock_t *spinlock_check(spinlock_t *lock) +static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock) { return &lock->rlock; } @@ -307,17 +307,17 @@ do { \ raw_spin_lock_init(&(_lock)->rlock); \ } while (0) -static inline void spin_lock(spinlock_t *lock) +static __always_inline void spin_lock(spinlock_t *lock) { raw_spin_lock(&lock->rlock); } -static inline void spin_lock_bh(spinlock_t *lock) +static __always_inline void spin_lock_bh(spinlock_t *lock) { raw_spin_lock_bh(&lock->rlock); } -static inline int spin_trylock(spinlock_t *lock) +static __always_inline int spin_trylock(spinlock_t *lock) { return raw_spin_trylock(&lock->rlock); } @@ -337,7 +337,7 @@ do { \ raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock); \ } while (0) -static inline void spin_lock_irq(spinlock_t *lock) +static __always_inline void spin_lock_irq(spinlock_t *lock) { raw_spin_lock_irq(&lock->rlock); } @@ -352,32 +352,32 @@ do { \ raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \ } while (0) -static inline void spin_unlock(spinlock_t *lock) +static __always_inline void spin_unlock(spinlock_t *lock) { raw_spin_unlock(&lock->rlock); } -static inline void spin_unlock_bh(spinlock_t *lock) +static __always_inline void spin_unlock_bh(spinlock_t *lock) { raw_spin_unlock_bh(&lock->rlock); } -static inline void spin_unlock_irq(spinlock_t *lock) +static __always_inline void spin_unlock_irq(spinlock_t *lock) { raw_spin_unlock_irq(&lock->rlock); } -static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) +static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags) { raw_spin_unlock_irqrestore(&lock->rlock, flags); } -static inline int spin_trylock_bh(spinlock_t *lock) +static __always_inline int spin_trylock_bh(spinlock_t *lock) { return raw_spin_trylock_bh(&lock->rlock); } -static inline int spin_trylock_irq(spinlock_t *lock) +static __always_inline int spin_trylock_irq(spinlock_t *lock) { return raw_spin_trylock_irq(&lock->rlock); } @@ -387,22 +387,22 @@ static inline int spin_trylock_irq(spinlock_t *lock) raw_spin_trylock_irqsave(spinlock_check(lock), flags); \ }) -static inline void spin_unlock_wait(spinlock_t *lock) +static __always_inline void spin_unlock_wait(spinlock_t *lock) { raw_spin_unlock_wait(&lock->rlock); } -static inline int spin_is_locked(spinlock_t *lock) +static __always_inline int spin_is_locked(spinlock_t *lock) { return raw_spin_is_locked(&lock->rlock); } -static inline int spin_is_contended(spinlock_t *lock) +static __always_inline int spin_is_contended(spinlock_t *lock) { return raw_spin_is_contended(&lock->rlock); } -static inline int spin_can_lock(spinlock_t *lock) +static __always_inline int spin_can_lock(spinlock_t *lock) { return raw_spin_can_lock(&lock->rlock); } -- cgit v0.10.2 From 5aef51c340cb50ed9a3997dc5d782324372078bd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 10 Jul 2015 08:34:23 -0700 Subject: x86/kconfig/32: Rename CONFIG_VM86 and default it to 'n' VM86 is entirely broken if ptrace, syscall auditing, or NOHZ_FULL is in use. The code is a big undocumented mess, it's a real PITA to test, and it looks like a big chunk of vm86_32.c is dead code. It also plays awful games with the entry asm. No one should be using it anyway. Use DOSBOX or KVM instead. Let's accelerate its slow death. Remove it from EXPERT and default it to n. Distros should not enable it. In the unlikely event that some user needs it, they can easily re-enable it. While we're at it, rename it to CONFIG_X86_LEGACY_VM86 so that 'make oldconfig' users will be prompted again. I left CONFIG_VM86 as an alias to avoid a treewide replacement of the names. We can clean that up once the current asm and vm86 code churn settles down. Signed-off-by: Andy Lutomirski Cc: Arjan van de Ven Cc: Austin S Hemmelgarn Cc: Borislav Petkov Cc: Brian Gerst Cc: Kees Cook Cc: Linus Torvalds Cc: Matthew Garrett Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d29c6cc442d32d4df58849d2f8c89fb39ff88d61.1436542295.git.luto@kernel.org [ Refined it some more. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index aa94fd0..2cb2211 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -996,15 +996,36 @@ config X86_THERMAL_VECTOR def_bool y depends on X86_MCE_INTEL -config VM86 - bool "Enable VM86 support" if EXPERT - default y +config X86_LEGACY_VM86 + bool "Legacy VM86 support (obsolete)" + default n depends on X86_32 ---help--- - This option is required by programs like DOSEMU to run - 16-bit real mode legacy code on x86 processors. It also may - be needed by software like XFree86 to initialize some video - cards via BIOS. Disabling this option saves about 6K. + This option allows user programs to put the CPU into V8086 + mode, which is an 80286-era approximation of 16-bit real mode. + + Some very old versions of X and/or vbetool require this option + for user mode setting. Similarly, DOSEMU will use it if + available to accelerate real mode DOS programs. However, any + recent version of DOSEMU, X, or vbetool should be fully + functional even without kernel VM86 support, as they will all + fall back to (pretty well performing) software emulation. + + Anything that works on a 64-bit kernel is unlikely to need + this option, as 64-bit kernels don't, and can't, support V8086 + mode. This option is also unrelated to 16-bit protected mode + and is not needed to run most 16-bit programs under Wine. + + Enabling this option adds considerable attack surface to the + kernel and slows down system calls and exception handling. + + Unless you use very old userspace or need the last drop of + performance in your real mode DOS games and can't use KVM, + say N here. + +config VM86 + bool + default X86_LEGACY_VM86 config X86_16BIT bool "Enable support for 16-bit segments" if EXPERT -- cgit v0.10.2 From f2a50f8b7da45ff2de93a71393e715a2ab9f3b68 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 9 Jul 2015 19:17:29 -0700 Subject: x86/selftests, x86/vm86: Improve entry_from_vm86 selftest The entry_from_vm86 selftest was very weak. Improve it: test more types of kernel entries from vm86 mode and test them more carefully. While we're at it, try to improve behavior on non-SEP CPUs. The old code was buggy because I misunderstood the intended semantics of #UD in vm86, so I didn't handle a possible signal. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d8ef1d7368ac70d8342481563ed50f9a7d2eea6f.1436492057.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c index 5c38a18..f004b2a 100644 --- a/tools/testing/selftests/x86/entry_from_vm86.c +++ b/tools/testing/selftests/x86/entry_from_vm86.c @@ -28,6 +28,55 @@ static unsigned long load_addr = 0x10000; static int nerrs = 0; +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static sig_atomic_t got_signal; + +static void sighandler(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + + if (ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_VM || + (ctx->uc_mcontext.gregs[REG_CS] & 3) != 3) { + printf("[FAIL]\tSignal frame should not reflect vm86 mode\n"); + nerrs++; + } + + const char *signame; + if (sig == SIGSEGV) + signame = "SIGSEGV"; + else if (sig == SIGILL) + signame = "SIGILL"; + else + signame = "unexpected signal"; + + printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame, + (unsigned long)ctx->uc_mcontext.gregs[REG_EFL], + (unsigned short)ctx->uc_mcontext.gregs[REG_CS]); + + got_signal = 1; +} + asm ( ".pushsection .rodata\n\t" ".type vmcode_bound, @object\n\t" @@ -38,6 +87,14 @@ asm ( "int3\n\t" "vmcode_sysenter:\n\t" "sysenter\n\t" + "vmcode_syscall:\n\t" + "syscall\n\t" + "vmcode_sti:\n\t" + "sti\n\t" + "vmcode_int3:\n\t" + "int3\n\t" + "vmcode_int80:\n\t" + "int $0x80\n\t" ".size vmcode, . - vmcode\n\t" "end_vmcode:\n\t" ".code32\n\t" @@ -45,9 +102,11 @@ asm ( ); extern unsigned char vmcode[], end_vmcode[]; -extern unsigned char vmcode_bound[], vmcode_sysenter[]; +extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[], + vmcode_sti[], vmcode_int3[], vmcode_int80[]; static void do_test(struct vm86plus_struct *v86, unsigned long eip, + unsigned int rettype, unsigned int retarg, const char *text) { long ret; @@ -73,13 +132,28 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip, else sprintf(trapname, "%d", trapno); - printf("[OK]\tExited vm86 mode due to #%s\n", trapname); + printf("[INFO]\tExited vm86 mode due to #%s\n", trapname); } else if (VM86_TYPE(ret) == VM86_UNKNOWN) { - printf("[OK]\tExited vm86 mode due to unhandled GP fault\n"); + printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n"); + } else if (VM86_TYPE(ret) == VM86_TRAP) { + printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n", + VM86_ARG(ret)); + } else if (VM86_TYPE(ret) == VM86_SIGNAL) { + printf("[INFO]\tExited vm86 mode due to a signal\n"); + } else if (VM86_TYPE(ret) == VM86_STI) { + printf("[INFO]\tExited vm86 mode due to STI\n"); } else { - printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n", + printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n", VM86_TYPE(ret), VM86_ARG(ret)); } + + if (rettype == -1 || + (VM86_TYPE(ret) == rettype && VM86_ARG(ret) == retarg)) { + printf("[OK]\tReturned correctly\n"); + } else { + printf("[FAIL]\tIncorrect return reason\n"); + nerrs++; + } } int main(void) @@ -105,10 +179,52 @@ int main(void) assert((v86.regs.cs & 3) == 0); /* Looks like RPL = 0 */ /* #BR -- should deliver SIG??? */ - do_test(&v86, vmcode_bound - vmcode, "#BR"); - - /* SYSENTER -- should cause #GP or #UD depending on CPU */ - do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER"); + do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR"); + + /* + * SYSENTER -- should cause #GP or #UD depending on CPU. + * Expected return type -1 means that we shouldn't validate + * the vm86 return value. This will avoid problems on non-SEP + * CPUs. + */ + sethandler(SIGILL, sighandler, 0); + do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER"); + clearhandler(SIGILL); + + /* + * SYSCALL would be a disaster in VM86 mode. Fortunately, + * there is no kernel that both enables SYSCALL and sets + * EFER.SCE, so it's #UD on all systems. But vm86 is + * buggy (or has a "feature"), so the SIGILL will actually + * be delivered. + */ + sethandler(SIGILL, sighandler, 0); + do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL"); + clearhandler(SIGILL); + + /* STI with VIP set */ + v86.regs.eflags |= X86_EFLAGS_VIP; + v86.regs.eflags &= ~X86_EFLAGS_IF; + do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set"); + + /* INT3 -- should cause #BP */ + do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3"); + + /* INT80 -- should exit with "INTx 0x80" */ + v86.regs.eax = (unsigned int)-1; + do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80"); + + /* Execute a null pointer */ + v86.regs.cs = 0; + v86.regs.ss = 0; + sethandler(SIGSEGV, sighandler, 0); + got_signal = 0; + do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer"); + if (!got_signal) { + printf("[FAIL]\tDid not receive SIGSEGV\n"); + nerrs++; + } + clearhandler(SIGSEGV); return (nerrs == 0 ? 0 : 1); } -- cgit v0.10.2 From b2c51106c7581866c37ffc77c5d739f3d4b7cbc9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 21 Jul 2015 09:27:18 -0700 Subject: x86/build: Fix detection of GCC -mpreferred-stack-boundary support As per: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 GCC only allows -mpreferred-stack-boundary=3 on x86_64 if -mno-sse is set. That means that cc-option will not detect -mpreferred-stack-boundary=3 support, because we test for it before setting -mno-sse. Fix it by reordering the Makefile bits. Compile-tested only. This should help avoid code generation issues such as the one that was worked around in: b96fecbfa8c8 ("x86/fpu: Fix boot crash in the early FPU code") I'm a bit concerned that we could still have problems on older GCC versions given that our asm code does not respect GCC's idea of the ABI-required stack alignment. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jan Kara Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/f5297c192969adfa0d28b84cf8a22d59573db26d.1436126872.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 118e6de..054ff96 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS LDFLAGS_vmlinux := --emit-relocs endif +# +# Prevent GCC from generating any FP code by mistake. +# +# This must happen before we try the -mpreferred-stack-boundary, see: +# +# https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 +# +KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow +KBUILD_CFLAGS += $(call cc-option,-mno-avx,) + ifeq ($(CONFIG_X86_32),y) BITS := 32 UTS_MACHINE := i386 @@ -167,9 +177,6 @@ KBUILD_CFLAGS += -pipe KBUILD_CFLAGS += -Wno-sign-compare # KBUILD_CFLAGS += -fno-asynchronous-unwind-tables -# prevent gcc from generating any FP code by mistake -KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -KBUILD_CFLAGS += $(call cc-option,-mno-avx,) KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) -- cgit v0.10.2 From 014dc90b66c8d0b5f5a9400440727c134ee5e5a3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 14:31:33 -0700 Subject: selftests/x86, x86/ldt: Add a selftest for modify_ldt() This tests general modify_ldt() behavior (only writes, so far) as well as synchronous updates via IPI. It fails on old kernels. I called this ldt_gdt because I'll add set_thread_area() tests to it at some point. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jan Beulich Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt Cc: Thomas Gleixner Cc: security@kernel.org Cc: xen-devel Link: http://lkml.kernel.org/r/dcfda65dad07ff5a3ea97a9172b5963bf8031b2e.1438291540.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index e8df47e..b70da4a 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,7 +4,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs +TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs ldt_gdt TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c new file mode 100644 index 0000000..31a3035 --- /dev/null +++ b/tools/testing/selftests/x86/ldt_gdt.c @@ -0,0 +1,576 @@ +/* + * ldt_gdt.c - Test cases for LDT and GDT access + * Copyright (c) 2015 Andrew Lutomirski + */ + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define AR_ACCESSED (1<<8) + +#define AR_TYPE_RODATA (0 * (1<<9)) +#define AR_TYPE_RWDATA (1 * (1<<9)) +#define AR_TYPE_RODATA_EXPDOWN (2 * (1<<9)) +#define AR_TYPE_RWDATA_EXPDOWN (3 * (1<<9)) +#define AR_TYPE_XOCODE (4 * (1<<9)) +#define AR_TYPE_XRCODE (5 * (1<<9)) +#define AR_TYPE_XOCODE_CONF (6 * (1<<9)) +#define AR_TYPE_XRCODE_CONF (7 * (1<<9)) + +#define AR_DPL3 (3 * (1<<13)) + +#define AR_S (1 << 12) +#define AR_P (1 << 15) +#define AR_AVL (1 << 20) +#define AR_L (1 << 21) +#define AR_DB (1 << 22) +#define AR_G (1 << 23) + +static int nerrs; + +static void check_invalid_segment(uint16_t index, int ldt) +{ + uint32_t has_limit = 0, has_ar = 0, limit, ar; + uint32_t selector = (index << 3) | (ldt << 2) | 3; + + asm ("lsl %[selector], %[limit]\n\t" + "jnz 1f\n\t" + "movl $1, %[has_limit]\n\t" + "1:" + : [limit] "=r" (limit), [has_limit] "+rm" (has_limit) + : [selector] "r" (selector)); + asm ("larl %[selector], %[ar]\n\t" + "jnz 1f\n\t" + "movl $1, %[has_ar]\n\t" + "1:" + : [ar] "=r" (ar), [has_ar] "+rm" (has_ar) + : [selector] "r" (selector)); + + if (has_limit || has_ar) { + printf("[FAIL]\t%s entry %hu is valid but should be invalid\n", + (ldt ? "LDT" : "GDT"), index); + nerrs++; + } else { + printf("[OK]\t%s entry %hu is invalid\n", + (ldt ? "LDT" : "GDT"), index); + } +} + +static void check_valid_segment(uint16_t index, int ldt, + uint32_t expected_ar, uint32_t expected_limit, + bool verbose) +{ + uint32_t has_limit = 0, has_ar = 0, limit, ar; + uint32_t selector = (index << 3) | (ldt << 2) | 3; + + asm ("lsl %[selector], %[limit]\n\t" + "jnz 1f\n\t" + "movl $1, %[has_limit]\n\t" + "1:" + : [limit] "=r" (limit), [has_limit] "+rm" (has_limit) + : [selector] "r" (selector)); + asm ("larl %[selector], %[ar]\n\t" + "jnz 1f\n\t" + "movl $1, %[has_ar]\n\t" + "1:" + : [ar] "=r" (ar), [has_ar] "+rm" (has_ar) + : [selector] "r" (selector)); + + if (!has_limit || !has_ar) { + printf("[FAIL]\t%s entry %hu is invalid but should be valid\n", + (ldt ? "LDT" : "GDT"), index); + nerrs++; + return; + } + + if (ar != expected_ar) { + printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n", + (ldt ? "LDT" : "GDT"), index, ar, expected_ar); + nerrs++; + } else if (limit != expected_limit) { + printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n", + (ldt ? "LDT" : "GDT"), index, limit, expected_limit); + nerrs++; + } else if (verbose) { + printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n", + (ldt ? "LDT" : "GDT"), index, ar, limit); + } +} + +static bool install_valid_mode(const struct user_desc *desc, uint32_t ar, + bool oldmode) +{ + int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, + desc, sizeof(*desc)); + if (ret < -1) + errno = -ret; + if (ret == 0) { + uint32_t limit = desc->limit; + if (desc->limit_in_pages) + limit = (limit << 12) + 4095; + check_valid_segment(desc->entry_number, 1, ar, limit, true); + return true; + } else if (errno == ENOSYS) { + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); + return false; + } else { + if (desc->seg_32bit) { + printf("[FAIL]\tUnexpected modify_ldt failure %d\n", + errno); + nerrs++; + return false; + } else { + printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); + return false; + } + } +} + +static bool install_valid(const struct user_desc *desc, uint32_t ar) +{ + return install_valid_mode(desc, ar, false); +} + +static void install_invalid(const struct user_desc *desc, bool oldmode) +{ + int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11, + desc, sizeof(*desc)); + if (ret < -1) + errno = -ret; + if (ret == 0) { + check_invalid_segment(desc->entry_number, 1); + } else if (errno == ENOSYS) { + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); + } else { + if (desc->seg_32bit) { + printf("[FAIL]\tUnexpected modify_ldt failure %d\n", + errno); + nerrs++; + } else { + printf("[OK]\tmodify_ldt rejected 16 bit segment\n"); + } + } +} + +static int safe_modify_ldt(int func, struct user_desc *ptr, + unsigned long bytecount) +{ + int ret = syscall(SYS_modify_ldt, 0x11, ptr, bytecount); + if (ret < -1) + errno = -ret; + return ret; +} + +static void fail_install(struct user_desc *desc) +{ + if (safe_modify_ldt(0x11, desc, sizeof(*desc)) == 0) { + printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n"); + nerrs++; + } else if (errno == ENOSYS) { + printf("[OK]\tmodify_ldt returned -ENOSYS\n"); + } else { + printf("[OK]\tmodify_ldt failure %d\n", errno); + } +} + +static void do_simple_tests(void) +{ + struct user_desc desc = { + .entry_number = 0, + .base_addr = 0, + .limit = 10, + .seg_32bit = 1, + .contents = 2, /* Code, not conforming */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 0, + .useable = 0 + }; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB); + + desc.limit_in_pages = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | + AR_S | AR_P | AR_DB | AR_G); + + check_invalid_segment(1, 1); + + desc.entry_number = 2; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | + AR_S | AR_P | AR_DB | AR_G); + + check_invalid_segment(1, 1); + + desc.base_addr = 0xf0000000; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | + AR_S | AR_P | AR_DB | AR_G); + + desc.useable = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | + AR_S | AR_P | AR_DB | AR_G | AR_AVL); + + desc.seg_not_present = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | + AR_S | AR_DB | AR_G | AR_AVL); + + desc.seg_32bit = 0; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | + AR_S | AR_G | AR_AVL); + + desc.seg_32bit = 1; + desc.contents = 0; + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | + AR_S | AR_DB | AR_G | AR_AVL); + + desc.read_exec_only = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | + AR_S | AR_DB | AR_G | AR_AVL); + + desc.contents = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN | + AR_S | AR_DB | AR_G | AR_AVL); + + desc.read_exec_only = 0; + desc.limit_in_pages = 0; + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN | + AR_S | AR_DB | AR_AVL); + + desc.contents = 3; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF | + AR_S | AR_DB | AR_AVL); + + desc.read_exec_only = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF | + AR_S | AR_DB | AR_AVL); + + desc.read_exec_only = 0; + desc.contents = 2; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | + AR_S | AR_DB | AR_AVL); + + desc.read_exec_only = 1; + +#ifdef __x86_64__ + desc.lm = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE | + AR_S | AR_DB | AR_AVL); + desc.lm = 0; +#endif + + bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE | + AR_S | AR_DB | AR_AVL); + + if (entry1_okay) { + printf("[RUN]\tTest fork\n"); + pid_t child = fork(); + if (child == 0) { + nerrs = 0; + check_valid_segment(desc.entry_number, 1, + AR_DPL3 | AR_TYPE_XOCODE | + AR_S | AR_DB | AR_AVL, desc.limit, + true); + check_invalid_segment(1, 1); + exit(nerrs ? 1 : 0); + } else { + int status; + if (waitpid(child, &status, 0) != child || + !WIFEXITED(status)) { + printf("[FAIL]\tChild died\n"); + nerrs++; + } else if (WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed\n"); + nerrs++; + } else { + printf("[OK]\tChild succeeded\n"); + } + } + + printf("[RUN]\tTest size\n"); + int i; + for (i = 0; i < 8192; i++) { + desc.entry_number = i; + desc.limit = i; + if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) { + printf("[FAIL]\tFailed to install entry %d\n", i); + nerrs++; + break; + } + } + for (int j = 0; j < i; j++) { + check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE | + AR_S | AR_DB | AR_AVL, j, false); + } + printf("[DONE]\tSize test\n"); + } else { + printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n"); + } + + /* Test entry_number too high. */ + desc.entry_number = 8192; + fail_install(&desc); + + /* Test deletion and actions mistakeable for deletion. */ + memset(&desc, 0, sizeof(desc)); + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P); + + desc.seg_not_present = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S); + + desc.seg_not_present = 0; + desc.read_exec_only = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P); + + desc.read_exec_only = 0; + desc.seg_not_present = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S); + + desc.read_exec_only = 1; + desc.limit = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S); + + desc.limit = 0; + desc.base_addr = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S); + + desc.base_addr = 0; + install_invalid(&desc, false); + + desc.seg_not_present = 0; + desc.read_exec_only = 0; + desc.seg_32bit = 1; + install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB); + install_invalid(&desc, true); +} + +/* + * 0: thread is idle + * 1: thread armed + * 2: thread should clear LDT entry 0 + * 3: thread should exit + */ +static volatile unsigned int ftx; + +static void *threadproc(void *ctx) +{ + cpu_set_t cpuset; + CPU_ZERO(&cpuset); + CPU_SET(1, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) + err(1, "sched_setaffinity to CPU 1"); /* should never fail */ + + while (1) { + syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0); + while (ftx != 2) { + if (ftx >= 3) + return NULL; + } + + /* clear LDT entry 0 */ + const struct user_desc desc = {}; + if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0) + err(1, "modify_ldt"); + + /* If ftx == 2, set it to zero. If ftx == 100, quit. */ + unsigned int x = -2; + asm volatile ("lock xaddl %[x], %[ftx]" : + [x] "+r" (x), [ftx] "+m" (ftx)); + if (x != 2) + return NULL; + } +} + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); + +} + +static jmp_buf jmpbuf; + +static void sigsegv(int sig, siginfo_t *info, void *ctx_void) +{ + siglongjmp(jmpbuf, 1); +} + +static void do_multicpu_tests(void) +{ + cpu_set_t cpuset; + pthread_t thread; + int failures = 0, iters = 5, i; + unsigned short orig_ss; + + CPU_ZERO(&cpuset); + CPU_SET(1, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { + printf("[SKIP]\tCannot set affinity to CPU 1\n"); + return; + } + + CPU_ZERO(&cpuset); + CPU_SET(0, &cpuset); + if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) { + printf("[SKIP]\tCannot set affinity to CPU 0\n"); + return; + } + + sethandler(SIGSEGV, sigsegv, 0); +#ifdef __i386__ + /* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */ + sethandler(SIGILL, sigsegv, 0); +#endif + + printf("[RUN]\tCross-CPU LDT invalidation\n"); + + if (pthread_create(&thread, 0, threadproc, 0) != 0) + err(1, "pthread_create"); + + asm volatile ("mov %%ss, %0" : "=rm" (orig_ss)); + + for (i = 0; i < 5; i++) { + if (sigsetjmp(jmpbuf, 1) != 0) + continue; + + /* Make sure the thread is ready after the last test. */ + while (ftx != 0) + ; + + struct user_desc desc = { + .entry_number = 0, + .base_addr = 0, + .limit = 0xfffff, + .seg_32bit = 1, + .contents = 0, /* Data */ + .read_exec_only = 0, + .limit_in_pages = 1, + .seg_not_present = 0, + .useable = 0 + }; + + if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) { + if (errno != ENOSYS) + err(1, "modify_ldt"); + printf("[SKIP]\tmodify_ldt unavailable\n"); + break; + } + + /* Arm the thread. */ + ftx = 1; + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); + + asm volatile ("mov %0, %%ss" : : "r" (0x7)); + + /* Go! */ + ftx = 2; + + while (ftx != 0) + ; + + /* + * On success, modify_ldt will segfault us synchronously, + * and we'll escape via siglongjmp. + */ + + failures++; + asm volatile ("mov %0, %%ss" : : "rm" (orig_ss)); + }; + + ftx = 100; /* Kill the thread. */ + syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0); + + if (pthread_join(thread, NULL) != 0) + err(1, "pthread_join"); + + if (failures) { + printf("[FAIL]\t%d of %d iterations failed\n", failures, iters); + nerrs++; + } else { + printf("[OK]\tAll %d iterations succeeded\n", iters); + } +} + +static int finish_exec_test(void) +{ + /* + * In a sensible world, this would be check_invalid_segment(0, 1); + * For better or for worse, though, the LDT is inherited across exec. + * We can probably change this safely, but for now we test it. + */ + check_valid_segment(0, 1, + AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB, + 42, true); + + return nerrs ? 1 : 0; +} + +static void do_exec_test(void) +{ + printf("[RUN]\tTest exec\n"); + + struct user_desc desc = { + .entry_number = 0, + .base_addr = 0, + .limit = 42, + .seg_32bit = 1, + .contents = 2, /* Code, not conforming */ + .read_exec_only = 0, + .limit_in_pages = 0, + .seg_not_present = 0, + .useable = 0 + }; + install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB); + + pid_t child = fork(); + if (child == 0) { + execl("/proc/self/exe", "ldt_gdt_test_exec", NULL); + printf("[FAIL]\tCould not exec self\n"); + exit(1); /* exec failed */ + } else { + int status; + if (waitpid(child, &status, 0) != child || + !WIFEXITED(status)) { + printf("[FAIL]\tChild died\n"); + nerrs++; + } else if (WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed\n"); + nerrs++; + } else { + printf("[OK]\tChild succeeded\n"); + } + } +} + +int main(int argc, char **argv) +{ + if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec")) + return finish_exec_test(); + + do_simple_tests(); + + do_multicpu_tests(); + + do_exec_test(); + + return nerrs ? 1 : 0; +} -- cgit v0.10.2 From a5b9e5a2f14f25a8dae987494d50ad3aac7366b6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 30 Jul 2015 14:31:34 -0700 Subject: x86/ldt: Make modify_ldt() optional The modify_ldt syscall exposes a large attack surface and is unnecessary for modern userspace. Make it optional. Signed-off-by: Andy Lutomirski Reviewed-by: Kees Cook Cc: Andrew Cooper Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jan Beulich Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sasha Levin Cc: Steven Rostedt Cc: Thomas Gleixner Cc: security@kernel.org Cc: xen-devel Link: http://lkml.kernel.org/r/a605166a771c343fd64802dece77a903507333bd.1438291540.git.luto@kernel.org [ Made MATH_EMULATION dependent on MODIFY_LDT_SYSCALL. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index e26fe7a..7986580 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1036,6 +1036,7 @@ config VM86 config X86_16BIT bool "Enable support for 16-bit segments" if EXPERT default y + depends on MODIFY_LDT_SYSCALL ---help--- This option is required by programs like Wine to run 16-bit protected mode legacy code on x86 processors. Disabling @@ -1530,6 +1531,7 @@ config X86_RESERVE_LOW config MATH_EMULATION bool + depends on MODIFY_LDT_SYSCALL prompt "Math emulation" if X86_32 ---help--- Linux can emulate a math coprocessor (used for floating point @@ -2074,6 +2076,22 @@ config CMDLINE_OVERRIDE This is used to work around broken boot loaders. This should be set to 'N' under normal conditions. +config MODIFY_LDT_SYSCALL + bool "Enable the LDT (local descriptor table)" if EXPERT + default y + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system + call. This is required to run 16-bit or segmented code such as + DOSEMU or some Wine programs. It is also used by some very old + threading libraries. + + Enabling this feature adds a small amount of overhead to + context switches and increases the low-level kernel attack + surface. Disabling it removes the modify_ldt(2) system call. + + Saying 'N' here may make sense for embedded or server kernels. + source "kernel/livepatch/Kconfig" endmenu diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h index 364d274..55234d5 100644 --- a/arch/x86/include/asm/mmu.h +++ b/arch/x86/include/asm/mmu.h @@ -9,7 +9,9 @@ * we put the segment information here. */ typedef struct { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; +#endif #ifdef CONFIG_X86_64 /* True if mm supports a task running in 32 bit compatibility mode. */ diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h index 984abfe..379cd36 100644 --- a/arch/x86/include/asm/mmu_context.h +++ b/arch/x86/include/asm/mmu_context.h @@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm) static inline void load_mm_cr4(struct mm_struct *mm) {} #endif +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * ldt_structs can be allocated, used, and freed, but they are never * modified while live. @@ -48,8 +49,23 @@ struct ldt_struct { int size; }; +/* + * Used for LDT copy/destruction. + */ +int init_new_context(struct task_struct *tsk, struct mm_struct *mm); +void destroy_context(struct mm_struct *mm); +#else /* CONFIG_MODIFY_LDT_SYSCALL */ +static inline int init_new_context(struct task_struct *tsk, + struct mm_struct *mm) +{ + return 0; +} +static inline void destroy_context(struct mm_struct *mm) {} +#endif + static inline void load_mm_ldt(struct mm_struct *mm) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; /* lockless_dereference synchronizes with smp_store_release */ @@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm) set_ldt(ldt->entries, ldt->size); else clear_LDT(); +#else + clear_LDT(); +#endif DEBUG_LOCKS_WARN_ON(preemptible()); } -/* - * Used for LDT copy/destruction. - */ -int init_new_context(struct task_struct *tsk, struct mm_struct *mm); -void destroy_context(struct mm_struct *mm); - - static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) { #ifdef CONFIG_SMP @@ -114,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, /* Load per-mm CR4 state */ load_mm_cr4(next); +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * Load the LDT, if the LDT is different. * @@ -128,6 +141,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, */ if (unlikely(prev->context.ldt != next->context.ldt)) load_mm_ldt(next); +#endif } #ifdef CONFIG_SMP else { diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index dc19730..5140648 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -25,7 +25,8 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace obj-y := process_$(BITS).o signal.o obj-$(CONFIG_COMPAT) += signal_compat.o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o -obj-y += time.o ioport.o ldt.o dumpstack.o nmi.o +obj-y += time.o ioport.o dumpstack.o nmi.o +obj-$(CONFIG_MODIFY_LDT_SYSCALL) += ldt.o obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 09f9ff2..cc25c8f 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment) int idx = segment >> 3; if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL struct ldt_struct *ldt; if (idx > LDT_ENTRIES) @@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment) return 0; desc = &ldt->entries[idx]; +#else + return 0; +#endif } else { if (idx > GDT_ENTRIES) return 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7ff035c..3c1bbcf 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all) void release_thread(struct task_struct *dead_task) { if (dead_task->mm) { +#ifdef CONFIG_MODIFY_LDT_SYSCALL if (dead_task->mm->context.ldt) { pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n", dead_task->comm, @@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task) dead_task->mm->context.ldt->size); BUG(); } +#endif } } diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 6273324..fd88e15 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re return addr; } +#ifdef CONFIG_MODIFY_LDT_SYSCALL /* * We'll assume that the code segments in the GDT * are all zero-based. That is largely true: the @@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re } mutex_unlock(&child->mm->context.lock); } +#endif return addr; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 7995ef5..ca7d84f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask); cond_syscall(sys_ssetmask); cond_syscall(sys_vm86old); cond_syscall(sys_vm86); +cond_syscall(sys_modify_ldt); cond_syscall(sys_ipc); cond_syscall(compat_sys_ipc); cond_syscall(compat_sys_sysctl); -- cgit v0.10.2 From e800eb39e3f586e46a2007f72d3b609f6e3b888d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 29 Jul 2015 14:35:43 -0700 Subject: selftests/x86/vm86: Fix entry_from_vm86 test on 64-bit kernels The test failed due to an oversight on my part when run on a 64-bit kernel. vm86 isn't expected to work at all, and I mistakenly failed one part of the test because no signal was delivered. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/502c8bef877b33fe4943885ded6125dfcc7892db.1438205722.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c index f004b2a..9a43a59 100644 --- a/tools/testing/selftests/x86/entry_from_vm86.c +++ b/tools/testing/selftests/x86/entry_from_vm86.c @@ -105,7 +105,8 @@ extern unsigned char vmcode[], end_vmcode[]; extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[], vmcode_sti[], vmcode_int3[], vmcode_int80[]; -static void do_test(struct vm86plus_struct *v86, unsigned long eip, +/* Returns false if the test was skipped. */ +static bool do_test(struct vm86plus_struct *v86, unsigned long eip, unsigned int rettype, unsigned int retarg, const char *text) { @@ -117,7 +118,7 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip, if (ret == -1 && errno == ENOSYS) { printf("[SKIP]\tvm86 not supported\n"); - return; + return false; } if (VM86_TYPE(ret) == VM86_INTx) { @@ -154,6 +155,8 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip, printf("[FAIL]\tIncorrect return reason\n"); nerrs++; } + + return true; } int main(void) @@ -219,8 +222,8 @@ int main(void) v86.regs.ss = 0; sethandler(SIGSEGV, sighandler, 0); got_signal = 0; - do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer"); - if (!got_signal) { + if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") && + !got_signal) { printf("[FAIL]\tDid not receive SIGSEGV\n"); nerrs++; } -- cgit v0.10.2 From 9fda6a0681e070b496235b132bc70ceb80300211 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 29 Jul 2015 01:41:16 -0400 Subject: x86/vm86: Move vm86 fields out of 'thread_struct' Allocate a separate structure for the vm86 fields. Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438148483-11932-2-git-send-email-brgerst@gmail.com [ Build fixes. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index befc134..9615a4e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -6,8 +6,8 @@ /* Forward declaration, a strange C thing */ struct task_struct; struct mm_struct; +struct vm86; -#include #include #include #include @@ -400,13 +400,9 @@ struct thread_struct { unsigned long cr2; unsigned long trap_nr; unsigned long error_code; -#ifdef CONFIG_X86_32 +#ifdef CONFIG_VM86 /* Virtual 86 mode info */ - struct vm86plus_struct __user *vm86_info; - unsigned long screen_bitmap; - unsigned long v86flags; - unsigned long v86mask; - unsigned long saved_sp0; + struct vm86 *vm86; #endif /* IO permissions: */ unsigned long *io_bitmap_ptr; @@ -718,7 +714,6 @@ static inline void spin_lock_prefetch(const void *x) #define INIT_THREAD { \ .sp0 = TOP_OF_INIT_STACK, \ - .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ } diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 1d8de3f..20b43b7 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -1,7 +1,6 @@ #ifndef _ASM_X86_VM86_H #define _ASM_X86_VM86_H - #include #include @@ -58,6 +57,14 @@ struct kernel_vm86_struct { */ }; +struct vm86 { + struct vm86plus_struct __user *vm86_info; + unsigned long screen_bitmap; + unsigned long v86flags; + unsigned long v86mask; + unsigned long saved_sp0; +}; + #ifdef CONFIG_VM86 void handle_vm86_fault(struct kernel_vm86_regs *, long); @@ -67,6 +74,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *); struct task_struct; void release_vm86_irqs(struct task_struct *); +#define free_vm86(t) do { \ + struct thread_struct *__t = (t); \ + if (__t->vm86 != NULL) { \ + kfree(__t->vm86); \ + __t->vm86 = NULL; \ + } \ +} while (0) + #else #define handle_vm86_fault(a, b) @@ -77,6 +92,8 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) return 0; } +#define free_vm86(t) do { } while(0) + #endif /* CONFIG_VM86 */ #endif /* _ASM_X86_VM86_H */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 397688b..2199d9b 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -29,6 +29,7 @@ #include #include #include +#include /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, @@ -110,6 +111,8 @@ void exit_thread(void) kfree(bp); } + free_vm86(t); + fpu__drop(fpu); } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e6c2b47..bfa59b1 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -81,8 +82,8 @@ /* * virtual flags (16 and 32-bit versions) */ -#define VFLAGS (*(unsigned short *)&(current->thread.v86flags)) -#define VEFLAGS (current->thread.v86flags) +#define VFLAGS (*(unsigned short *)&(current->thread.vm86->v86flags)) +#define VEFLAGS (current->thread.vm86->v86flags) #define set_flags(X, new, mask) \ ((X) = ((X) & ~(mask)) | ((new) & (mask))) @@ -96,6 +97,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) struct pt_regs *ret; struct task_struct *tsk = current; struct vm86plus_struct __user *user; + struct vm86 *vm86 = current->thread.vm86; long err = 0; /* @@ -105,12 +107,12 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) */ local_irq_enable(); - if (!tsk->thread.vm86_info) { + if (!vm86 || !vm86->vm86_info) { pr_alert("no vm86_info: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | tsk->thread.v86mask); - user = tsk->thread.vm86_info; + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask); + user = vm86->vm86_info; if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ? sizeof(struct vm86plus_struct) : @@ -137,7 +139,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) put_user_ex(regs->fs, &user->regs.fs); put_user_ex(regs->gs, &user->regs.gs); - put_user_ex(tsk->thread.screen_bitmap, &user->screen_bitmap); + put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); } put_user_catch(err); if (err) { pr_alert("could not access userspace vm86_info\n"); @@ -145,10 +147,10 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) } tss = &per_cpu(cpu_tss, get_cpu()); - tsk->thread.sp0 = tsk->thread.saved_sp0; + tsk->thread.sp0 = vm86->saved_sp0; tsk->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, &tsk->thread); - tsk->thread.saved_sp0 = 0; + vm86->saved_sp0 = 0; put_cpu(); ret = KVM86->regs32; @@ -242,9 +244,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, { struct tss_struct *tss; struct task_struct *tsk = current; + struct vm86 *vm86 = tsk->thread.vm86; unsigned long err = 0; - if (tsk->thread.saved_sp0) + if (!vm86) { + if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL))) + return -ENOMEM; + tsk->thread.vm86 = vm86; + } + if (vm86->saved_sp0) return -EPERM; if (!access_ok(VERIFY_READ, v86, plus ? @@ -295,7 +303,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, } info->regs32 = current_pt_regs(); - tsk->thread.vm86_info = v86; + vm86->vm86_info = v86; /* * The flags register is also special: we cannot trust that the user @@ -311,16 +319,16 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, switch (info->cpu_type) { case CPU_286: - tsk->thread.v86mask = 0; + vm86->v86mask = 0; break; case CPU_386: - tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; case CPU_486: - tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; default: - tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; } @@ -328,7 +336,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ info->regs32->ax = VM86_SIGNAL; - tsk->thread.saved_sp0 = tsk->thread.sp0; + vm86->saved_sp0 = tsk->thread.sp0; lazy_save_gs(info->regs32->gs); tss = &per_cpu(cpu_tss, get_cpu()); @@ -338,7 +346,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, load_sp0(tss, &tsk->thread); put_cpu(); - tsk->thread.screen_bitmap = info->screen_bitmap; + vm86->screen_bitmap = info->screen_bitmap; if (info->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); @@ -408,7 +416,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs) static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) { - set_flags(VEFLAGS, flags, current->thread.v86mask); + set_flags(VEFLAGS, flags, current->thread.vm86->v86mask); set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & X86_EFLAGS_IF) set_IF(regs); @@ -418,7 +426,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) { - set_flags(VFLAGS, flags, current->thread.v86mask); + set_flags(VFLAGS, flags, current->thread.vm86->v86mask); set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & X86_EFLAGS_IF) set_IF(regs); @@ -433,7 +441,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) if (VEFLAGS & X86_EFLAGS_VIF) flags |= X86_EFLAGS_IF; flags |= X86_EFLAGS_IOPL; - return flags | (VEFLAGS & current->thread.v86mask); + return flags | (VEFLAGS & current->thread.vm86->v86mask); } static inline int is_revectored(int nr, struct revectored_struct *bitmap) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9dc9098..34a368d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -301,14 +301,16 @@ static inline void check_v8086_mode(struct pt_regs *regs, unsigned long address, struct task_struct *tsk) { +#ifdef CONFIG_VM86 unsigned long bit; - if (!v8086_mode(regs)) + if (!v8086_mode(regs) || !tsk->thread.vm86) return; bit = (address - 0xA0000) >> PAGE_SHIFT; if (bit < 32) - tsk->thread.screen_bitmap |= 1 << bit; + tsk->thread.vm86->screen_bitmap |= 1 << bit; +#endif } static bool low_pfn(unsigned long pfn) -- cgit v0.10.2 From d4ce0f26c790af8e829d3fad0a6787f40f98e24f Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 29 Jul 2015 01:41:17 -0400 Subject: x86/vm86: Move fields from 'struct kernel_vm86_struct' to 'struct vm86' Move the non-regs fields to the off-stack data. Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438148483-11932-3-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 20b43b7..47c7648 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -37,13 +37,7 @@ struct kernel_vm86_struct { * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' * in kernelspace, hence we need not reget the data from userspace. */ -#define VM86_TSS_ESP0 flags - unsigned long flags; - unsigned long screen_bitmap; - unsigned long cpu_type; - struct revectored_struct int_revectored; - struct revectored_struct int21_revectored; - struct vm86plus_info_struct vm86plus; +#define VM86_TSS_ESP0 regs32 struct pt_regs *regs32; /* here we save the pointer to the old regs */ /* * The below is not part of the structure, but the stack layout continues @@ -59,10 +53,16 @@ struct kernel_vm86_struct { struct vm86 { struct vm86plus_struct __user *vm86_info; - unsigned long screen_bitmap; unsigned long v86flags; unsigned long v86mask; unsigned long saved_sp0; + + unsigned long flags; + unsigned long screen_bitmap; + unsigned long cpu_type; + struct revectored_struct int_revectored; + struct revectored_struct int21_revectored; + struct vm86plus_info_struct vm86plus; }; #ifdef CONFIG_VM86 diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index bfa59b1..f71b4b9 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -68,7 +68,6 @@ #define KVM86 ((struct kernel_vm86_struct *)regs) -#define VMPI KVM86->vm86plus /* @@ -114,7 +113,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask); user = vm86->vm86_info; - if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ? + if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ? sizeof(struct vm86plus_struct) : sizeof(struct vm86_struct))) { pr_alert("could not access userspace vm86_info\n"); @@ -282,25 +281,27 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, get_user_ex(info->regs.fs, &v86->regs.fs); get_user_ex(info->regs.gs, &v86->regs.gs); - get_user_ex(info->flags, &v86->flags); - get_user_ex(info->screen_bitmap, &v86->screen_bitmap); - get_user_ex(info->cpu_type, &v86->cpu_type); + get_user_ex(vm86->flags, &v86->flags); + get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap); + get_user_ex(vm86->cpu_type, &v86->cpu_type); } get_user_catch(err); if (err) return err; - if (copy_from_user(&info->int_revectored, &v86->int_revectored, + if (copy_from_user(&vm86->int_revectored, &v86->int_revectored, sizeof(struct revectored_struct))) return -EFAULT; - if (copy_from_user(&info->int21_revectored, &v86->int21_revectored, + if (copy_from_user(&vm86->int21_revectored, &v86->int21_revectored, sizeof(struct revectored_struct))) return -EFAULT; if (plus) { - if (copy_from_user(&info->vm86plus, &v86->vm86plus, + if (copy_from_user(&vm86->vm86plus, &v86->vm86plus, sizeof(struct vm86plus_info_struct))) return -EFAULT; - info->vm86plus.is_vm86pus = 1; - } + vm86->vm86plus.is_vm86pus = 1; + } else + memset(&vm86->vm86plus, 0, + sizeof(struct vm86plus_info_struct)); info->regs32 = current_pt_regs(); vm86->vm86_info = v86; @@ -317,7 +318,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, info->regs.pt.orig_ax = info->regs32->orig_ax; - switch (info->cpu_type) { + switch (vm86->cpu_type) { case CPU_286: vm86->v86mask = 0; break; @@ -346,8 +347,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, load_sp0(tss, &tsk->thread); put_cpu(); - vm86->screen_bitmap = info->screen_bitmap; - if (info->flags & VM86_SCREEN_BITMAP) + if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); /*call __audit_syscall_exit since we do not exit via the normal paths */ @@ -539,12 +539,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i, { unsigned long __user *intr_ptr; unsigned long segoffs; + struct kernel_vm86_info *vm86 = current->thread.vm86; if (regs->pt.cs == BIOSSEG) goto cannot_handle; - if (is_revectored(i, &KVM86->int_revectored)) + if (is_revectored(i, &vm86->int_revectored)) goto cannot_handle; - if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored)) + if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored)) goto cannot_handle; intr_ptr = (unsigned long __user *) (i << 2); if (get_user(segoffs, intr_ptr)) @@ -568,7 +569,7 @@ cannot_handle: int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) { - if (VMPI.is_vm86pus) { + if (current->thread.vm86->vm86plus.is_vm86pus) { if ((trapno == 3) || (trapno == 1)) { KVM86->regs32->ax = VM86_TRAP + (trapno << 8); /* setting this flag forces the code in entry_32.S to @@ -595,12 +596,13 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) unsigned char __user *ssp; unsigned short ip, sp, orig_flags; int data32, pref_done; + struct vm86plus_info_struct *vmpi = ¤t->thread.vm86->vm86plus; #define CHECK_IF_IN_TRAP \ - if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \ + if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \ newflags |= X86_EFLAGS_TF #define VM86_FAULT_RETURN do { \ - if (VMPI.force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \ + if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \ return_to_32bit(regs, VM86_PICRETURN); \ if (orig_flags & X86_EFLAGS_TF) \ handle_vm86_trap(regs, 0, 1); \ @@ -670,8 +672,8 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) case 0xcd: { int intno = popb(csp, ip, simulate_sigsegv); IP(regs) = ip; - if (VMPI.vm86dbg_active) { - if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3]) + if (vmpi->vm86dbg_active) { + if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) return_to_32bit(regs, VM86_INTx + (intno << 8)); } do_int(regs, intno, ssp, sp); -- cgit v0.10.2 From 90c6085a248f8f964588617f51329688bcc9f2bc Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 29 Jul 2015 01:41:18 -0400 Subject: x86/vm86: Eliminate 'struct kernel_vm86_struct' Now there is no vm86-specific data left on the kernel stack while in userspace, except for the 32-bit regs. Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438148483-11932-4-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 47c7648..226d6c1 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -27,32 +27,9 @@ struct kernel_vm86_regs { unsigned short gs, __gsh; }; -struct kernel_vm86_struct { - struct kernel_vm86_regs regs; -/* - * the below part remains on the kernel stack while we are in VM86 mode. - * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we - * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above - * 'struct kernel_vm86_regs' with the then actual values. - * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct' - * in kernelspace, hence we need not reget the data from userspace. - */ -#define VM86_TSS_ESP0 regs32 - struct pt_regs *regs32; /* here we save the pointer to the old regs */ -/* - * The below is not part of the structure, but the stack layout continues - * this way. In front of 'return-eip' may be some data, depending on - * compilation, so we don't rely on this and save the pointer to 'oldregs' - * in 'regs32' above. - * However, with GCC-2.7.2 and the current CFLAGS you see exactly this: - - long return-eip; from call to vm86() - struct pt_regs oldregs; user space registers as saved by syscall - */ -}; - struct vm86 { struct vm86plus_struct __user *vm86_info; + struct pt_regs *regs32; unsigned long v86flags; unsigned long v86mask; unsigned long saved_sp0; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index f71b4b9..696ef76 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -67,9 +67,6 @@ */ -#define KVM86 ((struct kernel_vm86_struct *)regs) - - /* * 8- and 16-bit register defines.. */ @@ -152,7 +149,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) vm86->saved_sp0 = 0; put_cpu(); - ret = KVM86->regs32; + ret = vm86->regs32; lazy_load_gs(ret->gs); @@ -194,29 +191,16 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); -static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, - struct kernel_vm86_struct *info); +static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus); SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) { - struct kernel_vm86_struct info; /* declare this _on top_, - * this avoids wasting of stack space. - * This remains on the stack until we - * return to 32 bit user space. - */ - - return do_sys_vm86((struct vm86plus_struct __user *) v86, false, &info); + return do_sys_vm86((struct vm86plus_struct __user *) v86, false); } SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) { - struct kernel_vm86_struct info; /* declare this _on top_, - * this avoids wasting of stack space. - * This remains on the stack until we - * return to 32 bit user space. - */ - switch (cmd) { case VM86_REQUEST_IRQ: case VM86_FREE_IRQ: @@ -234,16 +218,17 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) } /* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */ - return do_sys_vm86((struct vm86plus_struct __user *) arg, true, &info); + return do_sys_vm86((struct vm86plus_struct __user *) arg, true); } -static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, - struct kernel_vm86_struct *info) +static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) { struct tss_struct *tss; struct task_struct *tsk = current; struct vm86 *vm86 = tsk->thread.vm86; + struct kernel_vm86_regs vm86regs; + struct pt_regs *regs32 = current_pt_regs(); unsigned long err = 0; if (!vm86) { @@ -259,27 +244,27 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, sizeof(struct vm86plus_struct))) return -EFAULT; - memset(info, 0, sizeof(*info)); + memset(&vm86regs, 0, sizeof(vm86regs)); get_user_try { unsigned short seg; - get_user_ex(info->regs.pt.bx, &v86->regs.ebx); - get_user_ex(info->regs.pt.cx, &v86->regs.ecx); - get_user_ex(info->regs.pt.dx, &v86->regs.edx); - get_user_ex(info->regs.pt.si, &v86->regs.esi); - get_user_ex(info->regs.pt.di, &v86->regs.edi); - get_user_ex(info->regs.pt.bp, &v86->regs.ebp); - get_user_ex(info->regs.pt.ax, &v86->regs.eax); - get_user_ex(info->regs.pt.ip, &v86->regs.eip); + get_user_ex(vm86regs.pt.bx, &v86->regs.ebx); + get_user_ex(vm86regs.pt.cx, &v86->regs.ecx); + get_user_ex(vm86regs.pt.dx, &v86->regs.edx); + get_user_ex(vm86regs.pt.si, &v86->regs.esi); + get_user_ex(vm86regs.pt.di, &v86->regs.edi); + get_user_ex(vm86regs.pt.bp, &v86->regs.ebp); + get_user_ex(vm86regs.pt.ax, &v86->regs.eax); + get_user_ex(vm86regs.pt.ip, &v86->regs.eip); get_user_ex(seg, &v86->regs.cs); - info->regs.pt.cs = seg; - get_user_ex(info->regs.pt.flags, &v86->regs.eflags); - get_user_ex(info->regs.pt.sp, &v86->regs.esp); + vm86regs.pt.cs = seg; + get_user_ex(vm86regs.pt.flags, &v86->regs.eflags); + get_user_ex(vm86regs.pt.sp, &v86->regs.esp); get_user_ex(seg, &v86->regs.ss); - info->regs.pt.ss = seg; - get_user_ex(info->regs.es, &v86->regs.es); - get_user_ex(info->regs.ds, &v86->regs.ds); - get_user_ex(info->regs.fs, &v86->regs.fs); - get_user_ex(info->regs.gs, &v86->regs.gs); + vm86regs.pt.ss = seg; + get_user_ex(vm86regs.es, &v86->regs.es); + get_user_ex(vm86regs.ds, &v86->regs.ds); + get_user_ex(vm86regs.fs, &v86->regs.fs); + get_user_ex(vm86regs.gs, &v86->regs.gs); get_user_ex(vm86->flags, &v86->flags); get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap); @@ -302,8 +287,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, } else memset(&vm86->vm86plus, 0, sizeof(struct vm86plus_info_struct)); - - info->regs32 = current_pt_regs(); + vm86->regs32 = regs32; vm86->vm86_info = v86; /* @@ -311,12 +295,12 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, * has set it up safely, so this makes sure interrupt etc flags are * inherited from protected mode. */ - VEFLAGS = info->regs.pt.flags; - info->regs.pt.flags &= SAFE_MASK; - info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK; - info->regs.pt.flags |= X86_VM_MASK; + VEFLAGS = vm86regs.pt.flags; + vm86regs.pt.flags &= SAFE_MASK; + vm86regs.pt.flags |= regs32->flags & ~SAFE_MASK; + vm86regs.pt.flags |= X86_VM_MASK; - info->regs.pt.orig_ax = info->regs32->orig_ax; + vm86regs.pt.orig_ax = regs32->orig_ax; switch (vm86->cpu_type) { case CPU_286: @@ -336,12 +320,13 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, /* * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ - info->regs32->ax = VM86_SIGNAL; + regs32->ax = VM86_SIGNAL; vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(info->regs32->gs); + lazy_save_gs(regs32->gs); tss = &per_cpu(cpu_tss, get_cpu()); - tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; + /* Set new sp0 right below 32-bit regs */ + tsk->thread.sp0 = (unsigned long) regs32; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); @@ -364,7 +349,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus, #endif "jmp resume_userspace" : /* no outputs */ - :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0)); + :"r" (&vm86regs), "r" (task_thread_info(tsk)), "r" (0)); unreachable(); /* we never return here */ } @@ -539,7 +524,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, { unsigned long __user *intr_ptr; unsigned long segoffs; - struct kernel_vm86_info *vm86 = current->thread.vm86; + struct vm86 *vm86 = current->thread.vm86; if (regs->pt.cs == BIOSSEG) goto cannot_handle; @@ -569,12 +554,14 @@ cannot_handle: int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) { - if (current->thread.vm86->vm86plus.is_vm86pus) { + struct vm86 *vm86 = current->thread.vm86; + + if (vm86->vm86plus.is_vm86pus) { if ((trapno == 3) || (trapno == 1)) { - KVM86->regs32->ax = VM86_TRAP + (trapno << 8); + vm86->regs32->ax = VM86_TRAP + (trapno << 8); /* setting this flag forces the code in entry_32.S to the path where we call save_v86_state() and change - the stack pointer to KVM86->regs32 */ + the stack pointer to regs32 */ set_thread_flag(TIF_NOTIFY_RESUME); return 0; } -- cgit v0.10.2 From 5ed92a8ab71f8865ba07811429c988c72299b315 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 29 Jul 2015 01:41:19 -0400 Subject: x86/vm86: Use the normal pt_regs area for vm86 Change to use the normal pt_regs area to enter and exit vm86 mode. This is done by increasing the padding at the top of the stack to make room for the extra vm86 segment slots in the IRET frame. It then saves the 32-bit regs in the off-stack vm86 data, and copies in the vm86 regs. Exiting back to 32-bit mode does the reverse. This allows removing the hacks to jump directly into the exit asm code due to having to change the stack pointer. Returning normally from the vm86 syscall and the exception handlers allows things like ptrace and auditing to work properly. Signed-off-by: Brian Gerst Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438148483-11932-5-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 21dc60a..f940e24 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -525,34 +525,12 @@ work_resched: work_notifysig: # deal with pending signals and # notify-resume requests -#ifdef CONFIG_VM86 - testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) - movl %esp, %eax - jnz work_notifysig_v86 # returning to kernel-space or - # vm86-space -1: -#else - movl %esp, %eax -#endif TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - movb PT_CS(%esp), %bl - andb $SEGMENT_RPL_MASK, %bl - cmpb $USER_RPL, %bl - jb resume_kernel + movl %esp, %eax xorl %edx, %edx call do_notify_resume jmp resume_userspace - -#ifdef CONFIG_VM86 - ALIGN -work_notifysig_v86: - pushl %ecx # save ti_flags for do_notify_resume - call save_v86_state # %eax contains pt_regs pointer - popl %ecx - movl %eax, %esp - jmp 1b -#endif END(work_pending) # perform syscall exit tracing diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 225ee54..fdad5c2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -27,14 +27,17 @@ * Without this offset, that can result in a page fault. (We are * careful that, in this case, the value we read doesn't matter.) * - * In vm86 mode, the hardware frame is much longer still, but we neither - * access the extra members from NMI context, nor do we write such a - * frame at sp0 at all. + * In vm86 mode, the hardware frame is much longer still, so add 16 + * bytes to make room for the real-mode segments. * * x86_64 has a fixed-length stack frame. */ #ifdef CONFIG_X86_32 -# define TOP_OF_KERNEL_STACK_PADDING 8 +# ifdef CONFIG_VM86 +# define TOP_OF_KERNEL_STACK_PADDING 16 +# else +# define TOP_OF_KERNEL_STACK_PADDING 8 +# endif #else # define TOP_OF_KERNEL_STACK_PADDING 0 #endif diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index 226d6c1..e45386e 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -29,7 +29,7 @@ struct kernel_vm86_regs { struct vm86 { struct vm86plus_struct __user *vm86_info; - struct pt_regs *regs32; + struct pt_regs regs32; unsigned long v86flags; unsigned long v86mask; unsigned long saved_sp0; @@ -46,7 +46,7 @@ struct vm86 { void handle_vm86_fault(struct kernel_vm86_regs *, long); int handle_vm86_trap(struct kernel_vm86_regs *, long, int); -struct pt_regs *save_v86_state(struct kernel_vm86_regs *); +void save_v86_state(struct kernel_vm86_regs *, int); struct task_struct; void release_vm86_irqs(struct task_struct *); @@ -69,6 +69,8 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c) return 0; } +static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { } + #define free_vm86(t) do { } while(0) #endif /* CONFIG_VM86 */ diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 7e88cc7..bfd736e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -635,6 +635,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) bool stepping, failed; struct fpu *fpu = ¤t->thread.fpu; + if (v8086_mode(regs)) + save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); + /* Are we from a system call? */ if (syscall_get_nr(current, regs) >= 0) { /* If so, check system call restarting.. */ diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 696ef76..ffe98ec 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -50,6 +50,7 @@ #include #include #include +#include /* * Known problems: @@ -87,10 +88,9 @@ #define SAFE_MASK (0xDD5) #define RETURN_MASK (0xDFF) -struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) +void save_v86_state(struct kernel_vm86_regs *regs, int retval) { struct tss_struct *tss; - struct pt_regs *ret; struct task_struct *tsk = current; struct vm86plus_struct __user *user; struct vm86 *vm86 = current->thread.vm86; @@ -149,11 +149,11 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) vm86->saved_sp0 = 0; put_cpu(); - ret = vm86->regs32; + memcpy(®s->pt, &vm86->regs32, sizeof(struct pt_regs)); - lazy_load_gs(ret->gs); + lazy_load_gs(vm86->regs32.gs); - return ret; + regs->pt.ax = retval; } static void mark_screen_rdonly(struct mm_struct *mm) @@ -228,7 +228,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) struct task_struct *tsk = current; struct vm86 *vm86 = tsk->thread.vm86; struct kernel_vm86_regs vm86regs; - struct pt_regs *regs32 = current_pt_regs(); + struct pt_regs *regs = current_pt_regs(); unsigned long err = 0; if (!vm86) { @@ -287,7 +287,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) } else memset(&vm86->vm86plus, 0, sizeof(struct vm86plus_info_struct)); - vm86->regs32 = regs32; + + memcpy(&vm86->regs32, regs, sizeof(struct pt_regs)); vm86->vm86_info = v86; /* @@ -297,10 +298,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) */ VEFLAGS = vm86regs.pt.flags; vm86regs.pt.flags &= SAFE_MASK; - vm86regs.pt.flags |= regs32->flags & ~SAFE_MASK; + vm86regs.pt.flags |= regs->flags & ~SAFE_MASK; vm86regs.pt.flags |= X86_VM_MASK; - vm86regs.pt.orig_ax = regs32->orig_ax; + vm86regs.pt.orig_ax = regs->orig_ax; switch (vm86->cpu_type) { case CPU_286: @@ -318,15 +319,14 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) } /* - * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) + * Save old state */ - regs32->ax = VM86_SIGNAL; vm86->saved_sp0 = tsk->thread.sp0; - lazy_save_gs(regs32->gs); + lazy_save_gs(vm86->regs32.gs); tss = &per_cpu(cpu_tss, get_cpu()); - /* Set new sp0 right below 32-bit regs */ - tsk->thread.sp0 = (unsigned long) regs32; + /* make room for real-mode segments */ + tsk->thread.sp0 += 16; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; load_sp0(tss, &tsk->thread); @@ -335,41 +335,14 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) if (vm86->flags & VM86_SCREEN_BITMAP) mark_screen_rdonly(tsk->mm); - /*call __audit_syscall_exit since we do not exit via the normal paths */ -#ifdef CONFIG_AUDITSYSCALL - if (unlikely(current->audit_context)) - __audit_syscall_exit(1, 0); -#endif - - __asm__ __volatile__( - "movl %0,%%esp\n\t" - "movl %1,%%ebp\n\t" -#ifdef CONFIG_X86_32_LAZY_GS - "mov %2, %%gs\n\t" -#endif - "jmp resume_userspace" - : /* no outputs */ - :"r" (&vm86regs), "r" (task_thread_info(tsk)), "r" (0)); - unreachable(); /* we never return here */ -} - -static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval) -{ - struct pt_regs *regs32; - - regs32 = save_v86_state(regs16); - regs32->ax = retval; - __asm__ __volatile__("movl %0,%%esp\n\t" - "movl %1,%%ebp\n\t" - "jmp resume_userspace" - : : "r" (regs32), "r" (current_thread_info())); + memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs)); + force_iret(); + return regs->ax; } static inline void set_IF(struct kernel_vm86_regs *regs) { VEFLAGS |= X86_EFLAGS_VIF; - if (VEFLAGS & X86_EFLAGS_VIP) - return_to_32bit(regs, VM86_STI); } static inline void clear_IF(struct kernel_vm86_regs *regs) @@ -549,7 +522,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i, return; cannot_handle: - return_to_32bit(regs, VM86_INTx + (i << 8)); + save_v86_state(regs, VM86_INTx + (i << 8)); } int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) @@ -558,11 +531,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) if (vm86->vm86plus.is_vm86pus) { if ((trapno == 3) || (trapno == 1)) { - vm86->regs32->ax = VM86_TRAP + (trapno << 8); - /* setting this flag forces the code in entry_32.S to - the path where we call save_v86_state() and change - the stack pointer to regs32 */ - set_thread_flag(TIF_NOTIFY_RESUME); + save_v86_state(regs, VM86_TRAP + (trapno << 8)); return 0; } do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); @@ -588,12 +557,6 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) #define CHECK_IF_IN_TRAP \ if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \ newflags |= X86_EFLAGS_TF -#define VM86_FAULT_RETURN do { \ - if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \ - return_to_32bit(regs, VM86_PICRETURN); \ - if (orig_flags & X86_EFLAGS_TF) \ - handle_vm86_trap(regs, 0, 1); \ - return; } while (0) orig_flags = *(unsigned short *)®s->pt.flags; @@ -632,7 +595,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) SP(regs) -= 2; } IP(regs) = ip; - VM86_FAULT_RETURN; + goto vm86_fault_return; /* popf */ case 0x9d: @@ -652,7 +615,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) else set_vflags_short(newflags, regs); - VM86_FAULT_RETURN; + goto check_vip; } /* int xx */ @@ -660,8 +623,10 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) int intno = popb(csp, ip, simulate_sigsegv); IP(regs) = ip; if (vmpi->vm86dbg_active) { - if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) - return_to_32bit(regs, VM86_INTx + (intno << 8)); + if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) { + save_v86_state(regs, VM86_INTx + (intno << 8)); + return; + } } do_int(regs, intno, ssp, sp); return; @@ -692,14 +657,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) } else { set_vflags_short(newflags, regs); } - VM86_FAULT_RETURN; + goto check_vip; } /* cli */ case 0xfa: IP(regs) = ip; clear_IF(regs); - VM86_FAULT_RETURN; + goto vm86_fault_return; /* sti */ /* @@ -711,12 +676,27 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code) case 0xfb: IP(regs) = ip; set_IF(regs); - VM86_FAULT_RETURN; + goto check_vip; default: - return_to_32bit(regs, VM86_UNKNOWN); + save_v86_state(regs, VM86_UNKNOWN); + } + + return; + +check_vip: + if (VEFLAGS & X86_EFLAGS_VIP) { + save_v86_state(regs, VM86_STI); + return; } +vm86_fault_return: + if (vmpi->force_return_for_pic && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) { + save_v86_state(regs, VM86_PICRETURN); + return; + } + if (orig_flags & X86_EFLAGS_TF) + handle_vm86_trap(regs, 0, X86_TRAP_DB); return; simulate_sigsegv: @@ -730,7 +710,7 @@ simulate_sigsegv: * should be a mixture of the two, but how do we * get the information? [KD] */ - return_to_32bit(regs, VM86_UNKNOWN); + save_v86_state(regs, VM86_UNKNOWN); } /* ---------------- vm86 special IRQ passing stuff ----------------- */ -- cgit v0.10.2 From af3e565a8542c4be699a0403b88fd6c691f5914f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 31 Jul 2015 10:59:20 +0200 Subject: x86/vm86: Move the vm86 IRQ definitions to vm86.h Move vm86 specific definitions from irq_vectors.h to vm86.h. Based on patch from Brian Gerst. Originally-from: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438148483-11932-6-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 4c2d2eb..6ca9fd6 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -117,16 +117,6 @@ #define FPU_IRQ 13 -#define FIRST_VM86_IRQ 3 -#define LAST_VM86_IRQ 15 - -#ifndef __ASSEMBLY__ -static inline int invalid_vm86_irq(int irq) -{ - return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; -} -#endif - /* * Size the maximum number of interrupts. * diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index e45386e..b063196 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -49,7 +49,6 @@ int handle_vm86_trap(struct kernel_vm86_regs *, long, int); void save_v86_state(struct kernel_vm86_regs *, int); struct task_struct; -void release_vm86_irqs(struct task_struct *); #define free_vm86(t) do { \ struct thread_struct *__t = (t); \ @@ -59,6 +58,20 @@ void release_vm86_irqs(struct task_struct *); } \ } while (0) +/* + * Support for VM86 programs to request interrupts for + * real mode hardware drivers: + */ +#define FIRST_VM86_IRQ 3 +#define LAST_VM86_IRQ 15 + +static inline int invalid_vm86_irq(int irq) +{ + return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ; +} + +void release_vm86_irqs(struct task_struct *); + #else #define handle_vm86_fault(a, b) -- cgit v0.10.2 From ba3e127ec105e790eeec4034d9769e018e4a1b54 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 29 Jul 2015 01:41:21 -0400 Subject: x86/vm86: Clean up vm86.h includes vm86.h was being implicitly included in alot of places via processor.h, which in turn got it from math_emu.h. Break that chain and explicitly include vm86.h in all files that need it. Also remove unused vm86 field from math_emu_info. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438148483-11932-7-git-send-email-brgerst@gmail.com [ Fixed build failure. ] Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h index 031f626..0d9b14f 100644 --- a/arch/x86/include/asm/math_emu.h +++ b/arch/x86/include/asm/math_emu.h @@ -2,7 +2,6 @@ #define _ASM_X86_MATH_EMU_H #include -#include /* This structure matches the layout of the data saved to the stack following a device-not-present interrupt, part of it saved @@ -10,9 +9,6 @@ */ struct math_emu_info { long ___orig_eip; - union { - struct pt_regs *regs; - struct kernel_vm86_regs *vm86; - }; + struct pt_regs *regs; }; #endif /* _ASM_X86_MATH_EMU_H */ diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 592a6a6..91dfcaf 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *); asmlinkage unsigned long sys_sigreturn(void); /* kernel/vm86_32.c */ +struct vm86_struct; asmlinkage long sys_vm86old(struct vm86_struct __user *); asmlinkage long sys_vm86(unsigned long, unsigned long); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f73c962..c13df2c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -53,6 +53,7 @@ #include #include #include +#include asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread"); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index bfd736e..07eb844 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -31,6 +31,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 8e65d8a..86a82ea 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -62,6 +62,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 #include diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index ffe98ec..0de1f66 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -51,6 +51,7 @@ #include #include #include +#include /* * Known problems: diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c index 6ef5e99..a2eefb1 100644 --- a/arch/x86/math-emu/get_address.c +++ b/arch/x86/math-emu/get_address.c @@ -21,6 +21,7 @@ #include #include +#include #include "fpu_system.h" #include "exception.h" diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 34a368d..eef44d9 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -20,6 +20,7 @@ #include /* kmemcheck_*(), ... */ #include /* VSYSCALL_ADDR */ #include /* emulate_vsyscall */ +#include /* struct vm86 */ #define CREATE_TRACE_POINTS #include diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c index f35ed53..d4cda5e 100644 --- a/drivers/scsi/dpt_i2o.c +++ b/drivers/scsi/dpt_i2o.c @@ -1924,6 +1924,9 @@ static void adpt_alpha_info(sysInfo_S* si) #endif #if defined __i386__ + +#include + static void adpt_i386_info(sysInfo_S* si) { // This is all the info we need for now -- cgit v0.10.2 From 1342635638cba9b7c8eac776da5e54390d14d313 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 29 Jul 2015 01:41:22 -0400 Subject: x86/vm86: Rename vm86->vm86_info to user_vm86 Make it clearer that this is the pointer to the userspace vm86 state area. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438148483-11932-8-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index b063196..c93ae73 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -28,7 +28,7 @@ struct kernel_vm86_regs { }; struct vm86 { - struct vm86plus_struct __user *vm86_info; + struct vm86plus_struct __user *user_vm86; struct pt_regs regs32; unsigned long v86flags; unsigned long v86mask; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 0de1f66..52aa33e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -104,17 +104,17 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) */ local_irq_enable(); - if (!vm86 || !vm86->vm86_info) { - pr_alert("no vm86_info: BAD\n"); + if (!vm86 || !vm86->user_vm86) { + pr_alert("no user_vm86: BAD\n"); do_exit(SIGSEGV); } set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask); - user = vm86->vm86_info; + user = vm86->user_vm86; if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ? sizeof(struct vm86plus_struct) : sizeof(struct vm86_struct))) { - pr_alert("could not access userspace vm86_info\n"); + pr_alert("could not access userspace vm86 info\n"); do_exit(SIGSEGV); } @@ -139,7 +139,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) put_user_ex(vm86->screen_bitmap, &user->screen_bitmap); } put_user_catch(err); if (err) { - pr_alert("could not access userspace vm86_info\n"); + pr_alert("could not access userspace vm86 info\n"); do_exit(SIGSEGV); } @@ -192,11 +192,11 @@ out: static int do_vm86_irq_handling(int subfunction, int irqnumber); -static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus); +static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus); -SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86) +SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86) { - return do_sys_vm86((struct vm86plus_struct __user *) v86, false); + return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false); } @@ -223,7 +223,7 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg) } -static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) +static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) { struct tss_struct *tss; struct task_struct *tsk = current; @@ -240,7 +240,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) if (vm86->saved_sp0) return -EPERM; - if (!access_ok(VERIFY_READ, v86, plus ? + if (!access_ok(VERIFY_READ, user_vm86, plus ? sizeof(struct vm86_struct) : sizeof(struct vm86plus_struct))) return -EFAULT; @@ -248,40 +248,42 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) memset(&vm86regs, 0, sizeof(vm86regs)); get_user_try { unsigned short seg; - get_user_ex(vm86regs.pt.bx, &v86->regs.ebx); - get_user_ex(vm86regs.pt.cx, &v86->regs.ecx); - get_user_ex(vm86regs.pt.dx, &v86->regs.edx); - get_user_ex(vm86regs.pt.si, &v86->regs.esi); - get_user_ex(vm86regs.pt.di, &v86->regs.edi); - get_user_ex(vm86regs.pt.bp, &v86->regs.ebp); - get_user_ex(vm86regs.pt.ax, &v86->regs.eax); - get_user_ex(vm86regs.pt.ip, &v86->regs.eip); - get_user_ex(seg, &v86->regs.cs); + get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx); + get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx); + get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx); + get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi); + get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi); + get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp); + get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax); + get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip); + get_user_ex(seg, &user_vm86->regs.cs); vm86regs.pt.cs = seg; - get_user_ex(vm86regs.pt.flags, &v86->regs.eflags); - get_user_ex(vm86regs.pt.sp, &v86->regs.esp); - get_user_ex(seg, &v86->regs.ss); + get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags); + get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp); + get_user_ex(seg, &user_vm86->regs.ss); vm86regs.pt.ss = seg; - get_user_ex(vm86regs.es, &v86->regs.es); - get_user_ex(vm86regs.ds, &v86->regs.ds); - get_user_ex(vm86regs.fs, &v86->regs.fs); - get_user_ex(vm86regs.gs, &v86->regs.gs); - - get_user_ex(vm86->flags, &v86->flags); - get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap); - get_user_ex(vm86->cpu_type, &v86->cpu_type); + get_user_ex(vm86regs.es, &user_vm86->regs.es); + get_user_ex(vm86regs.ds, &user_vm86->regs.ds); + get_user_ex(vm86regs.fs, &user_vm86->regs.fs); + get_user_ex(vm86regs.gs, &user_vm86->regs.gs); + + get_user_ex(vm86->flags, &user_vm86->flags); + get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap); + get_user_ex(vm86->cpu_type, &user_vm86->cpu_type); } get_user_catch(err); if (err) return err; - if (copy_from_user(&vm86->int_revectored, &v86->int_revectored, + if (copy_from_user(&vm86->int_revectored, + &user_vm86->int_revectored, sizeof(struct revectored_struct))) return -EFAULT; - if (copy_from_user(&vm86->int21_revectored, &v86->int21_revectored, + if (copy_from_user(&vm86->int21_revectored, + &user_vm86->int21_revectored, sizeof(struct revectored_struct))) return -EFAULT; if (plus) { - if (copy_from_user(&vm86->vm86plus, &v86->vm86plus, + if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus, sizeof(struct vm86plus_info_struct))) return -EFAULT; vm86->vm86plus.is_vm86pus = 1; @@ -290,7 +292,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus) sizeof(struct vm86plus_info_struct)); memcpy(&vm86->regs32, regs, sizeof(struct pt_regs)); - vm86->vm86_info = v86; + vm86->user_vm86 = user_vm86; /* * The flags register is also special: we cannot trust that the user -- cgit v0.10.2 From decd275e62d5eef4b947fab89652fa6afdadf2f2 Mon Sep 17 00:00:00 2001 From: Brian Gerst Date: Wed, 29 Jul 2015 01:41:23 -0400 Subject: x86/vm86: Rename vm86->v86flags and v86mask Rename v86flags to veflags, and v86mask to veflags_mask. Signed-off-by: Brian Gerst Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1438148483-11932-9-git-send-email-brgerst@gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h index c93ae73..1e491f3 100644 --- a/arch/x86/include/asm/vm86.h +++ b/arch/x86/include/asm/vm86.h @@ -30,8 +30,8 @@ struct kernel_vm86_regs { struct vm86 { struct vm86plus_struct __user *user_vm86; struct pt_regs regs32; - unsigned long v86flags; - unsigned long v86mask; + unsigned long veflags; + unsigned long veflags_mask; unsigned long saved_sp0; unsigned long flags; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 52aa33e..abd8b856 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -80,8 +80,8 @@ /* * virtual flags (16 and 32-bit versions) */ -#define VFLAGS (*(unsigned short *)&(current->thread.vm86->v86flags)) -#define VEFLAGS (current->thread.vm86->v86flags) +#define VFLAGS (*(unsigned short *)&(current->thread.vm86->veflags)) +#define VEFLAGS (current->thread.vm86->veflags) #define set_flags(X, new, mask) \ ((X) = ((X) & ~(mask)) | ((new) & (mask))) @@ -108,7 +108,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) pr_alert("no user_vm86: BAD\n"); do_exit(SIGSEGV); } - set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask); + set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ? @@ -308,16 +308,16 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) switch (vm86->cpu_type) { case CPU_286: - vm86->v86mask = 0; + vm86->veflags_mask = 0; break; case CPU_386: - vm86->v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; case CPU_486: - vm86->v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; default: - vm86->v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; + vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL; break; } @@ -377,7 +377,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs) static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs) { - set_flags(VEFLAGS, flags, current->thread.vm86->v86mask); + set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask); set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & X86_EFLAGS_IF) set_IF(regs); @@ -387,7 +387,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs) { - set_flags(VFLAGS, flags, current->thread.vm86->v86mask); + set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask); set_flags(regs->pt.flags, flags, SAFE_MASK); if (flags & X86_EFLAGS_IF) set_IF(regs); @@ -402,7 +402,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs) if (VEFLAGS & X86_EFLAGS_VIF) flags |= X86_EFLAGS_IF; flags |= X86_EFLAGS_IOPL; - return flags | (VEFLAGS & current->thread.vm86->v86mask); + return flags | (VEFLAGS & current->thread.vm86->veflags_mask); } static inline int is_revectored(int nr, struct revectored_struct *bitmap) -- cgit v0.10.2 From c5f69fde26d1581ee495f68bb9de4049c8168a04 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 31 Jul 2015 14:41:08 -0700 Subject: x86/entry/32: Remove 32-bit syscall audit optimizations The asm audit optimizations are ugly and obfuscate the code too much. Remove them. This will regress performance if syscall auditing is enabled on 32-bit kernels and SYSENTER is in use. If this becomes a problem, interested parties are encouraged to implement the equivalent of the 64-bit opportunistic SYSRET optimization. Alternatively, a case could be made that, on 32-bit kernels, a less messy asm audit optimization could be done. 32-bit kernels don't have the complicated partial register saving tricks that 64-bit kernels have, so the SYSENTER post-syscall path could just call the audit hooks directly. Any reimplementation of this ought to demonstrate that it only calls the audit hook once per syscall, though, which does not currently appear to be true. Someone would have to make the case that doing so would be better than implementing opportunistic SYSEXIT, though. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Eric Paris Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/212be39dd8c90b44c4b7bbc678128d6b88bdb9912.1438378274.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index f940e24..a3c307a 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -45,16 +45,6 @@ #include #include -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -# define sysenter_audit syscall_trace_entry -# define sysexit_audit syscall_exit_work -#endif - .section .entry.text, "ax" /* @@ -339,7 +329,7 @@ sysenter_past_esp: GET_THREAD_INFO(%ebp) testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) - jnz sysenter_audit + jnz syscall_trace_entry sysenter_do_call: cmpl $(NR_syscalls), %eax jae sysenter_badsys @@ -351,7 +341,7 @@ sysenter_after_call: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx - jnz sysexit_audit + jnz syscall_exit_work sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx @@ -362,40 +352,6 @@ sysenter_exit: PTGS_TO_GS ENABLE_INTERRUPTS_SYSEXIT -#ifdef CONFIG_AUDITSYSCALL -sysenter_audit: - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp) - jnz syscall_trace_entry - /* movl PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */ - movl PT_EBX(%esp), %edx /* ebx/a0: 2nd arg to audit */ - /* movl PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */ - pushl PT_ESI(%esp) /* a3: 5th arg */ - pushl PT_EDX+4(%esp) /* a2: 4th arg */ - call __audit_syscall_entry - popl %ecx /* get that remapped edx off the stack */ - popl %ecx /* get that remapped esi off the stack */ - movl PT_EAX(%esp), %eax /* reload syscall number */ - jmp sysenter_do_call - -sysexit_audit: - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - movl %eax, %edx /* second arg, syscall return value */ - cmpl $-MAX_ERRNO, %eax /* is it an error ? */ - setbe %al /* 1 if so, 0 if not */ - movzbl %al, %eax /* zero-extend that */ - call __audit_syscall_exit - DISABLE_INTERRUPTS(CLBR_ANY) - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jnz syscall_exit_work - movl PT_EAX(%esp), %eax /* reload syscall return value */ - jmp sysenter_exit -#endif - .pushsection .fixup, "ax" 2: movl $0, PT_FS(%esp) jmp 1b -- cgit v0.10.2 From 5d73fc70996d9de0d1b2fc87e62dc51153204eba Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 31 Jul 2015 14:41:09 -0700 Subject: x86/entry/32: Migrate to C exit path This removes the hybrid asm-and-C implementation of exit work. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Eric Paris Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/2baa438619ea6c027b40ec9fceacca52f09c74d09.1438378274.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a3c307a..b2909bf 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -256,14 +256,10 @@ ret_from_intr: ENTRY(resume_userspace) LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret + DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done on - # int/exception return? - jne work_pending + movl %esp, %eax + call prepare_exit_to_usermode jmp restore_all END(ret_from_exception) @@ -341,7 +337,7 @@ sysenter_after_call: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx - jnz syscall_exit_work + jnz syscall_exit_work_irqs_off sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx @@ -377,13 +373,7 @@ syscall_after_call: movl %eax, PT_EAX(%esp) # store the return value syscall_exit: LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx # current->work - jnz syscall_exit_work + jmp syscall_exit_work restore_all: TRACE_IRQS_IRET @@ -460,35 +450,6 @@ ldt_ss: #endif ENDPROC(entry_INT80_32) - # perform work that needs to be done immediately before resumption - ALIGN -work_pending: - testb $_TIF_NEED_RESCHED, %cl - jz work_notifysig -work_resched: - call schedule - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) # make sure we don't miss an interrupt - # setting need_resched or sigpending - # between sampling and the iret - TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - andl $_TIF_WORK_MASK, %ecx # is there any work to be done other - # than syscall tracing? - jz restore_all - testb $_TIF_NEED_RESCHED, %cl - jnz work_resched - -work_notifysig: # deal with pending signals and - # notify-resume requests - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - movl %esp, %eax - xorl %edx, %edx - call do_notify_resume - jmp resume_userspace -END(work_pending) - # perform syscall exit tracing ALIGN syscall_trace_entry: @@ -503,15 +464,14 @@ END(syscall_trace_entry) # perform syscall exit tracing ALIGN -syscall_exit_work: - testl $_TIF_WORK_SYSCALL_EXIT, %ecx - jz work_pending +syscall_exit_work_irqs_off: TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) # could let syscall_trace_leave() call - # schedule() instead + ENABLE_INTERRUPTS(CLBR_ANY) + +syscall_exit_work: movl %esp, %eax - call syscall_trace_leave - jmp resume_userspace + call syscall_return_slowpath + jmp restore_all END(syscall_exit_work) syscall_fault: -- cgit v0.10.2 From 88cd622f9299c4c9e61e978bb9ef9d7599769ed0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 31 Jul 2015 14:41:10 -0700 Subject: x86/entry: Remove do_notify_resume(), syscall_trace_leave(), and their TIF masks They are no longer used. Good riddance! Deleting the TIF_ macros is really nice. It was never clear why there were so many variants. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Eric Paris Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/22c61682f446628573dde0f1d573ab821677e06da.1438378274.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index a3e9c7f..80dcc92 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -207,37 +207,6 @@ long syscall_trace_enter(struct pt_regs *regs) return syscall_trace_enter_phase2(regs, arch, phase1_result); } -/* Deprecated. */ -void syscall_trace_leave(struct pt_regs *regs) -{ - bool step; - - /* - * We may come here right after calling schedule_user() - * or do_notify_resume(), in which case we can be in RCU - * user mode. - */ - user_exit(); - - audit_syscall_exit(regs); - - if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely(test_thread_flag(TIF_SINGLESTEP)) && - !test_thread_flag(TIF_SYSCALL_EMU); - if (step || test_thread_flag(TIF_SYSCALL_TRACE)) - tracehook_report_syscall_exit(regs, step); - - user_enter(); -} - static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) { unsigned long top_of_stack = @@ -347,29 +316,3 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) local_irq_disable(); prepare_exit_to_usermode(regs); } - -/* - * Deprecated notification of userspace execution resumption - * - triggered by the TIF_WORK_MASK flags - */ -__visible void -do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) -{ - user_exit(); - - if (thread_info_flags & _TIF_UPROBE) - uprobe_notify_resume(regs); - - /* deal with pending signal delivery */ - if (thread_info_flags & _TIF_SIGPENDING) - do_signal(regs); - - if (thread_info_flags & _TIF_NOTIFY_RESUME) { - clear_thread_flag(TIF_NOTIFY_RESUME); - tracehook_notify_resume(regs); - } - if (thread_info_flags & _TIF_USER_RETURN_NOTIFY) - fire_user_return_notifiers(); - - user_enter(); -} diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 5fabf13..6271281 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch, unsigned long phase1_result); extern long syscall_trace_enter(struct pt_regs *); -extern void syscall_trace_leave(struct pt_regs *); static inline unsigned long regs_return_value(struct pt_regs *regs) { diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index b42408b..c481be7 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -31,7 +31,6 @@ typedef sigset_t compat_sigset_t; #include #ifndef __ASSEMBLY__ extern void do_signal(struct pt_regs *regs); -extern void do_notify_resume(struct pt_regs *, void *, __u32); #define __ARCH_HAS_SA_RESTORER diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index fdad5c2..8afdc3e 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -143,27 +143,11 @@ struct thread_info { _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) -/* work to do in syscall_trace_leave() */ -#define _TIF_WORK_SYSCALL_EXIT \ - (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP | \ - _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ) - -/* work to do on interrupt/exception return */ -#define _TIF_WORK_MASK \ - (0x0000FFFF & \ - ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT| \ - _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU)) - /* work to do on any return to user space */ #define _TIF_ALLWORK_MASK \ ((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT | \ _TIF_NOHZ) -/* Only used for 64 bit */ -#define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | \ - _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE) - /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ (_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP) -- cgit v0.10.2 From 6b7e26547fad7ace3dcb27a5babd2317fb9d1e12 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 6 Aug 2015 14:45:45 -0700 Subject: x86/vdso: Emit a GNU hash Some dynamic loaders may be slightly faster if a GNU hash is available. Strangely, this seems to have no effect at all on the vdso size. This is unlikely to have any measurable effect on the time it takes to resolve vdso symbols (since there are so few of them). In some contexts, it can be a win for a different reason: if every DSO has a GNU hash section, then libc can avoid calculating SysV hashes at all. Both musl and glibc appear to have this optimization. It's plausible that this breaks some ancient glibc version. If so, then, depending on what glibc versions break, we could either require COMPAT_VDSO for them or consider reverting. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Isaac Dunham Cc: Linus Torvalds Cc: Nathan Lynch Cc: Peter Zijlstra Cc: Rich Felker Cc: Thomas Gleixner Cc: musl@lists.openwall.com Link: http://lkml.kernel.org/r/fd56cc057a2d62ab31c56a48d04fccb435b3fd4f.1438897382.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 96c0617..a3d0767 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -175,7 +175,7 @@ quiet_cmd_vdso = VDSO $@ -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' -VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \ $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) GCOV_PROFILE := n -- cgit v0.10.2 From 33f3df41d03879ab86c7f2d650e67b655e0b85c8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 17 Aug 2015 12:22:51 -0700 Subject: selftests/x86: Disable sigreturn_64 sigreturn_64 was broken by ed596cde9425 ("Revert x86 sigcontext cleanups"). Turn it off until we have a better fix. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a184e75ff170a0bcd76bf376c41cad2c402fe9f7.1439838962.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index b70da4a..986e7cb 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,8 +4,8 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs ldt_gdt -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt +TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) -- cgit v0.10.2 From a9c909ce8c7853b4fc16055c50eb50d91e20cb93 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 17 Aug 2015 12:22:52 -0700 Subject: selftests/x86: Add syscall_nt selftest I've had this sitting around for a while. Add it to the selftests tree. Far Cry running under Wine depends on this behavior. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ee4d63799a9e5294b70930618b71d04d2770eb2d.1439838962.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 986e7cb..29089b2 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,7 +4,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c new file mode 100644 index 0000000..60c06af4 --- /dev/null +++ b/tools/testing/selftests/x86/syscall_nt.c @@ -0,0 +1,54 @@ +/* + * syscall_nt.c - checks syscalls with NT set + * Copyright (c) 2014-2015 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * Some obscure user-space code requires the ability to make system calls + * with FLAGS.NT set. Make sure it works. + */ + +#include +#include +#include +#include + +#ifdef __x86_64__ +# define WIDTH "q" +#else +# define WIDTH "l" +#endif + +static unsigned long get_eflags(void) +{ + unsigned long eflags; + asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); + return eflags; +} + +static void set_eflags(unsigned long eflags) +{ + asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH + : : "rm" (eflags) : "flags"); +} + +int main() +{ + printf("[RUN]\tSet NT and issue a syscall\n"); + set_eflags(get_eflags() | X86_EFLAGS_NT); + syscall(SYS_getpid); + if (get_eflags() & X86_EFLAGS_NT) { + printf("[OK]\tThe syscall worked and NT is still set\n"); + return 0; + } else { + printf("[FAIL]\tThe syscall worked but NT was cleared\n"); + return 1; + } +} -- cgit v0.10.2 From 99770737ca7e3ebc14e66460a69b7032de9421e1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 21 Aug 2015 08:33:53 +0200 Subject: x86/asm/tsc: Add rdtscll() merge helper Some in-flight code makes use of the old rdtscll() (now removed), provide a wrapper for a kernel cycle to smooth the transition to rdtsc(). ( We use the safest variant, rdtsc_ordered(), which has barriers - this adds another incentive to remove the wrapper in the future. ) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Huang Rui Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Ralf Baechle Cc: Thomas Gleixner Cc: kvm ML Link: http://lkml.kernel.org/r/dddbf98a2af53312e9aa73a5a2b1622fe5d6f52b.1434501121.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 131eec2..54e9f08 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -152,6 +152,9 @@ static __always_inline unsigned long long rdtsc_ordered(void) return rdtsc(); } +/* Deprecated, keep it for a cycle for easier merging: */ +#define rdtscll(now) do { (now) = rdtsc_ordered(); } while (0) + static inline unsigned long long native_read_pmc(int counter) { DECLARE_ARGS(val, low, high); -- cgit v0.10.2 From f0a97af83f6287357dcc100c859ec0066f164f32 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 20 Aug 2015 22:03:21 -0700 Subject: x86/traps: Weaken context tracking entry assertions We were asserting that we were all the way in CONTEXT_KERNEL when exception handlers were called. While having this be true is, I think, a nice goal (or maybe a variant in which we assert that we're in CONTEXT_KERNEL or some new IRQ context), we're not quite there. In particular, if an IRQ interrupts the SYSCALL prologue and the IRQ handler in turn causes an exception, the exception entry will be called in RCU IRQ mode but with CONTEXT_USER. This is okay (nothing goes wrong), but until we fix up the SYSCALL prologue, we need to avoid warning. Signed-off-by: Andy Lutomirski Acked-by: Frederic Weisbecker Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Sasha Levin Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/c81faf3916346c0e04346c441392974f49cd7184.1440133286.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 86a82ea..45e8d98 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -112,7 +112,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) void ist_enter(struct pt_regs *regs) { if (user_mode(regs)) { - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); } else { /* * We might have interrupted pretty much anything. In @@ -282,7 +282,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str, { siginfo_t info; - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) != NOTIFY_STOP) { @@ -364,7 +364,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) const struct bndcsr *bndcsr; siginfo_t *info; - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); if (notify_die(DIE_TRAP, "bounds", regs, error_code, X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP) return; @@ -442,7 +442,7 @@ do_general_protection(struct pt_regs *regs, long error_code) { struct task_struct *tsk; - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); conditional_sti(regs); if (v8086_mode(regs)) { @@ -496,7 +496,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code) return; ist_enter(regs); - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP) @@ -729,14 +729,14 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); math_error(regs, error_code, X86_TRAP_MF); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); math_error(regs, error_code, X86_TRAP_XF); } @@ -749,7 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code) dotraplinkage void do_device_not_available(struct pt_regs *regs, long error_code) { - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); BUG_ON(use_eager_fpu()); #ifdef CONFIG_MATH_EMULATION @@ -775,7 +775,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) { siginfo_t info; - CT_WARN_ON(ct_state() != CONTEXT_KERNEL); + rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU"); local_irq_enable(); info.si_signo = SIGILL; -- cgit v0.10.2 From f96756746c7909de37db3d03ac5fd5cfb2757f38 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Mon, 10 Aug 2015 12:19:53 +0200 Subject: x86/asm: Add MONITORX/MWAITX instruction support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AMD Carrizo processors (Family 15h, Models 60h-6fh) added a new feature called MWAITX (MWAIT with extensions) as an extension to MONITOR/MWAIT. This new instruction controls a configurable timer which causes the core to exit wait state on timer expiration, in addition to "normal" MWAIT condition of reading from a monitored VA. Compared to MONITOR/MWAIT, there are minor differences in opcode and input parameters: MWAITX ECX[1]: enable timer if set MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks == TSC. The software P0 frequency is the same as the TSC frequency. MWAIT MWAITX opcode 0f 01 c9 | 0f 01 fb ECX[0] value of RFLAGS.IF seen by instruction ECX[1] unused/#GP if set | enable timer if set ECX[31:2] unused/#GP if set EAX unused (reserve for hint) EBX[31:0] unused | max wait time (SW P0 == TSC) MONITOR MONITORX opcode 0f 01 c8 | 0f 01 fa EAX (logical) address to monitor ECX #GP if not zero Max timeout = EBX/(TSC frequency) Signed-off-by: Huang Rui Signed-off-by: Borislav Petkov Cc: Aaron Lu Cc: Alexander Shishkin Cc: Andreas Herrmann Cc: Andy Lutomirski Cc: Dave Hansen Cc: Dirk Brandewie Cc: Fengguang Wu Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: John Stultz Cc: Josh Triplett Cc: Len Brown Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Ross Zwisler Cc: Thomas Gleixner Cc: Tony Li Link: http://lkml.kernel.org/r/1439201994-28067-3-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 3d6606f..a39e570 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -176,6 +176,7 @@ #define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ #define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ #define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ /* * Auxiliary flags: Linux defined - For features scattered in various diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h index 653dfa7..c70689b 100644 --- a/arch/x86/include/asm/mwait.h +++ b/arch/x86/include/asm/mwait.h @@ -14,6 +14,9 @@ #define CPUID5_ECX_INTERRUPT_BREAK 0x2 #define MWAIT_ECX_INTERRUPT_BREAK 0x1 +#define MWAITX_ECX_TIMER_ENABLE BIT(1) +#define MWAITX_MAX_LOOPS ((u32)-1) +#define MWAITX_DISABLE_CSTATES 0xf static inline void __monitor(const void *eax, unsigned long ecx, unsigned long edx) @@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx, :: "a" (eax), "c" (ecx), "d"(edx)); } +static inline void __monitorx(const void *eax, unsigned long ecx, + unsigned long edx) +{ + /* "monitorx %eax, %ecx, %edx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfa;" + :: "a" (eax), "c" (ecx), "d"(edx)); +} + static inline void __mwait(unsigned long eax, unsigned long ecx) { /* "mwait %eax, %ecx;" */ @@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx) :: "a" (eax), "c" (ecx)); } +/* + * MWAITX allows for a timer expiration to get the core out a wait state in + * addition to the default MWAIT exit condition of a store appearing at a + * monitored virtual address. + * + * Registers: + * + * MWAITX ECX[1]: enable timer if set + * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0 + * frequency is the same as the TSC frequency. + * + * Below is a comparison between MWAIT and MWAITX on AMD processors: + * + * MWAIT MWAITX + * opcode 0f 01 c9 | 0f 01 fb + * ECX[0] value of RFLAGS.IF seen by instruction + * ECX[1] unused/#GP if set | enable timer if set + * ECX[31:2] unused/#GP if set + * EAX unused (reserve for hint) + * EBX[31:0] unused | max wait time (P0 clocks) + * + * MONITOR MONITORX + * opcode 0f 01 c8 | 0f 01 fa + * EAX (logical) address to monitor + * ECX #GP if not zero + */ +static inline void __mwaitx(unsigned long eax, unsigned long ebx, + unsigned long ecx) +{ + /* "mwaitx %eax, %ebx, %ecx;" */ + asm volatile(".byte 0x0f, 0x01, 0xfb;" + :: "a" (eax), "b" (ebx), "c" (ecx)); +} + static inline void __sti_mwait(unsigned long eax, unsigned long ecx) { trace_hardirqs_on(); -- cgit v0.10.2 From b466bdb614823aaaa7188e85516177d2850f4782 Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Mon, 10 Aug 2015 12:19:54 +0200 Subject: x86/asm/delay: Introduce an MWAITX-based delay with a configurable timer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit MWAITX can enable a timer and a corresponding timer value specified in SW P0 clocks. The SW P0 frequency is the same as TSC. The timer provides an upper bound on how long the instruction waits before exiting. This way, a delay function in the kernel can leverage that MWAITX timer of MWAITX. When a CPU core executes MWAITX, it will be quiesced in a waiting phase, diminishing its power consumption. This way, we can save power in comparison to our default TSC-based delays. A simple test shows that: $ cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc $ sleep 10000s $ cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc Results: * TSC-based default delay: 485115 uWatts average power * MWAITX-based delay: 252738 uWatts average power Thus, that's about 240 milliWatts less power consumption. The test method relies on the support of AMD CPU accumulated power algorithm in fam15h_power for which patches are forthcoming. Suggested-by: Andy Lutomirski Suggested-by: Borislav Petkov Suggested-by: Peter Zijlstra Signed-off-by: Huang Rui [ Fix delay truncation. ] Signed-off-by: Borislav Petkov Cc: Aaron Lu Cc: Andreas Herrmann Cc: Aravind Gopalakrishnan Cc: Fengguang Wu Cc: Frédéric Weisbecker Cc: H. Peter Anvin Cc: Hector Marco-Gisbert Cc: Jacob Shin Cc: Jiri Olsa Cc: John Stultz Cc: Len Brown Cc: Linus Torvalds Cc: Paolo Bonzini Cc: Rafael J. Wysocki Cc: Thomas Gleixner Cc: Tony Li Link: http://lkml.kernel.org/r/1438744732-1459-3-git-send-email-ray.huang@amd.com Link: http://lkml.kernel.org/r/1439201994-28067-4-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h index 9b3b4f2..36a760b 100644 --- a/arch/x86/include/asm/delay.h +++ b/arch/x86/include/asm/delay.h @@ -4,5 +4,6 @@ #include void use_tsc_delay(void); +void use_mwaitx_delay(void); #endif /* _ASM_X86_DELAY_H */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 51ad2af..4a70fc6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -11,6 +11,7 @@ #include #include #include +#include #ifdef CONFIG_X86_64 # include @@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) /* A random value per boot for bit slice [12:upper_bit) */ va_align.bits = get_random_int() & va_align.mask; } + + if (cpu_has(c, X86_FEATURE_MWAITX)) + use_mwaitx_delay(); } static void early_init_amd(struct cpuinfo_x86 *c) diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c index 4453d52..e912b2f 100644 --- a/arch/x86/lib/delay.c +++ b/arch/x86/lib/delay.c @@ -20,6 +20,7 @@ #include #include #include +#include #ifdef CONFIG_SMP # include @@ -84,6 +85,44 @@ static void delay_tsc(unsigned long __loops) } /* + * On some AMD platforms, MWAITX has a configurable 32-bit timer, that + * counts with TSC frequency. The input value is the loop of the + * counter, it will exit when the timer expires. + */ +static void delay_mwaitx(unsigned long __loops) +{ + u64 start, end, delay, loops = __loops; + + start = rdtsc_ordered(); + + for (;;) { + delay = min_t(u64, MWAITX_MAX_LOOPS, loops); + + /* + * Use cpu_tss as a cacheline-aligned, seldomly + * accessed per-cpu variable as the monitor target. + */ + __monitorx(this_cpu_ptr(&cpu_tss), 0, 0); + + /* + * AMD, like Intel, supports the EAX hint and EAX=0xf + * means, do not enter any deep C-state and we use it + * here in delay() to minimize wakeup latency. + */ + __mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE); + + end = rdtsc_ordered(); + + if (loops <= end - start) + break; + + loops -= end - start; + + start = end; + } +} + +/* * Since we calibrate only once at boot, this * function should be set once at boot and not changed */ @@ -91,7 +130,13 @@ static void (*delay_fn)(unsigned long) = delay_loop; void use_tsc_delay(void) { - delay_fn = delay_tsc; + if (delay_fn == delay_loop) + delay_fn = delay_tsc; +} + +void use_mwaitx_delay(void) +{ + delay_fn = delay_mwaitx; } int read_current_timer(unsigned long *timer_val) -- cgit v0.10.2 From 47edb65178cb7056c2eea0b6c41a7d8c84547192 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 23 Jul 2015 12:14:40 -0700 Subject: x86/asm/msr: Make wrmsrl() a function As of cf991de2f614 ("x86/asm/msr: Make wrmsrl_safe() a function"), wrmsrl_safe is a function, but wrmsrl is still a macro. The wrmsrl macro performs invalid shifts if the value argument is 32 bits. This makes it unnecessarily awkward to write code that puts an unsigned long into an MSR. To make this work, syscall_init needs tweaking to stop passing a function pointer to wrmsrl. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Willy Tarreau Link: http://lkml.kernel.org/r/690f0c629a1085d054e2d1ef3da073cfb3f7db92.1437678821.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index 54e9f08..77d8b28 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -188,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high) #define rdmsrl(msr, val) \ ((val) = native_read_msr((msr))) -#define wrmsrl(msr, val) \ - native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32)) +static inline void wrmsrl(unsigned msr, u64 val) +{ + native_write_msr(msr, (u32)val, (u32)(val >> 32)); +} /* wrmsr with exception handling */ static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c2be037..10d0596 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -153,7 +153,11 @@ do { \ val = paravirt_read_msr(msr, &_err); \ } while (0) -#define wrmsrl(msr, val) wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32) +static inline void wrmsrl(unsigned msr, u64 val) +{ + wrmsr(msr, (u32)val, (u32)(val>>32)); +} + #define wrmsr_safe(msr, a, b) paravirt_write_msr(msr, a, b) /* rdmsr with exception handling */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cb9e5df..b128808 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1185,10 +1185,10 @@ void syscall_init(void) * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip. */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); - wrmsrl(MSR_LSTAR, entry_SYSCALL_64); + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION - wrmsrl(MSR_CSTAR, entry_SYSCALL_compat); + wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. @@ -1199,7 +1199,7 @@ void syscall_init(void) wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else - wrmsrl(MSR_CSTAR, ignore_sysret); + wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); -- cgit v0.10.2 From 7e01ebffffedec22cea86ebe94802f909e4579ca Mon Sep 17 00:00:00 2001 From: Huang Rui Date: Thu, 27 Aug 2015 18:04:04 +0800 Subject: x86/asm: Drop repeated macro of X86_EFLAGS_AC definition We just need one macro of X86_EFLAGS_AC_BIT and X86_EFLAGS_AC. Signed-off-by: Huang Rui Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Fengguang Wu Cc: Fenghua Yu Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Li Cc: Tony Luck Link: http://lkml.kernel.org/r/1440669844-21535-1-git-send-email-ray.huang@amd.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h index 180a0c3..79887ab 100644 --- a/arch/x86/include/uapi/asm/processor-flags.h +++ b/arch/x86/include/uapi/asm/processor-flags.h @@ -37,8 +37,6 @@ #define X86_EFLAGS_VM _BITUL(X86_EFLAGS_VM_BIT) #define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ #define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) -#define X86_EFLAGS_AC_BIT 18 /* Alignment Check/Access Control */ -#define X86_EFLAGS_AC _BITUL(X86_EFLAGS_AC_BIT) #define X86_EFLAGS_VIF_BIT 19 /* Virtual Interrupt Flag */ #define X86_EFLAGS_VIF _BITUL(X86_EFLAGS_VIF_BIT) #define X86_EFLAGS_VIP_BIT 20 /* Virtual Interrupt Pending */ -- cgit v0.10.2