From 5fdf5d37f40a3b18c0d613463867f71c017b75ef Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 19 Nov 2015 16:55:45 -0500 Subject: x86/xen: Avoid fast syscall path for Xen PV guests After 32-bit syscall rewrite, and specifically after commit: 5f310f739b4c ("x86/entry/32: Re-implement SYSENTER using the new C path") ... the stack frame that is passed to xen_sysexit is no longer a "standard" one (i.e. it's not pt_regs). Since we end up calling xen_iret from xen_sysexit we don't need to fix up the stack and instead follow entry_SYSENTER_32's IRET path directly to xen_iret. We can do the same thing for compat mode even though stack does not need to be fixed. This will allow us to drop usergs_sysret32 paravirt op (in the subsequent patch) Suggested-by: Andy Lutomirski Signed-off-by: Boris Ostrovsky Reviewed-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1447970147-1733-2-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 3eb572e..0870825 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -308,8 +308,9 @@ sysenter_past_esp: movl %esp, %eax call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV /* Opportunistic SYSEXIT */ TRACE_IRQS_ON /* User mode traces as IRQs on. */ diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index c320183..402e34a 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -121,8 +121,9 @@ sysenter_flags_fixed: movq %rsp, %rdi call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV jmp sysret32_from_system_call sysenter_fix_flags: @@ -200,8 +201,9 @@ ENTRY(entry_SYSCALL_compat) movq %rsp, %rdi call do_fast_syscall_32 - testl %eax, %eax - jz .Lsyscall_32_done + /* XEN PV guests always use IRET path */ + ALTERNATIVE "testl %eax, %eax; jz .Lsyscall_32_done", \ + "jmp .Lsyscall_32_done", X86_FEATURE_XENPV /* Opportunistic SYSRET */ sysret32_from_system_call: diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index e4f8010..f7ba9fb 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -216,6 +216,7 @@ #define X86_FEATURE_PAUSEFILTER ( 8*32+13) /* AMD filtered pause intercept */ #define X86_FEATURE_PFTHRESHOLD ( 8*32+14) /* AMD pause filter threshold */ #define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ +#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ /* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5774800..d315151 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1886,8 +1886,10 @@ EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); static void xen_set_cpu_features(struct cpuinfo_x86 *c) { - if (xen_pv_domain()) + if (xen_pv_domain()) { clear_cpu_bug(c, 
X86_BUG_SYSRET_SS_ATTRS); + set_cpu_cap(c, X86_FEATURE_XENPV); + } } const struct hypervisor_x86 x86_hyper_xen = { -- cgit v0.10.2 From 88c15ec90ff16880efab92b519436ee17b198477 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 19 Nov 2015 16:55:46 -0500 Subject: x86/paravirt: Remove the unused irq_enable_sysexit pv op As result of commit "x86/xen: Avoid fast syscall path for Xen PV guests", the irq_enable_sysexit pv op is not called by Xen PV guests anymore and since they were the only ones who used it we can safely remove it. Signed-off-by: Boris Ostrovsky Reviewed-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1447970147-1733-3-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 0870825..9870c97 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -329,7 +329,8 @@ sysenter_past_esp: * Return back to the vDSO, which will pop ecx and edx. * Don't bother with DS and ES (they already contain __USER_DS). */ - ENABLE_INTERRUPTS_SYSEXIT + sti + sysexit .pushsection .fixup, "ax" 2: movl $0, PT_FS(%esp) @@ -552,11 +553,6 @@ ENTRY(native_iret) iret _ASM_EXTABLE(native_iret, iret_exc) END(native_iret) - -ENTRY(native_irq_enable_sysexit) - sti - sysexit -END(native_irq_enable_sysexit) #endif ENTRY(overflow) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 10d0596..c28518e 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -932,13 +932,6 @@ extern void default_banner(void); push %ecx; push %edx; \ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ pop %edx; pop %ecx - -#define ENABLE_INTERRUPTS_SYSEXIT \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_irq_enable_sysexit), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_irq_enable_sysexit)) - - #else /* !CONFIG_X86_32 */ /* diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 31247b5..608bbf3 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -157,15 +157,6 @@ struct pv_cpu_ops { u64 (*read_pmc)(int counter); -#ifdef CONFIG_X86_32 - /* - * Atomically enable interrupts and return to userspace. This - * is only used in 32-bit kernels. 64-bit kernels use - * usergs_sysret32 instead. - */ - void (*irq_enable_sysexit)(void); -#endif - /* * Switch to usermode gs and return to 64-bit usermode using * sysret. 
Only used in 64-bit kernels to return to 64-bit diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 439df97..84a7524 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -65,9 +65,6 @@ void common(void) { OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable); OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable); OFFSET(PV_CPU_iret, pv_cpu_ops, iret); -#ifdef CONFIG_X86_32 - OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit); -#endif OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0); OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); #endif diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c2130ae..c55f437 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -162,9 +162,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || -#ifdef CONFIG_X86_32 - type == PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit) || -#endif type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ @@ -220,7 +217,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_irq_enable_sysexit(void); extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); @@ -379,9 +375,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, -#if defined(CONFIG_X86_32) - .irq_enable_sysexit = native_irq_enable_sysexit, -#endif #ifdef CONFIG_X86_64 #ifdef CONFIG_IA32_EMULATION .usergs_sysret32 = native_usergs_sysret32, diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index c89f50a..158dc06 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -5,7 +5,6 @@ DEF_NATIVE(pv_irq_ops, irq_enable, "sti"); DEF_NATIVE(pv_irq_ops, restore_fl, "push %eax; popf"); DEF_NATIVE(pv_irq_ops, save_fl, "pushf; pop %eax"); DEF_NATIVE(pv_cpu_ops, iret, "iret"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "sti; sysexit"); DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); @@ -46,7 +45,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, restore_fl); PATCH_SITE(pv_irq_ops, save_fl); PATCH_SITE(pv_cpu_ops, iret); - PATCH_SITE(pv_cpu_ops, irq_enable_sysexit); PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 8aa0558..17c00f8 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -13,7 +13,6 @@ DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); -DEF_NATIVE(pv_cpu_ops, irq_enable_sysexit, "swapgs; sti; sysexit"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index d315151..a068e36 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1229,10 +1229,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = { .iret = xen_iret, #ifdef CONFIG_X86_64 - .usergs_sysret32 = 
xen_sysret32, .usergs_sysret64 = xen_sysret64, -#else - .irq_enable_sysexit = xen_sysexit, #endif .load_tr_desc = paravirt_nop, diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S index fd92a64..feb6d40 100644 --- a/arch/x86/xen/xen-asm_32.S +++ b/arch/x86/xen/xen-asm_32.S @@ -35,20 +35,6 @@ check_events: ret /* - * We can't use sysexit directly, because we're not running in ring0. - * But we can easily fake it up using iret. Assuming xen_sysexit is - * jumped to with a standard stack frame, we can just strip it back to - * a standard iret frame and use iret. - */ -ENTRY(xen_sysexit) - movl PT_EAX(%esp), %eax /* Shouldn't be necessary? */ - orl $X86_EFLAGS_IF, PT_EFLAGS(%esp) - lea PT_EIP(%esp), %esp - - jmp xen_iret -ENDPROC(xen_sysexit) - -/* * This is run where a normal iret would be run, with the same stack setup: * 8: eflags * 4: cs diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 1399423..4140b07 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -139,9 +139,6 @@ DECL_ASM(void, xen_restore_fl_direct, unsigned long); /* These are not functions, and cannot be called normally */ __visible void xen_iret(void); -#ifdef CONFIG_X86_32 -__visible void xen_sysexit(void); -#endif __visible void xen_sysret32(void); __visible void xen_sysret64(void); __visible void xen_adjust_exception_frame(void); -- cgit v0.10.2 From 75ef82190dceac3d84cdc209fdf82800a7cc6609 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Thu, 19 Nov 2015 16:55:47 -0500 Subject: x86/entry, x86/paravirt: Remove the unused usergs_sysret32 PV op As result of commit "x86/xen: Avoid fast syscall path for Xen PV guests", usergs_sysret32 pv op is not called by Xen PV guests anymore and since they were the only ones who used it we can safely remove it. Signed-off-by: Boris Ostrovsky Reviewed-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: david.vrabel@citrix.com Cc: konrad.wilk@oracle.com Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1447970147-1733-4-git-send-email-boris.ostrovsky@oracle.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 402e34a..bbcb285 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -18,13 +18,6 @@ .section .entry.text, "ax" -#ifdef CONFIG_PARAVIRT -ENTRY(native_usergs_sysret32) - swapgs - sysretl -ENDPROC(native_usergs_sysret32) -#endif - /* * 32-bit SYSENTER instruction entry. 
* @@ -238,7 +231,8 @@ sysret32_from_system_call: xorq %r9, %r9 xorq %r10, %r10 movq RSP-ORIG_RAX(%rsp), %rsp - USERGS_SYSRET32 + swapgs + sysretl END(entry_SYSCALL_compat) /* diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c28518e..1b71c3a 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -922,11 +922,6 @@ extern void default_banner(void); call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) -#define USERGS_SYSRET32 \ - PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret32), \ - CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret32)) - #ifdef CONFIG_X86_32 #define GET_CR0_INTO_EAX \ push %ecx; push %edx; \ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 608bbf3..702c8bd 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -165,14 +165,6 @@ struct pv_cpu_ops { */ void (*usergs_sysret64)(void); - /* - * Switch to usermode gs and return to 32-bit usermode using - * sysret. Used to return to 32-on-64 compat processes. - * Other usermode register state, including %esp, must already - * be restored. - */ - void (*usergs_sysret32)(void); - /* Normal iret. Jump to this with the standard iret stack frame set up. */ void (*iret)(void); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index d8f42f9..f2edafb 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -23,7 +23,6 @@ int main(void) { #ifdef CONFIG_PARAVIRT OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); - OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); BLANK(); diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index c55f437..8c19b4d 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -162,7 +162,6 @@ unsigned paravirt_patch_default(u8 type, u16 clobbers, void *insnbuf, ret = paravirt_patch_ident_64(insnbuf, len); else if (type == PARAVIRT_PATCH(pv_cpu_ops.iret) || - type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret32) || type == PARAVIRT_PATCH(pv_cpu_ops.usergs_sysret64)) /* If operation requires a jmp, then jmp */ ret = paravirt_patch_jmp(insnbuf, opfunc, addr, len); @@ -217,7 +216,6 @@ static u64 native_steal_clock(int cpu) /* These are in entry.S */ extern void native_iret(void); -extern void native_usergs_sysret32(void); extern void native_usergs_sysret64(void); static struct resource reserve_ioports = { @@ -376,9 +374,6 @@ __visible struct pv_cpu_ops pv_cpu_ops = { .load_sp0 = native_load_sp0, #ifdef CONFIG_X86_64 -#ifdef CONFIG_IA32_EMULATION - .usergs_sysret32 = native_usergs_sysret32, -#endif .usergs_sysret64 = native_usergs_sysret64, #endif .iret = native_iret, diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index 17c00f8..e70087a 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -14,7 +14,6 @@ DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); -DEF_NATIVE(pv_cpu_ops, usergs_sysret32, "swapgs; sysretl"); DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); @@ -54,7 +53,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_irq_ops, save_fl); 
PATCH_SITE(pv_irq_ops, irq_enable); PATCH_SITE(pv_irq_ops, irq_disable); - PATCH_SITE(pv_cpu_ops, usergs_sysret32); PATCH_SITE(pv_cpu_ops, usergs_sysret64); PATCH_SITE(pv_cpu_ops, swapgs); PATCH_SITE(pv_mmu_ops, read_cr2); diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index f22667a..cc8acc4 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -68,25 +68,6 @@ ENTRY(xen_sysret64) ENDPATCH(xen_sysret64) RELOC(xen_sysret64, 1b+1) -ENTRY(xen_sysret32) - /* - * We're already on the usermode stack at this point, but - * still with the kernel gs, so we can easily switch back - */ - movq %rsp, PER_CPU_VAR(rsp_scratch) - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - - pushq $__USER32_DS - pushq PER_CPU_VAR(rsp_scratch) - pushq %r11 - pushq $__USER32_CS - pushq %rcx - - pushq $0 -1: jmp hypercall_iret -ENDPATCH(xen_sysret32) -RELOC(xen_sysret32, 1b+1) - /* * Xen handles syscall callbacks much like ordinary exceptions, which * means we have: -- cgit v0.10.2 From ed11a7f1b3bd482bd7d6ef7bc2859c41fb43b9ee Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:01 -0800 Subject: context_tracking: Switch to new static_branch API This is much less error-prone than the old code. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/812df7e64f120c5c7c08481f36a8caa9f53b2199.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index ee956c5..1d34fe6 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -22,12 +22,12 @@ struct context_tracking { }; #ifdef CONFIG_CONTEXT_TRACKING -extern struct static_key context_tracking_enabled; +extern struct static_key_false context_tracking_enabled; DECLARE_PER_CPU(struct context_tracking, context_tracking); static inline bool context_tracking_is_enabled(void) { - return static_key_false(&context_tracking_enabled); + return static_branch_unlikely(&context_tracking_enabled); } static inline bool context_tracking_cpu_is_enabled(void) diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index d8560ee..9ad37b9 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -24,7 +24,7 @@ #define CREATE_TRACE_POINTS #include -struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(context_tracking_enabled); EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); @@ -191,7 +191,7 @@ void __init context_tracking_cpu_set(int cpu) if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; - static_key_slow_inc(&context_tracking_enabled); + static_branch_inc(&context_tracking_enabled); } if (initialized) -- cgit v0.10.2 From c28454332fe0b65e22c3a2717e5bf05b5b47ca20 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:02 -0800 Subject: x86/asm: Error out if asm/jump_label.h is included inappropriately Rather than potentially generating incorrect code on a non-HAVE_JUMP_LABEL kernel if someone includes asm/jump_label.h, error out. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/99407f0ac7fa3ab03a3d31ce076d47b5c2f44795.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 5daeca3..96872dc 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -1,6 +1,19 @@ #ifndef _ASM_X86_JUMP_LABEL_H #define _ASM_X86_JUMP_LABEL_H +#ifndef HAVE_JUMP_LABEL +/* + * For better or for worse, if jump labels (the gcc extension) are missing, + * then the entire static branch patching infrastructure is compiled out. + * If that happens, the code in here will malfunction. Raise a compiler + * error instead. + * + * In theory, jump labels and the static branch patching infrastructure + * could be decoupled to fix this. + */ +#error asm/jump_label.h included on a non-jump-label kernel +#endif + #ifndef __ASSEMBLY__ #include -- cgit v0.10.2 From 2671c3e4fe2a34bd9bf2eecdf5d1149d4b55dbdf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:03 -0800 Subject: x86/asm: Add asm macros for static keys/jump labels Unfortunately, we can only do this if HAVE_JUMP_LABEL. In principle, we could do some serious surgery on the core jump label infrastructure to keep the patch infrastructure available on x86 on all builds, but that's probably not worth it. Implementing the macros using a conditional branch as a fallback seems like a bad idea: we'd have to clobber flags. This limitation can't cause silent failures -- trying to include asm/jump_label.h at all on a non-HAVE_JUMP_LABEL kernel will error out. The macro's users are responsible for handling this issue themselves. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/63aa45c4b692e8469e1876d6ccbb5da707972990.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h index 96872dc..adc54c1 100644 --- a/arch/x86/include/asm/jump_label.h +++ b/arch/x86/include/asm/jump_label.h @@ -14,13 +14,6 @@ #error asm/jump_label.h included on a non-jump-label kernel #endif -#ifndef __ASSEMBLY__ - -#include -#include -#include -#include - #define JUMP_LABEL_NOP_SIZE 5 #ifdef CONFIG_X86_64 @@ -29,6 +22,14 @@ # define STATIC_KEY_INIT_NOP GENERIC_NOP5_ATOMIC #endif +#include +#include + +#ifndef __ASSEMBLY__ + +#include +#include + static __always_inline bool arch_static_branch(struct static_key *key, bool branch) { asm_volatile_goto("1:" @@ -72,5 +73,40 @@ struct jump_entry { jump_label_t key; }; -#endif /* __ASSEMBLY__ */ +#else /* __ASSEMBLY__ */ + +.macro STATIC_JUMP_IF_TRUE target, key, def +.Lstatic_jump_\@: + .if \def + /* Equivalent to "jmp.d32 \target" */ + .byte 0xe9 + .long \target - .Lstatic_jump_after_\@ +.Lstatic_jump_after_\@: + .else + .byte STATIC_KEY_INIT_NOP + .endif + .pushsection __jump_table, "aw" + _ASM_ALIGN + _ASM_PTR .Lstatic_jump_\@, \target, \key + .popsection +.endm + +.macro STATIC_JUMP_IF_FALSE target, key, def +.Lstatic_jump_\@: + .if \def + .byte STATIC_KEY_INIT_NOP + .else + /* Equivalent to "jmp.d32 \target" */ + .byte 0xe9 + .long \target - .Lstatic_jump_after_\@ +.Lstatic_jump_after_\@: + .endif + .pushsection __jump_table, "aw" + _ASM_ALIGN + _ASM_PTR .Lstatic_jump_\@, \target, \key + 1 + .popsection +.endm + +#endif /* __ASSEMBLY__ */ + #endif -- cgit v0.10.2 From 478dc89cf316697e8029411a64ea2b30c528434d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 12 Nov 2015 12:59:04 -0800 Subject: x86/entry/64: Bypass enter_from_user_mode on non-context-tracking boots On CONFIG_CONTEXT_TRACKING kernels that have context tracking disabled at runtime (which includes most distro kernels), we still have the overhead of a call to enter_from_user_mode in interrupt and exception entries. If jump labels are available, this uses the jump label infrastructure to skip the call. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/73ee804fff48cd8c66b65b724f9f728a11a8c686.1447361906.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 3c71dd9..e32206e 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -1,3 +1,5 @@ +#include + /* x86 function call convention, 64-bit: @@ -232,3 +234,16 @@ For 32-bit we have the following conventions - kernel is built with #endif /* CONFIG_X86_64 */ +/* + * This does 'call enter_from_user_mode' unless we can avoid it based on + * kernel config or using the static jump infrastructure. 
+ */ +.macro CALL_enter_from_user_mode +#ifdef CONFIG_CONTEXT_TRACKING +#ifdef HAVE_JUMP_LABEL + STATIC_JUMP_IF_FALSE .Lafter_call_\@, context_tracking_enabled, def=0 +#endif + call enter_from_user_mode +.Lafter_call_\@: +#endif +.endm diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a55697d..9d34d3c 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -520,9 +520,7 @@ END(irq_entries_start) */ TRACE_IRQS_OFF -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif + CALL_enter_from_user_mode 1: /* @@ -1066,9 +1064,7 @@ ENTRY(error_entry) * (which can take locks). */ TRACE_IRQS_OFF -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif + CALL_enter_from_user_mode ret .Lerror_entry_done: -- cgit v0.10.2 From 677a73a9aa5433ea728200c26a7b3506d5eaa92b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:18 -0800 Subject: x86/kvm: On KVM re-enable (e.g. after suspend), update clocks This gets rid of the "did TSC go backwards" logic and just updates all clocks. It should work better (no more disabling of fast timing) and more reliably (all of the clocks are actually updated). Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/861716d768a1da6d1fd257b7972f8df13baf7f85.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 00462bd..6e32e87 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -123,8 +123,6 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -static bool __read_mostly backwards_tsc_observed = false; - #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1671,7 +1669,6 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched - && !backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) @@ -7366,88 +7363,22 @@ int kvm_arch_hardware_enable(void) struct kvm_vcpu *vcpu; int i; int ret; - u64 local_tsc; - u64 max_tsc = 0; - bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); ret = kvm_x86_ops->hardware_enable(); if (ret != 0) return ret; - local_tsc = rdtsc(); - stable = !check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { - if (!stable && vcpu->cpu == smp_processor_id()) + if (vcpu->cpu == smp_processor_id()) { kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - if (stable && vcpu->arch.last_host_tsc > local_tsc) { - backwards_tsc = true; - if (vcpu->arch.last_host_tsc > max_tsc) - max_tsc = vcpu->arch.last_host_tsc; + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, + vcpu); } } } - /* - * Sometimes, even reliable TSCs go backwards. This happens on - * platforms that reset TSC during suspend or hibernate actions, but - * maintain synchronization. We must compensate. Fortunately, we can - * detect that condition here, which happens early in CPU bringup, - * before any KVM threads can be running. 
Unfortunately, we can't - * bring the TSCs fully up to date with real time, as we aren't yet far - * enough into CPU bringup that we know how much real time has actually - * elapsed; our helper function, get_kernel_ns() will be using boot - * variables that haven't been updated yet. - * - * So we simply find the maximum observed TSC above, then record the - * adjustment to TSC in each VCPU. When the VCPU later gets loaded, - * the adjustment will be applied. Note that we accumulate - * adjustments, in case multiple suspend cycles happen before some VCPU - * gets a chance to run again. In the event that no KVM threads get a - * chance to run, we will miss the entire elapsed period, as we'll have - * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may - * loose cycle time. This isn't too big a deal, since the loss will be - * uniform across all VCPUs (not to mention the scenario is extremely - * unlikely). It is possible that a second hibernate recovery happens - * much faster than a first, causing the observed TSC here to be - * smaller; this would require additional padding adjustment, which is - * why we set last_host_tsc to the local tsc observed here. - * - * N.B. - this code below runs only on platforms with reliable TSC, - * as that is the only way backwards_tsc is set above. Also note - * that this runs for ALL vcpus, which is not a bug; all VCPUs should - * have the same delta_cyc adjustment applied if backwards_tsc - * is detected. Note further, this adjustment is only done once, - * as we reset last_host_tsc on all VCPUs to stop this from being - * called multiple times (one for each physical CPU bringup). - * - * Platforms with unreliable TSCs don't have to deal with this, they - * will be compensated by the logic in vcpu_load, which sets the TSC to - * catchup mode. This will catchup all VCPUs to real time, but cannot - * guarantee that they stay in perfect synchronization. - */ - if (backwards_tsc) { - u64 delta_cyc = max_tsc - local_tsc; - backwards_tsc_observed = true; - list_for_each_entry(kvm, &vm_list, vm_list) { - kvm_for_each_vcpu(i, vcpu, kvm) { - vcpu->arch.tsc_offset_adjustment += delta_cyc; - vcpu->arch.last_host_tsc = local_tsc; - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); - } - - /* - * We have to disable TSC offset matching.. if you were - * booting a VM while issuing an S4 host suspend.... - * you may have some problem. Solving this issue is - * left as an exercise to the reader. - */ - kvm->arch.last_tsc_nsec = 0; - kvm->arch.last_tsc_write = 0; - } - - } return 0; } -- cgit v0.10.2 From 6b078f5de7fc0851af4102493c7b5bb07e49c4cb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:19 -0800 Subject: x86, vdso, pvclock: Simplify and speed up the vdso pvclock reader The pvclock vdso code was too abstracted to understand easily and excessively paranoid. Simplify it for a huge speedup. This opens the door for additional simplifications, as the vdso no longer accesses the pvti for any vcpu other than vcpu 0. Before, vclock_gettime using kvm-clock took about 45ns on my machine. With this change, it takes 29ns, which is almost as fast as the pure TSC implementation. Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/6b51dcc41f1b101f963945c5ec7093d72bdac429.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index ca94fa6..c325ba1 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -78,47 +78,58 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) static notrace cycle_t vread_pvclock(int *mode) { - const struct pvclock_vsyscall_time_info *pvti; + const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti; cycle_t ret; - u64 last; - u32 version; - u8 flags; - unsigned cpu, cpu1; - + u64 tsc, pvti_tsc; + u64 last, delta, pvti_system_time; + u32 version, pvti_tsc_to_system_mul, pvti_tsc_shift; /* - * Note: hypervisor must guarantee that: - * 1. cpu ID number maps 1:1 to per-CPU pvclock time info. - * 2. that per-CPU pvclock time info is updated if the - * underlying CPU changes. - * 3. that version is increased whenever underlying CPU - * changes. + * Note: The kernel and hypervisor must guarantee that cpu ID + * number maps 1:1 to per-CPU pvclock time info. + * + * Because the hypervisor is entirely unaware of guest userspace + * preemption, it cannot guarantee that per-CPU pvclock time + * info is updated if the underlying CPU changes or that that + * version is increased whenever underlying CPU changes. * + * On KVM, we are guaranteed that pvti updates for any vCPU are + * atomic as seen by *all* vCPUs. This is an even stronger + * guarantee than we get with a normal seqlock. + * + * On Xen, we don't appear to have that guarantee, but Xen still + * supplies a valid seqlock using the version field. + + * We only do pvclock vdso timing at all if + * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to + * mean that all vCPUs have matching pvti and that the TSC is + * synced, so we can just look at vCPU 0's pvti. */ - do { - cpu = __getcpu() & VGETCPU_CPU_MASK; - /* TODO: We can put vcpu id into higher bits of pvti.version. - * This will save a couple of cycles by getting rid of - * __getcpu() calls (Gleb). - */ - - pvti = get_pvti(cpu); - - version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags); - - /* - * Test we're still on the cpu as well as the version. - * We could have been migrated just after the first - * vgetcpu but before fetching the version, so we - * wouldn't notice a version change. - */ - cpu1 = __getcpu() & VGETCPU_CPU_MASK; - } while (unlikely(cpu != cpu1 || - (pvti->pvti.version & 1) || - pvti->pvti.version != version)); - - if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT))) + + if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) { *mode = VCLOCK_NONE; + return 0; + } + + do { + version = pvti->version; + + /* This is also a read barrier, so we'll read version first. */ + tsc = rdtsc_ordered(); + + pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; + pvti_tsc_shift = pvti->tsc_shift; + pvti_system_time = pvti->system_time; + pvti_tsc = pvti->tsc_timestamp; + + /* Make sure that the version double-check is last. 
*/ + smp_rmb(); + } while (unlikely((version & 1) || version != pvti->version)); + + delta = tsc - pvti_tsc; + ret = pvti_system_time + + pvclock_scale_delta(delta, pvti_tsc_to_system_mul, + pvti_tsc_shift); /* refer to tsc.c read_tsc() comment for rationale */ last = gtod->cycle_last; -- cgit v0.10.2 From dac16fba6fc590fa7239676b35ed75dae4c4cd2b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:20 -0800 Subject: x86/vdso: Get pvclock data from the vvar VMA instead of the fixmap Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/9d37826fdc7e2d2809efe31d5345f97186859284.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index c325ba1..5dd363d 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -36,6 +36,11 @@ static notrace cycle_t vread_hpet(void) } #endif +#ifdef CONFIG_PARAVIRT_CLOCK +extern u8 pvclock_page + __attribute__((visibility("hidden"))); +#endif + #ifndef BUILD_VDSO32 #include @@ -62,23 +67,14 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) #ifdef CONFIG_PARAVIRT_CLOCK -static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu) +static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) { - const struct pvclock_vsyscall_time_info *pvti_base; - int idx = cpu / (PAGE_SIZE/PVTI_SIZE); - int offset = cpu % (PAGE_SIZE/PVTI_SIZE); - - BUG_ON(PVCLOCK_FIXMAP_BEGIN + idx > PVCLOCK_FIXMAP_END); - - pvti_base = (struct pvclock_vsyscall_time_info *) - __fix_to_virt(PVCLOCK_FIXMAP_BEGIN+idx); - - return &pvti_base[offset]; + return (const struct pvclock_vsyscall_time_info *)&pvclock_page; } static notrace cycle_t vread_pvclock(int *mode) { - const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti; + const struct pvclock_vcpu_time_info *pvti = &get_pvti0()->pvti; cycle_t ret; u64 tsc, pvti_tsc; u64 last, delta, pvti_system_time; diff --git a/arch/x86/entry/vdso/vdso-layout.lds.S b/arch/x86/entry/vdso/vdso-layout.lds.S index de2c921..4158acc 100644 --- a/arch/x86/entry/vdso/vdso-layout.lds.S +++ b/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ SECTIONS * segment. */ - vvar_start = . - 2 * PAGE_SIZE; + vvar_start = . - 3 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -36,6 +36,7 @@ SECTIONS #undef EMIT_VVAR hpet_page = vvar_start + PAGE_SIZE; + pvclock_page = vvar_start + 2 * PAGE_SIZE; . 
= SIZEOF_HEADERS; diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 785d992..491020b 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -73,6 +73,7 @@ enum { sym_vvar_start, sym_vvar_page, sym_hpet_page, + sym_pvclock_page, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -80,6 +81,7 @@ enum { const int special_pages[] = { sym_vvar_page, sym_hpet_page, + sym_pvclock_page, }; struct vdso_sym { @@ -91,6 +93,7 @@ struct vdso_sym required_syms[] = { [sym_vvar_start] = {"vvar_start", true}, [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, + [sym_pvclock_page] = {"pvclock_page", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 64df471..aa82819 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -100,6 +100,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) .name = "[vvar]", .pages = no_pages, }; + struct pvclock_vsyscall_time_info *pvti; if (calculate_addr) { addr = vdso_addr(current->mm->start_stack, @@ -169,6 +170,18 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) } #endif + pvti = pvclock_pvti_cpu0_va(); + if (pvti && image->sym_pvclock_page) { + ret = remap_pfn_range(vma, + text_start + image->sym_pvclock_page, + __pa(pvti) >> PAGE_SHIFT, + PAGE_SIZE, + PAGE_READONLY); + + if (ret) + goto up_fail; + } + up_fail: if (ret) current->mm->context.vdso = NULL; diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 7a6bed5..3864398 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,6 +4,15 @@ #include #include +#ifdef CONFIG_PARAVIRT_CLOCK +extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); +#else +static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return NULL; +} +#endif + /* some helper functions for xen and kvm pv clock sources */ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src); u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src); diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 756de91..deabaf9 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -22,6 +22,7 @@ struct vdso_image { long sym_vvar_page; long sym_hpet_page; + long sym_pvclock_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 2bd81e3..ec1b06d 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -45,6 +45,11 @@ early_param("no-kvmclock", parse_no_kvmclock); static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock wall_clock; +struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) +{ + return hv_clock; +} + /* * The wallclock is the time of day when we booted. Since then, some time may * have elapsed since the hypervisor wrote the data. So we try to account for -- cgit v0.10.2 From cc1e24fdb064d3126a494716f22ad4fc39306742 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:21 -0800 Subject: x86/vdso: Remove pvclock fixmap machinery Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/4933029991103ae44672c82b97a20035f5c1fe4f.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 5dd363d..59a98c2 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -45,7 +45,6 @@ extern u8 pvclock_page #include #include -#include #include notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index aa82819..b8f69e2 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index f80d700..6d7d0e5 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -19,7 +19,6 @@ #include #include #include -#include #ifdef CONFIG_X86_32 #include #include @@ -72,10 +71,6 @@ enum fixed_addresses { #ifdef CONFIG_X86_VSYSCALL_EMULATION VSYSCALL_PAGE = (FIXADDR_TOP - VSYSCALL_ADDR) >> PAGE_SHIFT, #endif -#ifdef CONFIG_PARAVIRT_CLOCK - PVCLOCK_FIXMAP_BEGIN, - PVCLOCK_FIXMAP_END = PVCLOCK_FIXMAP_BEGIN+PVCLOCK_VSYSCALL_NR_PAGES-1, -#endif #endif FIX_DBGP_BASE, FIX_EARLYCON_MEM_BASE, diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 3864398..66df22b 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -100,10 +100,5 @@ struct pvclock_vsyscall_time_info { } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define PVTI_SIZE sizeof(struct pvclock_vsyscall_time_info) -#define PVCLOCK_VSYSCALL_NR_PAGES (((NR_CPUS-1)/(PAGE_SIZE/PVTI_SIZE))+1) - -int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, - int size); -struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu); #endif /* _ASM_X86_PVCLOCK_H */ diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index ec1b06d..72cef58 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -310,7 +310,6 @@ int __init kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 int cpu; - int ret; u8 flags; struct pvclock_vcpu_time_info *vcpu_time; unsigned int size; @@ -330,11 +329,6 @@ int __init kvm_setup_vsyscall_timeinfo(void) return 1; } - if ((ret = pvclock_init_vsyscall(hv_clock, size))) { - put_cpu(); - return ret; - } - put_cpu(); kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d2..99bfc02 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -140,27 +140,3 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } - -#ifdef CONFIG_X86_64 -/* - * Initialize the generic pvclock vsyscall state. 
This will allocate - * a/some page(s) for the per-vcpu pvclock information, set up a - * fixmap mapping for the page(s) - */ - -int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, - int size) -{ - int idx; - - WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); - - for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { - __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, - __pa(i) + (idx*PAGE_SIZE), - PAGE_KERNEL_VVAR); - } - - return 0; -} -#endif -- cgit v0.10.2 From 76480a6a55a03d0fe5dd6290ccde7f78678ab85e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 10 Dec 2015 19:20:22 -0800 Subject: x86/vdso: Enable vdso pvclock access on all vdso variants Now that pvclock doesn't require access to the fixmap, all vdso variants can use it. The kernel side isn't wired up for 32-bit kernels yet, but this covers 32-bit and x32 userspace on 64-bit kernels. Signed-off-by: Andy Lutomirski Reviewed-by: Paolo Bonzini Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/a7ef693b7a4c88dd2173dc1d4bf6bc27023626eb.1449702533.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c index 59a98c2..8602f06 100644 --- a/arch/x86/entry/vdso/vclock_gettime.c +++ b/arch/x86/entry/vdso/vclock_gettime.c @@ -17,8 +17,10 @@ #include #include #include +#include #include #include +#include #define gtod (&VVAR(vsyscall_gtod_data)) @@ -43,10 +45,6 @@ extern u8 pvclock_page #ifndef BUILD_VDSO32 -#include -#include -#include - notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { long ret; @@ -64,8 +62,42 @@ notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) return ret; } -#ifdef CONFIG_PARAVIRT_CLOCK +#else + +notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) + : "memory", "edx"); + return ret; +} + +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm( + "mov %%ebx, %%edx \n" + "mov %2, %%ebx \n" + "call __kernel_vsyscall \n" + "mov %%edx, %%ebx \n" + : "=a" (ret) + : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) + : "memory", "edx"); + return ret; +} + +#endif + +#ifdef CONFIG_PARAVIRT_CLOCK static notrace const struct pvclock_vsyscall_time_info *get_pvti0(void) { return (const struct pvclock_vsyscall_time_info *)&pvclock_page; @@ -109,9 +141,9 @@ static notrace cycle_t vread_pvclock(int *mode) do { version = pvti->version; - /* This is also a read barrier, so we'll read version first. 
*/ - tsc = rdtsc_ordered(); + smp_rmb(); + tsc = rdtsc_ordered(); pvti_tsc_to_system_mul = pvti->tsc_to_system_mul; pvti_tsc_shift = pvti->tsc_shift; pvti_system_time = pvti->system_time; @@ -126,7 +158,7 @@ static notrace cycle_t vread_pvclock(int *mode) pvclock_scale_delta(delta, pvti_tsc_to_system_mul, pvti_tsc_shift); - /* refer to tsc.c read_tsc() comment for rationale */ + /* refer to vread_tsc() comment for rationale */ last = gtod->cycle_last; if (likely(ret >= last)) @@ -136,49 +168,6 @@ static notrace cycle_t vread_pvclock(int *mode) } #endif -#else - -notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_clock_gettime), "g" (clock), "c" (ts) - : "memory", "edx"); - return ret; -} - -notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) -{ - long ret; - - asm( - "mov %%ebx, %%edx \n" - "mov %2, %%ebx \n" - "call __kernel_vsyscall \n" - "mov %%edx, %%ebx \n" - : "=a" (ret) - : "0" (__NR_gettimeofday), "g" (tv), "c" (tz) - : "memory", "edx"); - return ret; -} - -#ifdef CONFIG_PARAVIRT_CLOCK - -static notrace cycle_t vread_pvclock(int *mode) -{ - *mode = VCLOCK_NONE; - return 0; -} -#endif - -#endif - notrace static cycle_t vread_tsc(void) { cycle_t ret = (cycle_t)rdtsc_ordered(); -- cgit v0.10.2 From d51953b0873358d13b189996e6976dfa12a9b59d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 11 Dec 2015 09:01:30 +0100 Subject: x86/platform/uv: Include clocksource.h for clocksource_touch_watchdog() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This build failure triggers on 64-bit allmodconfig: arch/x86/platform/uv/uv_nmi.c:493:2: error: implicit declaration of function ‘clocksource_touch_watchdog’ [-Werror=implicit-function-declaration] which is caused by recent changes exposing a missing clocksource.h include in uv_nmi.c: cc1e24fdb064 x86/vdso: Remove pvclock fixmap machinery this file got clocksource.h indirectly via fixmap.h - that stealth route of header inclusion is now gone. Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Thomas Gleixner Signed-off-by: Ingo Molnar diff --git a/arch/x86/platform/uv/uv_nmi.c b/arch/x86/platform/uv/uv_nmi.c index 327f21c..8dd8005 100644 --- a/arch/x86/platform/uv/uv_nmi.c +++ b/arch/x86/platform/uv/uv_nmi.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include -- cgit v0.10.2 From f74acf0e4326bfaa2c0be1e82f23801fe347cd9c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 12 Dec 2015 11:27:57 +0100 Subject: x86/entry/64_compat: Make labels local ... so that they don't appear as symbols in the final ELF. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1449916077-6506-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bbcb285..8d802a1 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -96,15 +96,15 @@ ENTRY(entry_SYSENTER_compat) * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. 
* - * NB.: sysenter_fix_flags is a label with the code under it moved + * NB.: .Lsysenter_fix_flags is a label with the code under it moved * out-of-line as an optimization: NT is unlikely to be set in the * majority of the cases and instead of polluting the I$ unnecessarily, * we're keeping that code behind a branch which will predict as * not-taken and therefore its instructions won't be fetched. */ testl $X86_EFLAGS_NT, EFLAGS(%rsp) - jnz sysenter_fix_flags -sysenter_flags_fixed: + jnz .Lsysenter_fix_flags +.Lsysenter_flags_fixed: /* * User mode is traced as though IRQs are on, and SYSENTER @@ -119,10 +119,10 @@ sysenter_flags_fixed: "jmp .Lsyscall_32_done", X86_FEATURE_XENPV jmp sysret32_from_system_call -sysenter_fix_flags: +.Lsysenter_fix_flags: pushq $X86_EFLAGS_FIXED popfq - jmp sysenter_flags_fixed + jmp .Lsysenter_flags_fixed ENDPROC(entry_SYSENTER_compat) /* -- cgit v0.10.2 From b92c453d520ebf0703f8195e9f2c6e7522b85e1d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 23 Dec 2015 12:35:21 +0100 Subject: Revert "x86/kvm: On KVM re-enable (e.g. after suspend), update clocks" This reverts commit 677a73a9aa54. This patch was not meant to be merged and has issues. Revert it. Requested-by: Andy Lutomirski Cc: Borislav Petkov Signed-off-by: Thomas Gleixner diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 6e32e87..00462bd 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -123,6 +123,8 @@ module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); +static bool __read_mostly backwards_tsc_observed = false; + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -1669,6 +1671,7 @@ static void pvclock_update_vm_gtod_copy(struct kvm *kvm) &ka->master_cycle_now); ka->use_master_clock = host_tsc_clocksource && vcpus_matched + && !backwards_tsc_observed && !ka->boot_vcpu_runs_old_kvmclock; if (ka->use_master_clock) @@ -7363,22 +7366,88 @@ int kvm_arch_hardware_enable(void) struct kvm_vcpu *vcpu; int i; int ret; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); ret = kvm_x86_ops->hardware_enable(); if (ret != 0) return ret; + local_tsc = rdtsc(); + stable = !check_tsc_unstable(); list_for_each_entry(kvm, &vm_list, vm_list) { kvm_for_each_vcpu(i, vcpu, kvm) { - if (vcpu->cpu == smp_processor_id()) { + if (!stable && vcpu->cpu == smp_processor_id()) kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, - vcpu); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; } } } + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. 
Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * loose cycle time. This isn't too big a deal, since the loss will be + * uniform across all VCPUs (not to mention the scenario is extremely + * unlikely). It is possible that a second hibernate recovery happens + * much faster than a first, causing the observed TSC here to be + * smaller; this would require additional padding adjustment, which is + * why we set last_host_tsc to the local tsc observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc on all VCPUs to stop this from being + * called multiple times (one for each physical CPU bringup). + * + * Platforms with unreliable TSCs don't have to deal with this, they + * will be compensated by the logic in vcpu_load, which sets the TSC to + * catchup mode. This will catchup all VCPUs to real time, but cannot + * guarantee that they stay in perfect synchronization. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + backwards_tsc_observed = true; + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); + } + + /* + * We have to disable TSC offset matching.. if you were + * booting a VM while issuing an S4 host suspend.... + * you may have some problem. Solving this issue is + * left as an exercise to the reader. + */ + kvm->arch.last_tsc_nsec = 0; + kvm->arch.last_tsc_write = 0; + } + + } return 0; } -- cgit v0.10.2 From 8705d603edd49f1cff165cd3b7998f4c7f098d27 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 29 Dec 2015 20:12:18 -0800 Subject: x86/vsdo: Fix build on PARAVIRT_CLOCK=y, KVM_GUEST=n arch/x86/built-in.o: In function `arch_setup_additional_pages': (.text+0x587): undefined reference to `pvclock_pvti_cpu0_va' KVM_GUEST selects PARAVIRT_CLOCK, so we can make pvclock_pvti_cpu0_va depend on KVM_GUEST. Signed-off-by: Andy Lutomirski Tested-by: Borislav Petkov Cc: Oleg Nesterov Cc: Kees Cook Link: http://lkml.kernel.org/r/444d38a9bcba832685740ea1401b569861d09a72.1451446564.git.luto@kernel.org Signed-off-by: Thomas Gleixner diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h index 66df22b..fdcc040 100644 --- a/arch/x86/include/asm/pvclock.h +++ b/arch/x86/include/asm/pvclock.h @@ -4,7 +4,7 @@ #include #include -#ifdef CONFIG_PARAVIRT_CLOCK +#ifdef CONFIG_KVM_GUEST extern struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void); #else static inline struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void) -- cgit v0.10.2