From c25be94f2870bf75552a41ad8b15f756e19ffb1d Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Wed, 16 Sep 2015 20:23:29 +0200 Subject: x86/asm/entry/32, selftests: Add 'test_syscall_vdso' test This new test checks that all x86 registers are preserved across 32-bit syscalls. It tests syscalls through VDSO (if available) and through INT 0x80, normally and under ptrace. If kernel is a 64-bit one, high registers (r8..r15) are poisoned before the syscall is called and are checked afterwards. They must be either preserved, or cleared to zero (but r11 is special); r12..15 must be preserved for INT 0x80. EFLAGS is checked for changes too, but change there is not considered to be a bug (paravirt kernels do not preserve arithmetic flags). Run-tested on 64-bit kernel: $ ./test_syscall_vdso_32 [RUN] Executing 6-argument 32-bit syscall via VDSO [OK] Arguments are preserved across syscall [NOTE] R11 has changed:0000000000200ed7 - assuming clobbered by SYSRET insn [OK] R8..R15 did not leak kernel data [RUN] Executing 6-argument 32-bit syscall via INT 80 [OK] Arguments are preserved across syscall [OK] R8..R15 did not leak kernel data [RUN] Running tests under ptrace [RUN] Executing 6-argument 32-bit syscall via VDSO [OK] Arguments are preserved across syscall [OK] R8..R15 did not leak kernel data [RUN] Executing 6-argument 32-bit syscall via INT 80 [OK] Arguments are preserved across syscall [OK] R8..R15 did not leak kernel data On 32-bit paravirt kernel: $ ./test_syscall_vdso_32 [NOTE] Not a 64-bit kernel, won't test R8..R15 leaks [RUN] Executing 6-argument 32-bit syscall via VDSO [WARN] Flags before=0000000000200ed7 id 0 00 o d i s z 0 a 0 p 1 c [WARN] Flags after=0000000000200246 id 0 00 i z 0 0 p 1 [WARN] Flags change=0000000000000c91 0 00 o d s 0 a 0 0 c [OK] Arguments are preserved across syscall [RUN] Executing 6-argument 32-bit syscall via INT 80 [OK] Arguments are preserved across syscall [RUN] Running tests under ptrace [RUN] Executing 6-argument 32-bit syscall via VDSO [OK] Arguments are preserved across syscall [RUN] Executing 6-argument 32-bit syscall via INT 80 [OK] Arguments are preserved across syscall Signed-off-by: Denys Vlasenko Acked-by: Andy Lutomirski Cc: Alexei Starovoitov Cc: Borislav Petkov Cc: Brian Gerst Cc: Frederic Weisbecker Cc: H. Peter Anvin Cc: Kees Cook Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Shuah Khan Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Will Drewry Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1442427809-2027-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 29089b2..fd55bc3 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -5,7 +5,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn +TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) @@ -60,3 +60,4 @@ endif # Some tests have additional dependencies. sysret_ss_attrs_64: thunks.S +test_syscall_vdso_32: thunks_32.S diff --git a/tools/testing/selftests/x86/test_syscall_vdso.c b/tools/testing/selftests/x86/test_syscall_vdso.c new file mode 100644 index 0000000..4037035 --- /dev/null +++ b/tools/testing/selftests/x86/test_syscall_vdso.c @@ -0,0 +1,401 @@ +/* + * 32-bit syscall ABI conformance test. + * + * Copyright (c) 2015 Denys Vlasenko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ +/* + * Can be built statically: + * gcc -Os -Wall -static -m32 test_syscall_vdso.c thunks_32.S + */ +#undef _GNU_SOURCE +#define _GNU_SOURCE 1 +#undef __USE_GNU +#define __USE_GNU 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(__i386__) +int main(int argc, char **argv, char **envp) +{ + printf("[SKIP]\tNot a 32-bit x86 userspace\n"); + return 0; +} +#else + +long syscall_addr; +long get_syscall(char **envp) +{ + Elf32_auxv_t *auxv; + while (*envp++ != NULL) + continue; + for (auxv = (void *)envp; auxv->a_type != AT_NULL; auxv++) + if (auxv->a_type == AT_SYSINFO) + return auxv->a_un.a_val; + printf("[WARN]\tAT_SYSINFO not supplied\n"); + return 0; +} + +asm ( + " .pushsection .text\n" + " .global int80\n" + "int80:\n" + " int $0x80\n" + " ret\n" + " .popsection\n" +); +extern char int80; + +struct regs64 { + uint64_t rax, rbx, rcx, rdx; + uint64_t rsi, rdi, rbp, rsp; + uint64_t r8, r9, r10, r11; + uint64_t r12, r13, r14, r15; +}; +struct regs64 regs64; +int kernel_is_64bit; + +asm ( + " .pushsection .text\n" + " .code64\n" + "get_regs64:\n" + " push %rax\n" + " mov $regs64, %eax\n" + " pop 0*8(%rax)\n" + " movq %rbx, 1*8(%rax)\n" + " movq %rcx, 2*8(%rax)\n" + " movq %rdx, 3*8(%rax)\n" + " movq %rsi, 4*8(%rax)\n" + " movq %rdi, 5*8(%rax)\n" + " movq %rbp, 6*8(%rax)\n" + " movq %rsp, 7*8(%rax)\n" + " movq %r8, 8*8(%rax)\n" + " movq %r9, 9*8(%rax)\n" + " movq %r10, 10*8(%rax)\n" + " movq %r11, 11*8(%rax)\n" + " movq %r12, 12*8(%rax)\n" + " movq %r13, 13*8(%rax)\n" + " movq %r14, 14*8(%rax)\n" + " movq %r15, 15*8(%rax)\n" + " ret\n" + "poison_regs64:\n" + " movq $0x7f7f7f7f, %r8\n" + " shl $32, %r8\n" + " orq $0x7f7f7f7f, %r8\n" + " movq %r8, %r9\n" + " movq %r8, %r10\n" + " movq %r8, %r11\n" + " movq %r8, %r12\n" + " movq %r8, %r13\n" + " movq %r8, %r14\n" + " movq %r8, %r15\n" + " ret\n" + " .code32\n" + " .popsection\n" +); +extern void get_regs64(void); +extern void poison_regs64(void); +extern unsigned long call64_from_32(void (*function)(void)); +void print_regs64(void) +{ + if (!kernel_is_64bit) + return; + printf("ax:%016llx bx:%016llx cx:%016llx dx:%016llx\n", regs64.rax, regs64.rbx, regs64.rcx, regs64.rdx); + printf("si:%016llx di:%016llx bp:%016llx sp:%016llx\n", regs64.rsi, regs64.rdi, regs64.rbp, regs64.rsp); + printf(" 8:%016llx 9:%016llx 10:%016llx 11:%016llx\n", regs64.r8 , regs64.r9 , regs64.r10, regs64.r11); + printf("12:%016llx 13:%016llx 14:%016llx 15:%016llx\n", regs64.r12, regs64.r13, regs64.r14, regs64.r15); +} + +int check_regs64(void) +{ + int err = 0; + int num = 8; + uint64_t *r64 = ®s64.r8; + + if (!kernel_is_64bit) + return 0; + + do { + if (*r64 == 0x7f7f7f7f7f7f7f7fULL) + continue; /* register did not change */ + if (syscall_addr != (long)&int80) { + /* + * Non-INT80 syscall entrypoints are allowed to clobber R8+ regs: + * either clear them to 0, or for R11, load EFLAGS. + */ + if (*r64 == 0) + continue; + if (num == 11) { + printf("[NOTE]\tR11 has changed:%016llx - assuming clobbered by SYSRET insn\n", *r64); + continue; + } + } else { + /* INT80 syscall entrypoint can be used by + * 64-bit programs too, unlike SYSCALL/SYSENTER. + * Therefore it must preserve R12+ + * (they are callee-saved registers in 64-bit C ABI). + * + * This was probably historically not intended, + * but R8..11 are clobbered (cleared to 0). + * IOW: they are the only registers which aren't + * preserved across INT80 syscall. + */ + if (*r64 == 0 && num <= 11) + continue; + } + printf("[FAIL]\tR%d has changed:%016llx\n", num, *r64); + err++; + } while (r64++, ++num < 16); + + if (!err) + printf("[OK]\tR8..R15 did not leak kernel data\n"); + return err; +} + +int nfds; +fd_set rfds; +fd_set wfds; +fd_set efds; +struct timespec timeout; +sigset_t sigmask; +struct { + sigset_t *sp; + int sz; +} sigmask_desc; + +void prep_args() +{ + nfds = 42; + FD_ZERO(&rfds); + FD_ZERO(&wfds); + FD_ZERO(&efds); + FD_SET(0, &rfds); + FD_SET(1, &wfds); + FD_SET(2, &efds); + timeout.tv_sec = 0; + timeout.tv_nsec = 123; + sigemptyset(&sigmask); + sigaddset(&sigmask, SIGINT); + sigaddset(&sigmask, SIGUSR2); + sigaddset(&sigmask, SIGRTMAX); + sigmask_desc.sp = &sigmask; + sigmask_desc.sz = 8; /* bytes */ +} + +static void print_flags(const char *name, unsigned long r) +{ + static const char *bitarray[] = { + "\n" ,"c\n" ,/* Carry Flag */ + "0 " ,"1 " ,/* Bit 1 - always on */ + "" ,"p " ,/* Parity Flag */ + "0 " ,"3? " , + "" ,"a " ,/* Auxiliary carry Flag */ + "0 " ,"5? " , + "" ,"z " ,/* Zero Flag */ + "" ,"s " ,/* Sign Flag */ + "" ,"t " ,/* Trap Flag */ + "" ,"i " ,/* Interrupt Flag */ + "" ,"d " ,/* Direction Flag */ + "" ,"o " ,/* Overflow Flag */ + "0 " ,"1 " ,/* I/O Privilege Level (2 bits) */ + "0" ,"1" ,/* I/O Privilege Level (2 bits) */ + "" ,"n " ,/* Nested Task */ + "0 " ,"15? ", + "" ,"r " ,/* Resume Flag */ + "" ,"v " ,/* Virtual Mode */ + "" ,"ac " ,/* Alignment Check/Access Control */ + "" ,"vif ",/* Virtual Interrupt Flag */ + "" ,"vip ",/* Virtual Interrupt Pending */ + "" ,"id " ,/* CPUID detection */ + NULL + }; + const char **bitstr; + int bit; + + printf("%s=%016lx ", name, r); + bitstr = bitarray + 42; + bit = 21; + if ((r >> 22) != 0) + printf("(extra bits are set) "); + do { + if (bitstr[(r >> bit) & 1][0]) + fputs(bitstr[(r >> bit) & 1], stdout); + bitstr -= 2; + bit--; + } while (bit >= 0); +} + +int run_syscall(void) +{ + long flags, bad_arg; + + prep_args(); + + if (kernel_is_64bit) + call64_from_32(poison_regs64); + /*print_regs64();*/ + + asm("\n" + /* Try 6-arg syscall: pselect. It should return quickly */ + " push %%ebp\n" + " mov $308, %%eax\n" /* PSELECT */ + " mov nfds, %%ebx\n" /* ebx arg1 */ + " mov $rfds, %%ecx\n" /* ecx arg2 */ + " mov $wfds, %%edx\n" /* edx arg3 */ + " mov $efds, %%esi\n" /* esi arg4 */ + " mov $timeout, %%edi\n" /* edi arg5 */ + " mov $sigmask_desc, %%ebp\n" /* %ebp arg6 */ + " push $0x200ed7\n" /* set almost all flags */ + " popf\n" /* except TF, IOPL, NT, RF, VM, AC, VIF, VIP */ + " call *syscall_addr\n" + /* Check that registers are not clobbered */ + " pushf\n" + " pop %%eax\n" + " cld\n" + " cmp nfds, %%ebx\n" /* ebx arg1 */ + " mov $1, %%ebx\n" + " jne 1f\n" + " cmp $rfds, %%ecx\n" /* ecx arg2 */ + " mov $2, %%ebx\n" + " jne 1f\n" + " cmp $wfds, %%edx\n" /* edx arg3 */ + " mov $3, %%ebx\n" + " jne 1f\n" + " cmp $efds, %%esi\n" /* esi arg4 */ + " mov $4, %%ebx\n" + " jne 1f\n" + " cmp $timeout, %%edi\n" /* edi arg5 */ + " mov $5, %%ebx\n" + " jne 1f\n" + " cmpl $sigmask_desc, %%ebp\n" /* %ebp arg6 */ + " mov $6, %%ebx\n" + " jne 1f\n" + " mov $0, %%ebx\n" + "1:\n" + " pop %%ebp\n" + : "=a" (flags), "=b" (bad_arg) + : + : "cx", "dx", "si", "di" + ); + + if (kernel_is_64bit) { + memset(®s64, 0x77, sizeof(regs64)); + call64_from_32(get_regs64); + /*print_regs64();*/ + } + + /* + * On paravirt kernels, flags are not preserved across syscalls. + * Thus, we do not consider it a bug if some are changed. + * We just show ones which do. + */ + if ((0x200ed7 ^ flags) != 0) { + print_flags("[WARN]\tFlags before", 0x200ed7); + print_flags("[WARN]\tFlags after", flags); + print_flags("[WARN]\tFlags change", (0x200ed7 ^ flags)); + } + + if (bad_arg) { + printf("[FAIL]\targ#%ld clobbered\n", bad_arg); + return 1; + } + printf("[OK]\tArguments are preserved across syscall\n"); + + return check_regs64(); +} + +int run_syscall_twice() +{ + int exitcode = 0; + long sv; + + if (syscall_addr) { + printf("[RUN]\tExecuting 6-argument 32-bit syscall via VDSO\n"); + exitcode = run_syscall(); + } + sv = syscall_addr; + syscall_addr = (long)&int80; + printf("[RUN]\tExecuting 6-argument 32-bit syscall via INT 80\n"); + exitcode += run_syscall(); + syscall_addr = sv; + return exitcode; +} + +void ptrace_me() +{ + pid_t pid; + + fflush(NULL); + pid = fork(); + if (pid < 0) + exit(1); + if (pid == 0) { + /* child */ + if (ptrace(PTRACE_TRACEME, 0L, 0L, 0L) != 0) + exit(0); + raise(SIGSTOP); + return; + } + /* parent */ + printf("[RUN]\tRunning tests under ptrace\n"); + while (1) { + int status; + pid = waitpid(-1, &status, __WALL); + if (WIFEXITED(status)) + exit(WEXITSTATUS(status)); + if (WIFSIGNALED(status)) + exit(WTERMSIG(status)); + if (pid <= 0 || !WIFSTOPPED(status)) /* paranoia */ + exit(255); + /* + * Note: we do not inject sig = WSTOPSIG(status). + * We probably should, but careful: do not inject SIGTRAP + * generated by syscall entry/exit stops. + * That kills the child. + */ + ptrace(PTRACE_SYSCALL, pid, 0L, 0L /*sig*/); + } +} + +int main(int argc, char **argv, char **envp) +{ + int exitcode = 0; + int cs; + + asm("\n" + " movl %%cs, %%eax\n" + : "=a" (cs) + ); + kernel_is_64bit = (cs == 0x23); + if (!kernel_is_64bit) + printf("[NOTE]\tNot a 64-bit kernel, won't test R8..R15 leaks\n"); + + /* This only works for non-static builds: + * syscall_addr = dlsym(dlopen("linux-gate.so.1", RTLD_NOW), "__kernel_vsyscall"); + */ + syscall_addr = get_syscall(envp); + + exitcode += run_syscall_twice(); + ptrace_me(); + exitcode += run_syscall_twice(); + + return exitcode; +} +#endif diff --git a/tools/testing/selftests/x86/thunks_32.S b/tools/testing/selftests/x86/thunks_32.S new file mode 100644 index 0000000..29b644b --- /dev/null +++ b/tools/testing/selftests/x86/thunks_32.S @@ -0,0 +1,55 @@ +/* + * thunks_32.S - assembly helpers for mixed-bitness code + * Copyright (c) 2015 Denys Vlasenko + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * These are little helpers that make it easier to switch bitness on + * the fly. + */ + + .text + .code32 + + .global call64_from_32 + .type call32_from_64, @function + + // 4(%esp): function to call +call64_from_32: + // Fetch function address + mov 4(%esp), %eax + + // Save registers which are callee-clobbered by 64-bit ABI + push %ecx + push %edx + push %esi + push %edi + + // Switch to long mode + jmp $0x33,$1f +1: .code64 + + // Call the function + call *%rax + + // Switch to compatibility mode + push $0x23 /* USER32_CS */ + .code32; push $1f; .code64 /* hack: can't have X86_64_32S relocation in 32-bit ELF */ + lretq +1: .code32 + + pop %edi + pop %esi + pop %edx + pop %ecx + + ret + +.size call64_from_32, .-call64_from_32 -- cgit v0.10.2 From 3dc33bd30f3e1c1bcaaafa3482737694debf0f0b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 12 Aug 2015 17:55:19 -0700 Subject: x86/entry/vsyscall: Add CONFIG to control default Most modern systems can run with vsyscall=none. In an effort to provide a way for build-time defaults to lack legacy settings, this adds a new CONFIG to select the type of vsyscall mapping to use, similar to the existing "vsyscall" command line parameter. Signed-off-by: Kees Cook Acked-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Triplett Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150813005519.GA11696@www.outflux.net Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 328c835..9bfb9e1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2042,6 +2042,55 @@ config COMPAT_VDSO If unsure, say N: if you are compiling your own kernel, you are unlikely to be using a buggy version of glibc. +choice + prompt "vsyscall table for legacy applications" + depends on X86_64 + default LEGACY_VSYSCALL_EMULATE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in + kernel space. Since this location is not randomized with ASLR, + it can be used to assist security vulnerability exploitation. + + This setting can be changed at boot time via the kernel command + line parameter vsyscall=[native|emulate|none]. + + On a system with recent enough glibc (2.14 or newer) and no + static binaries, you can say None without a performance penalty + to improve security. + + If unsure, select "Emulate". + + config LEGACY_VSYSCALL_NATIVE + bool "Native" + help + Actual executable code is located in the fixed vsyscall + address mapping, implementing time() efficiently. Since + this makes the mapping executable, it can be used during + security vulnerability exploitation (traditionally as + ROP gadgets). This configuration is not recommended. + + config LEGACY_VSYSCALL_EMULATE + bool "Emulate" + help + The kernel traps and emulates calls into the fixed + vsyscall address mapping. This makes the mapping + non-executable, but it still contains known contents, + which could be used in certain rare security vulnerability + exploits. This configuration is recommended when userspace + still uses the vsyscall area. + + config LEGACY_VSYSCALL_NONE + bool "None" + help + There will be no vsyscall mapping at all. This will + eliminate any risk of ASLR bypass due to the vsyscall + fixed address mapping. Attempts to use the vsyscalls + will be reported to dmesg, so that either old or + malicious userspace programs can be identified. + +endchoice + config CMDLINE_BOOL bool "Built-in kernel command line" ---help--- diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index b160c0c..76e0fd3 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -38,7 +38,14 @@ #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = +#ifdef CONFIG_LEGACY_VSYSCALL_NATIVE + NATIVE; +#elif CONFIG_LEGACY_VSYSCALL_NONE + NONE; +#else + EMULATE; +#endif static int __init vsyscall_setup(char *str) { -- cgit v0.10.2 From 93f13a9f96771a064c716364aebc6e283b186eb8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 21 Sep 2015 09:48:29 +0200 Subject: x86/entry/vsyscall: Fix undefined symbol warning Commit: 3dc33bd30f3e1 ("x86/entry/vsyscall: Add CONFIG to control default") did the ifdef/elif thing but GCC doesn't like that: arch/x86/entry/vsyscall/vsyscall_64.c:44:7: warning: "CONFIG_LEGACY_VSYSCALL_NONE" is not defined [-Wundef] #elif CONFIG_LEGACY_VSYSCALL_NONE ^ Use defined() instead. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Triplett Cc: Kees Cook Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20150921074829.GA3550@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 76e0fd3..174c254 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -39,9 +39,9 @@ #include "vsyscall_trace.h" static enum { EMULATE, NATIVE, NONE } vsyscall_mode = -#ifdef CONFIG_LEGACY_VSYSCALL_NATIVE +#if defined(CONFIG_LEGACY_VSYSCALL_NATIVE) NATIVE; -#elif CONFIG_LEGACY_VSYSCALL_NONE +#elif defined(CONFIG_LEGACY_VSYSCALL_NONE) NONE; #else EMULATE; -- cgit v0.10.2 From 3f2c5085ed99b6ad233cf77009c2f4f898b2f7c8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 1 Sep 2015 15:41:06 -0700 Subject: x86/sched/64: Don't save flags on context switch (reinstated) This reinstates the following commit: 2c7577a75837 ("sched/x86_64: Don't save flags on context switch") which was reverted in: 512255a2ad2c ("Revert 'sched/x86_64: Don't save flags on context switch'") Historically, Linux has always saved and restored EFLAGS across context switches. As far as I know, the only reason to do this is because of the NT flag. In particular, if something calls switch_to() with the NT flag set, then we don't want to leak the NT flag into a different task that might try to IRET and fail because NT is set. Before this commit: 8c7aa698baca ("x86_64, entry: Filter RFLAGS.NT on entry from userspace") we could run system call bodies with NT set. This would be a DoS or possibly privilege escalation hole if scheduling in such a system call would leak NT into a different task. Importantly, we don't need to worry about NT being set while preemptible or across page faults. The only way we can schedule due to preemption or a page fault is in an interrupt entry that nests inside the SYSENTER prologue. The CPU will clear NT when entering through an interrupt gate, so we won't schedule with NT set. The only other interesting flags are IOPL and AC. Allowing switch_to() to change IOPL has no effect, as the value loaded during kernel execution doesn't matter at all except between a SYSENTER entry and the subsequent PUSHF, and anythign that interrupts in that window will restore IOPL on return. If we call __switch_to() with AC set, we have bigger problems. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d4440fdc2a89247bffb7c003d2a9a2952bd46827.1441146105.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h index d7f3b3b..751bf4b 100644 --- a/arch/x86/include/asm/switch_to.h +++ b/arch/x86/include/asm/switch_to.h @@ -79,12 +79,12 @@ do { \ #else /* CONFIG_X86_32 */ /* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" +#define SAVE_CONTEXT "pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp\t" #define __EXTRA_CLOBBER \ , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" + "r12", "r13", "r14", "r15", "flags" #ifdef CONFIG_CC_STACKPROTECTOR #define __switch_canary \ @@ -100,7 +100,11 @@ do { \ #define __switch_canary_iparam #endif /* CC_STACKPROTECTOR */ -/* Save restore flags to clear handle leaking NT */ +/* + * There is no need to save or restore flags, because flags are always + * clean in kernel mode, with the possible exception of IOPL. Kernel IOPL + * has no effect. + */ #define switch_to(prev, next, last) \ asm volatile(SAVE_CONTEXT \ "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ -- cgit v0.10.2 From 0b101e62afe626ecae60173f92f1e0ec72151653 Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Thu, 24 Sep 2015 14:02:29 +0200 Subject: x86/asm: Force inlining of cpu_relax() On x86, cpu_relax() simply calls rep_nop(), which generates one instruction, PAUSE (aka REP NOP). With this config: http://busybox.net/~vda/kernel_config_OPTIMIZE_INLINING_and_Os gcc-4.7.2 does not always inline rep_nop(): it generates several copies of this: (16 copies, 194 calls): 55 push %rbp 48 89 e5 mov %rsp,%rbp f3 90 pause 5d pop %rbp c3 retq See: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122 This patch fixes this via s/inline/__always_inline/ on rep_nop() and cpu_relax(). ( Forcing inlining only on rep_nop() causes GCC to deinline cpu_relax(), with almost no change in generated code). text data bss dec hex filename 88118971 19905208 36421632 144445811 89c1173 vmlinux.before 88118139 19905208 36421632 144444979 89c0e33 vmlinux Signed-off-by: Denys Vlasenko Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1443096149-27291-1-git-send-email-dvlasenk@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 19577dd..b55f309 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -556,12 +556,12 @@ static inline unsigned int cpuid_edx(unsigned int op) } /* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) +static __always_inline void rep_nop(void) { asm volatile("rep; nop" ::: "memory"); } -static inline void cpu_relax(void) +static __always_inline void cpu_relax(void) { rep_nop(); } -- cgit v0.10.2 From a76cf66e948afbaeda8e3ecc861f29c47a026c27 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:49 -0700 Subject: x86/uaccess: Tell the compiler that uaccess is unlikely to fault GCC doesn't realize that get_user(), put_user(), and their __ variants are unlikely to fail. Tell it. I noticed this while playing with the C entry code. Before: text data bss dec filename 21828763 5194760 1277952 28301475 vmlinux.baseline After: text data bss dec filename 21828379 5194760 1277952 28301091 vmlinux.new The generated code shrunk by 384 bytes. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/dc37bed7024319c3004d950d57151fca6aeacf97.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a8df874..3e911c6 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -182,7 +182,7 @@ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) : "=a" (__ret_gu), "=r" (__val_gu) \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ - __ret_gu; \ + __builtin_expect(__ret_gu, 0); \ }) #define __put_user_x(size, x, ptr, __ret_pu) \ @@ -278,7 +278,7 @@ extern void __put_user_8(void); __put_user_x(X, __pu_val, ptr, __ret_pu); \ break; \ } \ - __ret_pu; \ + __builtin_expect(__ret_pu, 0); \ }) #define __put_user_size(x, ptr, size, retval, errret) \ @@ -401,7 +401,7 @@ do { \ ({ \ int __pu_err; \ __put_user_size((x), (ptr), (size), __pu_err, -EFAULT); \ - __pu_err; \ + __builtin_expect(__pu_err, 0); \ }) #define __get_user_nocheck(x, ptr, size) \ @@ -410,7 +410,7 @@ do { \ unsigned long __gu_val; \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ - __gu_err; \ + __builtin_expect(__gu_err, 0); \ }) /* FIXME: this hack is definitely wrong -AK */ -- cgit v0.10.2 From 7e0f51cb445be8d3aee80e433ed8da4a33ad0157 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:50 -0700 Subject: x86/uaccess: Add unlikely() to __chk_range_not_ok() failure paths This should improve code quality a bit. It also shrinks the kernel text: Before: text data bss dec filename 21828379 5194760 1277952 28301091 vmlinux After: text data bss dec filename 21827997 5194760 1277952 28300709 vmlinux ... by 382 bytes. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/f427b8002d932e5deab9055e0074bb4e7e80ee39.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 3e911c6..09b1b0a 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -51,13 +51,13 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un * limit, not add it to the address). */ if (__builtin_constant_p(size)) - return addr > limit - size; + return unlikely(addr > limit - size); /* Arbitrary sizes? Be careful about overflow */ addr += size; - if (addr < size) + if (unlikely(addr < size)) return true; - return addr > limit; + return unlikely(addr > limit); } #define __range_not_ok(addr, size, limit) \ -- cgit v0.10.2 From 3b56aae34bc695638b8673fc8459be1837c18730 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:51 -0700 Subject: selftests/x86: Add a test for vDSO unwinding While the kernel itself doesn't use DWARF unwinding, user code expects to be able to unwind the vDSO. The vsyscall (AT_SYSINFO) entry is manually CFI-annotated, and this tests that it unwinds correctly. I tested the test by incorrectly annotating __kernel_vsyscall, and the test indeed fails if I do that. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/8bf736d1925cdd165c0f980156a4248e55af47a1.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index fd55bc3..7541352 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -5,7 +5,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt -TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso +TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32) diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c new file mode 100644 index 0000000..5992ff2 --- /dev/null +++ b/tools/testing/selftests/x86/unwind_vdso.c @@ -0,0 +1,209 @@ +/* + * unwind_vdso.c - tests unwind info for AT_SYSINFO in the vDSO + * Copyright (c) 2014-2015 Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * This tests __kernel_vsyscall's unwind info. + */ + +#define _GNU_SOURCE + +#include +#include + +#if defined(__GLIBC__) && __GLIBC__ == 2 && __GLIBC_MINOR__ < 16 + +int main() +{ + /* We need getauxval(). */ + printf("[SKIP]\tGLIBC before 2.16 cannot compile this test\n"); + return 0; +} + +#else + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +#ifdef __x86_64__ +# define WIDTH "q" +#else +# define WIDTH "l" +#endif + +static unsigned long get_eflags(void) +{ + unsigned long eflags; + asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags)); + return eflags; +} + +static void set_eflags(unsigned long eflags) +{ + asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH + : : "rm" (eflags) : "flags"); +} + +#define X86_EFLAGS_TF (1UL << 8) + +static volatile sig_atomic_t nerrs; +static unsigned long sysinfo; +static bool got_sysinfo = false; +static unsigned long return_address; + +struct unwind_state { + unsigned long ip; /* trap source */ + int depth; /* -1 until we hit the trap source */ +}; + +_Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque) +{ + struct unwind_state *state = opaque; + unsigned long ip = _Unwind_GetIP(ctx); + + if (state->depth == -1) { + if (ip == state->ip) + state->depth = 0; + else + return _URC_NO_REASON; /* Not there yet */ + } + printf("\t 0x%lx\n", ip); + + if (ip == return_address) { + /* Here we are. */ + unsigned long eax = _Unwind_GetGR(ctx, 0); + unsigned long ecx = _Unwind_GetGR(ctx, 1); + unsigned long edx = _Unwind_GetGR(ctx, 2); + unsigned long ebx = _Unwind_GetGR(ctx, 3); + unsigned long ebp = _Unwind_GetGR(ctx, 5); + unsigned long esi = _Unwind_GetGR(ctx, 6); + unsigned long edi = _Unwind_GetGR(ctx, 7); + bool ok = (eax == SYS_getpid || eax == getpid()) && + ebx == 1 && ecx == 2 && edx == 3 && + esi == 4 && edi == 5 && ebp == 6; + + if (!ok) + nerrs++; + printf("[%s]\t NR = %ld, args = %ld, %ld, %ld, %ld, %ld, %ld\n", + (ok ? "OK" : "FAIL"), + eax, ebx, ecx, edx, esi, edi, ebp); + + return _URC_NORMAL_STOP; + } else { + state->depth++; + return _URC_NO_REASON; + } +} + +static void sigtrap(int sig, siginfo_t *info, void *ctx_void) +{ + ucontext_t *ctx = (ucontext_t*)ctx_void; + struct unwind_state state; + unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP]; + + if (!got_sysinfo && ip == sysinfo) { + got_sysinfo = true; + + /* Find the return address. */ + return_address = *(unsigned long *)(unsigned long)ctx->uc_mcontext.gregs[REG_ESP]; + + printf("\tIn vsyscall at 0x%lx, returning to 0x%lx\n", + ip, return_address); + } + + if (!got_sysinfo) + return; /* Not there yet */ + + if (ip == return_address) { + ctx->uc_mcontext.gregs[REG_EFL] &= ~X86_EFLAGS_TF; + printf("\tVsyscall is done\n"); + return; + } + + printf("\tSIGTRAP at 0x%lx\n", ip); + + state.ip = ip; + state.depth = -1; + _Unwind_Backtrace(trace_fn, &state); +} + +int main() +{ + sysinfo = getauxval(AT_SYSINFO); + printf("\tAT_SYSINFO is 0x%lx\n", sysinfo); + + Dl_info info; + if (!dladdr((void *)sysinfo, &info)) { + printf("[WARN]\tdladdr failed on AT_SYSINFO\n"); + } else { + printf("[OK]\tAT_SYSINFO maps to %s, loaded at 0x%p\n", + info.dli_fname, info.dli_fbase); + } + + sethandler(SIGTRAP, sigtrap, 0); + + syscall(SYS_getpid); /* Force symbol binding without TF set. */ + printf("[RUN]\tSet TF and check a fast syscall\n"); + set_eflags(get_eflags() | X86_EFLAGS_TF); + syscall(SYS_getpid, 1, 2, 3, 4, 5, 6); + if (!got_sysinfo) { + set_eflags(get_eflags() & ~X86_EFLAGS_TF); + + /* + * The most likely cause of this is that you're on Debian or + * a Debian-based distro, you're missing libc6-i686, and you're + * affected by libc/19006 (https://sourceware.org/PR19006). + */ + printf("[WARN]\tsyscall(2) didn't enter AT_SYSINFO\n"); + } if (get_eflags() & X86_EFLAGS_TF) { + printf("[FAIL]\tTF is still set\n"); + nerrs++; + } + + if (nerrs) { + printf("[FAIL]\tThere were errors\n"); + return 1; + } else { + printf("[OK]\tAll is well\n"); + return 0; + } +} + +#endif /* New enough libc */ -- cgit v0.10.2 From 04235c00b6bb72b589e99efcc18883378ee76f1b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:52 -0700 Subject: selftests/x86: Add a test for ptrace syscall restart and arg modification This tests assumptions about how fast syscall works wrt pt_regs and, in particular, what happens if IP is decremented by 2 during a syscall. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1c44dbfe59000ba135bbf35ccc5d2433a0b31618.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index 7541352..389701f 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,7 +4,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY) @@ -60,4 +60,5 @@ endif # Some tests have additional dependencies. sysret_ss_attrs_64: thunks.S +ptrace_syscall_32: raw_syscall_helper_32.S test_syscall_vdso_32: thunks_32.S diff --git a/tools/testing/selftests/x86/ptrace_syscall.c b/tools/testing/selftests/x86/ptrace_syscall.c new file mode 100644 index 0000000..5105b49 --- /dev/null +++ b/tools/testing/selftests/x86/ptrace_syscall.c @@ -0,0 +1,294 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Bitness-agnostic defines for user_regs_struct fields. */ +#ifdef __x86_64__ +# define user_syscall_nr orig_rax +# define user_arg0 rdi +# define user_arg1 rsi +# define user_arg2 rdx +# define user_arg3 r10 +# define user_arg4 r8 +# define user_arg5 r9 +# define user_ip rip +# define user_ax rax +#else +# define user_syscall_nr orig_eax +# define user_arg0 ebx +# define user_arg1 ecx +# define user_arg2 edx +# define user_arg3 esi +# define user_arg4 edi +# define user_arg5 ebp +# define user_ip eip +# define user_ax eax +#endif + +static int nerrs = 0; + +struct syscall_args32 { + uint32_t nr, arg0, arg1, arg2, arg3, arg4, arg5; +}; + +#ifdef __i386__ +extern void sys32_helper(struct syscall_args32 *, void *); +extern void int80_and_ret(void); +#endif + +/* + * Helper to invoke int80 with controlled regs and capture the final regs. + */ +static void do_full_int80(struct syscall_args32 *args) +{ +#ifdef __x86_64__ + register unsigned long bp asm("bp") = args->arg5; + asm volatile ("int $0x80" + : "+a" (args->nr), + "+b" (args->arg0), "+c" (args->arg1), "+d" (args->arg2), + "+S" (args->arg3), "+D" (args->arg4), "+r" (bp)); + args->arg5 = bp; +#else + sys32_helper(args, int80_and_ret); +#endif +} + +#ifdef __i386__ +static void (*vsyscall32)(void); + +/* + * Nasty helper to invoke AT_SYSINFO (i.e. __kernel_vsyscall) with + * controlled regs and capture the final regs. This is so nasty that it + * crashes my copy of gdb :) + */ +static void do_full_vsyscall32(struct syscall_args32 *args) +{ + sys32_helper(args, vsyscall32); +} +#endif + +static siginfo_t wait_trap(pid_t chld) +{ + siginfo_t si; + if (waitid(P_PID, chld, &si, WEXITED|WSTOPPED) != 0) + err(1, "waitid"); + if (si.si_pid != chld) + errx(1, "got unexpected pid in event\n"); + if (si.si_code != CLD_TRAPPED) + errx(1, "got unexpected event type %d\n", si.si_code); + return si; +} + +static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *), + int flags) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_sigaction = handler; + sa.sa_flags = SA_SIGINFO | flags; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +static void clearhandler(int sig) +{ + struct sigaction sa; + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = SIG_DFL; + sigemptyset(&sa.sa_mask); + if (sigaction(sig, &sa, 0)) + err(1, "sigaction"); +} + +#ifdef __x86_64__ +# define REG_BP REG_RBP +#else +# define REG_BP REG_EBP +#endif + +static void empty_handler(int sig, siginfo_t *si, void *ctx_void) +{ +} + +static void test_sys32_regs(void (*do_syscall)(struct syscall_args32 *)) +{ + struct syscall_args32 args = { + .nr = 224, /* gettid */ + .arg0 = 10, .arg1 = 11, .arg2 = 12, + .arg3 = 13, .arg4 = 14, .arg5 = 15, + }; + + do_syscall(&args); + + if (args.nr != getpid() || + args.arg0 != 10 || args.arg1 != 11 || args.arg2 != 12 || + args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) { + printf("[FAIL]\tgetpid() failed to preseve regs\n"); + nerrs++; + } else { + printf("[OK]\tgetpid() preserves regs\n"); + } + + sethandler(SIGUSR1, empty_handler, 0); + + args.nr = 37; /* kill */ + args.arg0 = getpid(); + args.arg1 = SIGUSR1; + do_syscall(&args); + if (args.nr != 0 || + args.arg0 != getpid() || args.arg1 != SIGUSR1 || args.arg2 != 12 || + args.arg3 != 13 || args.arg4 != 14 || args.arg5 != 15) { + printf("[FAIL]\tkill(getpid(), SIGUSR1) failed to preseve regs\n"); + nerrs++; + } else { + printf("[OK]\tkill(getpid(), SIGUSR1) preserves regs\n"); + } + clearhandler(SIGUSR1); +} + +static void test_ptrace_syscall_restart(void) +{ + printf("[RUN]\tptrace-induced syscall restart\n"); + pid_t chld = fork(); + if (chld < 0) + err(1, "fork"); + + if (chld == 0) { + if (ptrace(PTRACE_TRACEME, 0, 0, 0) != 0) + err(1, "PTRACE_TRACEME"); + + printf("\tChild will make one syscall\n"); + raise(SIGSTOP); + + syscall(SYS_gettid, 10, 11, 12, 13, 14, 15); + _exit(0); + } + + int status; + + /* Wait for SIGSTOP. */ + if (waitpid(chld, &status, 0) != chld || !WIFSTOPPED(status)) + err(1, "waitpid"); + + struct user_regs_struct regs; + + printf("[RUN]\tSYSEMU\n"); + if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_gettid || + regs.user_arg0 != 10 || regs.user_arg1 != 11 || + regs.user_arg2 != 12 || regs.user_arg3 != 13 || + regs.user_arg4 != 14 || regs.user_arg5 != 15) { + printf("[FAIL]\tInitial args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tInitial nr and args are correct\n"); + } + + printf("[RUN]\tRestart the syscall (ip = 0x%lx)\n", + (unsigned long)regs.user_ip); + + /* + * This does exactly what it appears to do if syscall is int80 or + * SYSCALL64. For SYSCALL32 or SYSENTER, though, this is highly + * magical. It needs to work so that ptrace and syscall restart + * work as expected. + */ + regs.user_ax = regs.user_syscall_nr; + regs.user_ip -= 2; + if (ptrace(PTRACE_SETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_SETREGS"); + + if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_gettid || + regs.user_arg0 != 10 || regs.user_arg1 != 11 || + regs.user_arg2 != 12 || regs.user_arg3 != 13 || + regs.user_arg4 != 14 || regs.user_arg5 != 15) { + printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tRestarted nr and args are correct\n"); + } + + printf("[RUN]\tChange nr and args and restart the syscall (ip = 0x%lx)\n", + (unsigned long)regs.user_ip); + + regs.user_ax = SYS_getpid; + regs.user_arg0 = 20; + regs.user_arg1 = 21; + regs.user_arg2 = 22; + regs.user_arg3 = 23; + regs.user_arg4 = 24; + regs.user_arg5 = 25; + regs.user_ip -= 2; + + if (ptrace(PTRACE_SETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_SETREGS"); + + if (ptrace(PTRACE_SYSEMU, chld, 0, 0) != 0) + err(1, "PTRACE_SYSCALL"); + wait_trap(chld); + + if (ptrace(PTRACE_GETREGS, chld, 0, ®s) != 0) + err(1, "PTRACE_GETREGS"); + + if (regs.user_syscall_nr != SYS_getpid || + regs.user_arg0 != 20 || regs.user_arg1 != 21 || regs.user_arg2 != 22 || + regs.user_arg3 != 23 || regs.user_arg4 != 24 || regs.user_arg5 != 25) { + printf("[FAIL]\tRestart nr or args are wrong (nr=%lu, args=%lu %lu %lu %lu %lu %lu)\n", (unsigned long)regs.user_syscall_nr, (unsigned long)regs.user_arg0, (unsigned long)regs.user_arg1, (unsigned long)regs.user_arg2, (unsigned long)regs.user_arg3, (unsigned long)regs.user_arg4, (unsigned long)regs.user_arg5); + nerrs++; + } else { + printf("[OK]\tReplacement nr and args are correct\n"); + } + + if (ptrace(PTRACE_CONT, chld, 0, 0) != 0) + err(1, "PTRACE_CONT"); + if (waitpid(chld, &status, 0) != chld) + err(1, "waitpid"); + if (!WIFEXITED(status) || WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed\n"); + nerrs++; + } else { + printf("[OK]\tChild exited cleanly\n"); + } +} + +int main() +{ + printf("[RUN]\tCheck int80 return regs\n"); + test_sys32_regs(do_full_int80); + +#if defined(__i386__) && (!defined(__GLIBC__) || __GLIBC__ > 2 || __GLIBC_MINOR__ >= 16) + vsyscall32 = (void *)getauxval(AT_SYSINFO); + printf("[RUN]\tCheck AT_SYSINFO return regs\n"); + test_sys32_regs(do_full_vsyscall32); +#endif + + test_ptrace_syscall_restart(); + + return 0; +} diff --git a/tools/testing/selftests/x86/raw_syscall_helper_32.S b/tools/testing/selftests/x86/raw_syscall_helper_32.S new file mode 100644 index 0000000..534e71e --- /dev/null +++ b/tools/testing/selftests/x86/raw_syscall_helper_32.S @@ -0,0 +1,46 @@ +.global sys32_helper +sys32_helper: + /* Args: syscall_args_32*, function pointer */ + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 5*4(%esp), %eax /* pointer to args struct */ + + movl 1*4(%eax), %ebx + movl 2*4(%eax), %ecx + movl 3*4(%eax), %edx + movl 4*4(%eax), %esi + movl 5*4(%eax), %edi + movl 6*4(%eax), %ebp + movl 0*4(%eax), %eax + + call *(6*4)(%esp) /* Do the syscall */ + + /* Now we need to recover without losing any reg values */ + pushl %eax + movl 6*4(%esp), %eax + popl 0*4(%eax) + movl %ebx, 1*4(%eax) + movl %ecx, 2*4(%eax) + movl %edx, 3*4(%eax) + movl %esi, 4*4(%eax) + movl %edi, 5*4(%eax) + movl %ebp, 6*4(%eax) + + popl %edi + popl %esi + popl %ebx + popl %ebp + ret + + .type sys32_helper, @function + .size sys32_helper, .-sys32_helper + +.global int80_and_ret +int80_and_ret: + int $0x80 + ret + + .type int80_and_ret, @function + .size int80_and_ret, .-int80_and_ret -- cgit v0.10.2 From dd27f998f0ed3c797032a82033fa191be7c61e4c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:53 -0700 Subject: x86/entry/64/compat: Fix SYSENTER's NT flag before user memory access Clearing NT is part of the prologue, whereas loading up arg6 makes more sense to think about as part of syscall processing. Reorder them. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/19eb235828b2d2a52c53459e09f2974e15e65a35.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index a9360d4..e2cca89 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -89,15 +89,6 @@ ENTRY(entry_SYSENTER_compat) sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ /* - * no need to do an access_ok check here because rbp has been - * 32-bit zero extended - */ - ASM_STAC -1: movl (%rbp), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - - /* * Sysenter doesn't filter flags, so we need to clear NT * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. @@ -106,6 +97,15 @@ ENTRY(entry_SYSENTER_compat) jnz sysenter_fix_flags sysenter_flags_fixed: + /* + * No need to do an access_ok() check here because RBP has been + * 32-bit zero extended: + */ + ASM_STAC +1: movl (%rbp), %ebp + _ASM_EXTABLE(1b, ia32_badarg) + ASM_CLAC + orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz sysenter_tracesys -- cgit v0.10.2 From 72f924783b8a87e4454516520ffb5f35e4930371 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:54 -0700 Subject: x86/entry, locking/lockdep: Move lockdep_sys_exit() to prepare_exit_to_usermode() Rather than worrying about exactly where LOCKDEP_SYS_EXIT should go in the asm code, add it to prepare_exit_from_usermode() and remove all of the asm calls that are followed by prepare_exit_to_usermode(). LOCKDEP_SYS_EXIT now appears only in the syscall fast paths. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/1736ebe948b845e68120b86b89091f3ec27f5e8e.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 80dcc92..d94a60c 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -220,6 +220,8 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) if (WARN_ON(!irqs_disabled())) local_irq_disable(); + lockdep_sys_exit(); + /* * In order to return to user mode, we need to have IRQs off with * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index b2909bf..a08ded4 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -255,7 +255,6 @@ ret_from_intr: jb resume_kernel # not returning to v8086 or userspace ENTRY(resume_userspace) - LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF movl %esp, %eax @@ -372,7 +371,6 @@ syscall_call: syscall_after_call: movl %eax, PT_EAX(%esp) # store the return value syscall_exit: - LOCKDEP_SYS_EXIT jmp syscall_exit_work restore_all: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 055a01d..7dc2850 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -557,7 +557,6 @@ ret_from_intr: jz retint_kernel /* Interrupt came from user space */ - LOCKDEP_SYS_EXIT_IRQ GLOBAL(retint_user) mov %rsp,%rdi call prepare_exit_to_usermode -- cgit v0.10.2 From b611acf4736b5b00c89dcc238f640337832abcb4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:55 -0700 Subject: x86/entry/64/compat: After SYSENTER, move STI after the NT fixup We eventually want to make it all the way into C code before enabling interrupts. We need to rework our flags handling slightly to delay enabling interrupts. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/35d24d2a9305da3182eab7b2cdfd32902e90962c.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index e2cca89..bc678f0 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -58,14 +58,9 @@ ENDPROC(native_usergs_sysret32) * with the int 0x80 path. */ ENTRY(entry_SYSENTER_compat) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ + /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ movl %ebp, %ebp @@ -76,7 +71,16 @@ ENTRY(entry_SYSENTER_compat) /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ pushq %rbp /* pt_regs->sp */ - pushfq /* pt_regs->flags */ + + /* + * Push flags. This is nasty. First, interrupts are currently + * off, but we need pt_regs->flags to have IF set. Second, even + * if TF was set when SYSENTER started, it's clear by now. We fix + * that later using TIF_SINGLESTEP. + */ + pushfq /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ + pushq $__USER32_CS /* pt_regs->cs */ pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ pushq %rax /* pt_regs->orig_ax */ @@ -92,12 +96,22 @@ ENTRY(entry_SYSENTER_compat) * Sysenter doesn't filter flags, so we need to clear NT * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. + * This needs to happen before enabling interrupts so that + * we don't get preempted with NT set. */ testl $X86_EFLAGS_NT, EFLAGS(%rsp) jnz sysenter_fix_flags sysenter_flags_fixed: /* + * Re-enable interrupts. IRQ tracing already thinks that IRQs are + * on (since we treat user mode as having IRQs on), and the + * prologue above is too short for it to be worth adding a + * tracing round trip. + */ + ENABLE_INTERRUPTS(CLBR_NONE) + + /* * No need to do an access_ok() check here because RBP has been * 32-bit zero extended: */ @@ -244,7 +258,7 @@ sysexit_audit: #endif sysenter_fix_flags: - pushq $(X86_EFLAGS_IF|X86_EFLAGS_FIXED) + pushq $X86_EFLAGS_FIXED popfq jmp sysenter_flags_fixed -- cgit v0.10.2 From 0a6d1fa0d2b48fbae444e46e7f37a4832b2f8bdf Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:56 -0700 Subject: x86/vdso: Remove runtime 32-bit vDSO selection 32-bit userspace will now always see the same vDSO, which is exactly what used to be the int80 vDSO. Subsequent patches will clean it up and make it support SYSENTER and SYSCALL using alternatives. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/e7e6b3526fa442502e6125fe69486aab50813c32.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index a3d0767..3bfb39e 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -19,9 +19,7 @@ obj-y += vma.o # vDSO images to build vdso_img-$(VDSO64-y) += 64 vdso_img-$(VDSOX32-y) += x32 -vdso_img-$(VDSO32-y) += 32-int80 -vdso_img-$(CONFIG_IA32_EMULATION) += 32-syscall -vdso_img-$(VDSO32-y) += 32-sysenter +vdso_img-$(VDSO32-y) += 32 obj-$(VDSO32-y) += vdso32-setup.o @@ -122,15 +120,6 @@ $(obj)/%.so: $(obj)/%.so.dbg $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE $(call if_changed,vdso) -# -# Build multiple 32-bit vDSO images to choose from at boot time. -# -vdso32.so-$(VDSO32-y) += int80 -vdso32.so-$(CONFIG_IA32_EMULATION) += syscall -vdso32.so-$(VDSO32-y) += sysenter - -vdso32-images = $(vdso32.so-y:%=vdso32-%.so) - CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 @@ -139,14 +128,12 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) +targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o targets += vdso32/vclock_gettime.o -$(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) - KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): asflags-$(CONFIG_X86_64) += -m32 +$(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) +$(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 KBUILD_CFLAGS_32 := $(filter-out -m64,$(KBUILD_CFLAGS)) KBUILD_CFLAGS_32 := $(filter-out -mcmodel=kernel,$(KBUILD_CFLAGS_32)) @@ -157,13 +144,13 @@ KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING -$(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) +$(obj)/vdso32.so.dbg: KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) -$(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ - $(obj)/vdso32/vdso32.lds \ - $(obj)/vdso32/vclock_gettime.o \ - $(obj)/vdso32/note.o \ - $(obj)/vdso32/%.o +$(obj)/vdso32.so.dbg: FORCE \ + $(obj)/vdso32/vdso32.lds \ + $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/note.o \ + $(obj)/vdso32/system_call.o $(call if_changed,vdso) # @@ -206,4 +193,4 @@ $(vdso_img_insttargets): install_%: $(obj)/%.dbg $(MODLIB)/vdso FORCE PHONY += vdso_install $(vdso_img_insttargets) vdso_install: $(vdso_img_insttargets) FORCE -clean-files := vdso32-syscall* vdso32-sysenter* vdso32-int80* vdso64* vdso-image-*.c vdsox32.so* +clean-files := vdso32.so vdso32.so.dbg vdso64* vdso-image-*.c vdsox32.so* diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 8627db2..2637eb1 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -98,7 +98,6 @@ struct vdso_sym required_syms[] = { "VDSO_FAKE_SECTION_TABLE_END", false }, {"VDSO32_NOTE_MASK", true}, - {"VDSO32_SYSENTER_RETURN", true}, {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, diff --git a/arch/x86/entry/vdso/vdso32-setup.c b/arch/x86/entry/vdso/vdso32-setup.c index e904c27..08a317a 100644 --- a/arch/x86/entry/vdso/vdso32-setup.c +++ b/arch/x86/entry/vdso/vdso32-setup.c @@ -48,35 +48,9 @@ __setup("vdso32=", vdso32_setup); __setup_param("vdso=", vdso_setup, vdso32_setup, 0); #endif -#ifdef CONFIG_X86_64 - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SYSENTER32)) -#define vdso32_syscall() (boot_cpu_has(X86_FEATURE_SYSCALL32)) - -#else /* CONFIG_X86_32 */ - -#define vdso32_sysenter() (boot_cpu_has(X86_FEATURE_SEP)) -#define vdso32_syscall() (0) - -#endif /* CONFIG_X86_64 */ - -#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT) -const struct vdso_image *selected_vdso32; -#endif - int __init sysenter_setup(void) { -#ifdef CONFIG_COMPAT - if (vdso32_syscall()) - selected_vdso32 = &vdso_image_32_syscall; - else -#endif - if (vdso32_sysenter()) - selected_vdso32 = &vdso_image_32_sysenter; - else - selected_vdso32 = &vdso_image_32_int80; - - init_vdso_image(selected_vdso32); + init_vdso_image(&vdso_image_32); return 0; } diff --git a/arch/x86/entry/vdso/vdso32/int80.S b/arch/x86/entry/vdso/vdso32/int80.S deleted file mode 100644 index b15b7c0..0000000 --- a/arch/x86/entry/vdso/vdso32/int80.S +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Code for the vDSO. This version uses the old int $0x80 method. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - int $0x80 - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 - .previous diff --git a/arch/x86/entry/vdso/vdso32/syscall.S b/arch/x86/entry/vdso/vdso32/syscall.S deleted file mode 100644 index 6b286bb..0000000 --- a/arch/x86/entry/vdso/vdso32/syscall.S +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Code for the vDSO. This version uses the syscall instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#define SYSCALL_ENTER_KERNEL syscall -#include "sigreturn.S" - -#include - - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ebp -.Lpush_ebp: - movl %ecx, %ebp - syscall - movl %ebp, %ecx - popl %ebp -.Lpop_ebp: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - - .section .eh_frame,"a",@progbits -.LSTARTFRAME: - .long .LENDCIE-.LSTARTCIE -.LSTARTCIE: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIE: - - .long .LENDFDE1-.LSTARTFDE1 /* Length FDE */ -.LSTARTFDE1: - .long .LSTARTFDE1-.LSTARTFRAME /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 /* Augmentation length */ - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + .Lpush_ebp-.LSTART_vsyscall /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 8 - .byte 0x85, 0x02 /* DW_CFA_offset %ebp -8 */ - .byte 0x40 + .Lpop_ebp-.Lpush_ebp /* DW_CFA_advance_loc */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .uleb128 4 - .align 4 -.LENDFDE1: - .previous - - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDE1-.LSTARTFRAME), 0 - .previous diff --git a/arch/x86/entry/vdso/vdso32/sysenter.S b/arch/x86/entry/vdso/vdso32/sysenter.S deleted file mode 100644 index e354bce..0000000 --- a/arch/x86/entry/vdso/vdso32/sysenter.S +++ /dev/null @@ -1,116 +0,0 @@ -/* - * Code for the vDSO. This version uses the sysenter instruction. - * - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - -/* - * The caller puts arg2 in %ecx, which gets pushed. The kernel will use - * %ecx itself for arg2. The pushing is because the sysexit instruction - * (found in entry.S) requires that we clobber %ecx with the desired %esp. - * User code might expect that %ecx is unclobbered though, as it would be - * for returning via the iret instruction, so we must push and pop. - * - * The caller puts arg3 in %edx, which the sysexit instruction requires - * for %eip. Thus, exactly as for arg2, we must push and pop. - * - * Arg6 is different. The caller puts arg6 in %ebp. Since the sysenter - * instruction clobbers %esp, the user's %esp won't even survive entry - * into the kernel. We store %esp in %ebp. Code in entry.S must fetch - * arg6 from the stack. - * - * You can not use this vsyscall for the clone() syscall because the - * three words on the parent stack do not get copied to the child. - */ - .text - .globl __kernel_vsyscall - .type __kernel_vsyscall,@function - ALIGN -__kernel_vsyscall: -.LSTART_vsyscall: - push %ecx -.Lpush_ecx: - push %edx -.Lpush_edx: - push %ebp -.Lenter_kernel: - movl %esp,%ebp - sysenter - - /* 7: align return point with nop's to make disassembly easier */ - .space 7,0x90 - - /* 14: System call restart point is here! (SYSENTER_RETURN-2) */ - int $0x80 - /* 16: System call normal return point is here! */ -VDSO32_SYSENTER_RETURN: /* Symbol used by sysenter.c via vdso32-syms.h */ - pop %ebp -.Lpop_ebp: - pop %edx -.Lpop_edx: - pop %ecx -.Lpop_ecx: - ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - /* What follows are the instructions for the table generation. - We have to record all changes of the stack pointer. */ - .byte 0x40 + (.Lpush_ecx-.LSTART_vsyscall) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpush_edx-.Lpush_ecx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0x40 + (.Lenter_kernel-.Lpush_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x10 /* RA at offset 16 now */ - .byte 0x85, 0x04 /* DW_CFA_offset %ebp -16 */ - /* Finally the epilogue. */ - .byte 0x40 + (.Lpop_ebp-.Lenter_kernel) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x0c /* RA at offset 12 now */ - .byte 0xc5 /* DW_CFA_restore %ebp */ - .byte 0x40 + (.Lpop_edx-.Lpop_ebp) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x08 /* RA at offset 8 now */ - .byte 0x40 + (.Lpop_ecx-.Lpop_edx) /* DW_CFA_advance_loc */ - .byte 0x0e /* DW_CFA_def_cfa_offset */ - .byte 0x04 /* RA at offset 4 now */ - .align 4 -.LENDFDEDLSI: - .previous - - /* - * Emit a symbol with the size of this .eh_frame data, - * to verify it matches the other versions. - */ -VDSO32_vsyscall_eh_frame_size = (.LENDFDEDLSI-.LSTARTFRAMEDLSI) diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S new file mode 100644 index 0000000..b15b7c0 --- /dev/null +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -0,0 +1,56 @@ +/* + * Code for the vDSO. This version uses the old int $0x80 method. + * + * First get the common code for the sigreturn entry points. + * This must come first. + */ +#include "sigreturn.S" + + .text + .globl __kernel_vsyscall + .type __kernel_vsyscall,@function + ALIGN +__kernel_vsyscall: +.LSTART_vsyscall: + int $0x80 + ret +.LEND_vsyscall: + .size __kernel_vsyscall,.-.LSTART_vsyscall + .previous + + .section .eh_frame,"a",@progbits +.LSTARTFRAMEDLSI: + .long .LENDCIEDLSI-.LSTARTCIEDLSI +.LSTARTCIEDLSI: + .long 0 /* CIE ID */ + .byte 1 /* Version number */ + .string "zR" /* NUL-terminated augmentation string */ + .uleb128 1 /* Code alignment factor */ + .sleb128 -4 /* Data alignment factor */ + .byte 8 /* Return address register column */ + .uleb128 1 /* Augmentation value length */ + .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ + .byte 0x0c /* DW_CFA_def_cfa */ + .uleb128 4 + .uleb128 4 + .byte 0x88 /* DW_CFA_offset, column 0x8 */ + .uleb128 1 + .align 4 +.LENDCIEDLSI: + .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ +.LSTARTFDEDLSI: + .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ + .long .LSTART_vsyscall-. /* PC-relative start address */ + .long .LEND_vsyscall-.LSTART_vsyscall + .uleb128 0 + .align 4 +.LENDFDEDLSI: + .previous + + /* + * Pad out the segment to match the size of the sysenter.S version. + */ +VDSO32_vsyscall_eh_frame_size = 0x40 + .section .data,"aw",@progbits + .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .previous diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 4345431..64df471 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -180,21 +180,10 @@ up_fail: #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) static int load_vdso32(void) { - int ret; - if (vdso32_enabled != 1) /* Other values all mean "disabled" */ return 0; - ret = map_vdso(selected_vdso32, false); - if (ret) - return ret; - - if (selected_vdso32->sym_VDSO32_SYSENTER_RETURN) - current_thread_info()->sysenter_return = - current->mm->context.vdso + - selected_vdso32->sym_VDSO32_SYSENTER_RETURN; - - return 0; + return map_vdso(&vdso_image_32, false); } #endif diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index a0a19b7..e6a5c275 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -289,7 +289,7 @@ int ia32_setup_frame(int sig, struct ksignal *ksig, /* Return stub is in 32bit vsyscall page */ if (current->mm->context.vdso) restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + vdso_image_32.sym___kernel_sigreturn; else restorer = &frame->retcode; } @@ -368,7 +368,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig, restorer = ksig->ka.sa.sa_restorer; else restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_rt_sigreturn; + vdso_image_32.sym___kernel_rt_sigreturn; put_user_ex(ptr_to_compat(restorer), &frame->pretcode); /* diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 141c561..2ee05c4 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -328,7 +328,7 @@ else \ #define VDSO_ENTRY \ ((unsigned long)current->mm->context.vdso + \ - selected_vdso32->sym___kernel_vsyscall) + vdso_image_32.sym___kernel_vsyscall) struct linux_binprm; diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 8021bd2..5bcb1de 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -26,7 +26,6 @@ struct vdso_image { long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; - long sym_VDSO32_SYSENTER_RETURN; }; #ifdef CONFIG_X86_64 @@ -38,13 +37,7 @@ extern const struct vdso_image vdso_image_x32; #endif #if defined CONFIG_X86_32 || defined CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_int80; -#ifdef CONFIG_COMPAT -extern const struct vdso_image vdso_image_32_syscall; -#endif -extern const struct vdso_image vdso_image_32_sysenter; - -extern const struct vdso_image *selected_vdso32; +extern const struct vdso_image vdso_image_32; #endif extern void __init init_vdso_image(const struct vdso_image *image); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index da52e6b..d87ce92 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -299,7 +299,7 @@ __setup_frame(int sig, struct ksignal *ksig, sigset_t *set, if (current->mm->context.vdso) restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + vdso_image_32.sym___kernel_sigreturn; else restorer = &frame->retcode; if (ksig->ka.sa.sa_flags & SA_RESTORER) @@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. */ restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_rt_sigreturn; + vdso_image_32.sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 1c30e4a..63320b6 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -965,17 +965,8 @@ char * __init xen_auto_xlated_memory_setup(void) static void __init fiddle_vdso(void) { #ifdef CONFIG_X86_32 - /* - * This could be called before selected_vdso32 is initialized, so - * just fiddle with both possible images. vdso_image_32_syscall - * can't be selected, since it only exists on 64-bit systems. - */ - u32 *mask; - mask = vdso_image_32_int80.data + - vdso_image_32_int80.sym_VDSO32_NOTE_MASK; - *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; - mask = vdso_image_32_sysenter.data + - vdso_image_32_sysenter.sym_VDSO32_NOTE_MASK; + u32 *mask = vdso_image_32.data + + vdso_image_32.sym_VDSO32_NOTE_MASK; *mask |= 1 << VDSO_NOTE_NONEGSEG_BIT; #endif } -- cgit v0.10.2 From 7b956f035a9ef8bd3ef5490f49fc1bd834d8a70a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:57 -0700 Subject: x86/asm: Re-add parts of the manual CFI infrastructure Commit: 131484c8da97 ("x86/debug: Remove perpetually broken, unmaintainable dwarf annotations") removed all the manual DWARF annotations outside the vDSO. It also removed the macros we used for the manual annotations. Re-add these macros so that we can clean up the vDSO annotations. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/4c70bb98a8b773c8ccfaabf6745e569ff43e7f65.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/Makefile b/arch/x86/Makefile index 747860c..2dfaa72 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -159,6 +159,12 @@ endif sp-$(CONFIG_X86_32) := esp sp-$(CONFIG_X86_64) := rsp +# do binutils support CFI? +cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_endproc,-DCONFIG_AS_CFI=1) +# is .cfi_signal_frame supported too? +cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) +cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) + # does binutils support specific instructions? asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1) asinstr += $(call as-instr,pshufb %xmm0$(comma)%xmm0,-DCONFIG_AS_SSSE3=1) @@ -166,8 +172,8 @@ asinstr += $(call as-instr,crc32l %eax$(comma)%eax,-DCONFIG_AS_CRC32=1) avx_instr := $(call as-instr,vxorps %ymm0$(comma)%ymm1$(comma)%ymm2,-DCONFIG_AS_AVX=1) avx2_instr :=$(call as-instr,vpbroadcastb %xmm0$(comma)%ymm1,-DCONFIG_AS_AVX2=1) -KBUILD_AFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) -KBUILD_CFLAGS += $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) +KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr) $(avx_instr) $(avx2_instr) LDFLAGS := -m elf_$(UTS_MACHINE) diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h new file mode 100644 index 0000000..de1cdaf --- /dev/null +++ b/arch/x86/include/asm/dwarf2.h @@ -0,0 +1,170 @@ +#ifndef _ASM_X86_DWARF2_H +#define _ASM_X86_DWARF2_H + +#ifndef __ASSEMBLY__ +#warning "asm/dwarf2.h should be only included in pure assembly files" +#endif + +/* + * Macros for dwarf2 CFI unwind table entries. + * See "as.info" for details on these pseudo ops. Unfortunately + * they are only supported in very new binutils, so define them + * away for older version. + */ + +#ifdef CONFIG_AS_CFI + +#define CFI_STARTPROC .cfi_startproc +#define CFI_ENDPROC .cfi_endproc +#define CFI_DEF_CFA .cfi_def_cfa +#define CFI_DEF_CFA_REGISTER .cfi_def_cfa_register +#define CFI_DEF_CFA_OFFSET .cfi_def_cfa_offset +#define CFI_ADJUST_CFA_OFFSET .cfi_adjust_cfa_offset +#define CFI_OFFSET .cfi_offset +#define CFI_REL_OFFSET .cfi_rel_offset +#define CFI_REGISTER .cfi_register +#define CFI_RESTORE .cfi_restore +#define CFI_REMEMBER_STATE .cfi_remember_state +#define CFI_RESTORE_STATE .cfi_restore_state +#define CFI_UNDEFINED .cfi_undefined +#define CFI_ESCAPE .cfi_escape + +#ifdef CONFIG_AS_CFI_SIGNAL_FRAME +#define CFI_SIGNAL_FRAME .cfi_signal_frame +#else +#define CFI_SIGNAL_FRAME +#endif + +#if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) + /* + * Emit CFI data in .debug_frame sections, not .eh_frame sections. + * The latter we currently just discard since we don't do DWARF + * unwinding at runtime. So only the offline DWARF information is + * useful to anyone. Note we should not use this directive if this + * file is used in the vDSO assembly, or if vmlinux.lds.S gets + * changed so it doesn't discard .eh_frame. + */ + .cfi_sections .debug_frame +#endif + +#else + +/* + * Due to the structure of pre-exisiting code, don't use assembler line + * comment character # to ignore the arguments. Instead, use a dummy macro. + */ +.macro cfi_ignore a=0, b=0, c=0, d=0 +.endm + +#define CFI_STARTPROC cfi_ignore +#define CFI_ENDPROC cfi_ignore +#define CFI_DEF_CFA cfi_ignore +#define CFI_DEF_CFA_REGISTER cfi_ignore +#define CFI_DEF_CFA_OFFSET cfi_ignore +#define CFI_ADJUST_CFA_OFFSET cfi_ignore +#define CFI_OFFSET cfi_ignore +#define CFI_REL_OFFSET cfi_ignore +#define CFI_REGISTER cfi_ignore +#define CFI_RESTORE cfi_ignore +#define CFI_REMEMBER_STATE cfi_ignore +#define CFI_RESTORE_STATE cfi_ignore +#define CFI_UNDEFINED cfi_ignore +#define CFI_ESCAPE cfi_ignore +#define CFI_SIGNAL_FRAME cfi_ignore + +#endif + +/* + * An attempt to make CFI annotations more or less + * correct and shorter. It is implied that you know + * what you're doing if you use them. + */ +#ifdef __ASSEMBLY__ +#ifdef CONFIG_X86_64 + .macro pushq_cfi reg + pushq \reg + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro pushq_cfi_reg reg + pushq %\reg + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET \reg, 0 + .endm + + .macro popq_cfi reg + popq \reg + CFI_ADJUST_CFA_OFFSET -8 + .endm + + .macro popq_cfi_reg reg + popq %\reg + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE \reg + .endm + + .macro pushfq_cfi + pushfq + CFI_ADJUST_CFA_OFFSET 8 + .endm + + .macro popfq_cfi + popfq + CFI_ADJUST_CFA_OFFSET -8 + .endm + + .macro movq_cfi reg offset=0 + movq %\reg, \offset(%rsp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movq_cfi_restore offset reg + movq \offset(%rsp), %\reg + CFI_RESTORE \reg + .endm +#else /*!CONFIG_X86_64*/ + .macro pushl_cfi reg + pushl \reg + CFI_ADJUST_CFA_OFFSET 4 + .endm + + .macro pushl_cfi_reg reg + pushl %\reg + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET \reg, 0 + .endm + + .macro popl_cfi reg + popl \reg + CFI_ADJUST_CFA_OFFSET -4 + .endm + + .macro popl_cfi_reg reg + popl %\reg + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE \reg + .endm + + .macro pushfl_cfi + pushfl + CFI_ADJUST_CFA_OFFSET 4 + .endm + + .macro popfl_cfi + popfl + CFI_ADJUST_CFA_OFFSET -4 + .endm + + .macro movl_cfi reg offset=0 + movl %\reg, \offset(%esp) + CFI_REL_OFFSET \reg, \offset + .endm + + .macro movl_cfi_restore offset reg + movl \offset(%esp), %\reg + CFI_RESTORE \reg + .endm +#endif /*!CONFIG_X86_64*/ +#endif /*__ASSEMBLY__*/ + +#endif /* _ASM_X86_DWARF2_H */ -- cgit v0.10.2 From f24f910884277aa6824bbc2dda4b5d0418d45c28 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:58 -0700 Subject: x86/vdso: Define BUILD_VDSO while building and emit .eh_frame in asm For the vDSO, user code wants runtime unwind info. Make sure that, if we use .cfi directives, we generate it. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/16e29ad8855e6508197000d8c41f56adb00d7580.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 3bfb39e..265c0ed 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -67,7 +67,7 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso%.so $(obj)/vdso2c FORCE CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ -fno-omit-frame-pointer -foptimize-sibling-calls \ - -DDISABLE_BRANCH_PROFILING + -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO $(vobjs): KBUILD_CFLAGS += $(CFL) @@ -131,7 +131,7 @@ targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o targets += vdso32/vclock_gettime.o -KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) +KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO $(obj)/vdso32.so.dbg: KBUILD_AFLAGS = $(KBUILD_AFLAGS_32) $(obj)/vdso32.so.dbg: asflags-$(CONFIG_X86_64) += -m32 diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index de1cdaf..09133ba 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -36,15 +36,22 @@ #endif #if defined(CONFIG_AS_CFI_SECTIONS) && defined(__ASSEMBLY__) +#ifndef BUILD_VDSO /* * Emit CFI data in .debug_frame sections, not .eh_frame sections. * The latter we currently just discard since we don't do DWARF * unwinding at runtime. So only the offline DWARF information is - * useful to anyone. Note we should not use this directive if this - * file is used in the vDSO assembly, or if vmlinux.lds.S gets - * changed so it doesn't discard .eh_frame. + * useful to anyone. Note we should not use this directive if + * vmlinux.lds.S gets changed so it doesn't discard .eh_frame. */ .cfi_sections .debug_frame +#else + /* + * For the vDSO, emit both runtime unwind information and debug + * symbols for the .dbg file. + */ + .cfi_sections .eh_frame, .debug_frame +#endif #endif #else -- cgit v0.10.2 From 29c0ce9508458ed31a9db2ed425f64c0d6d3ddfb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:47:59 -0700 Subject: x86/vdso: Replace hex int80 CFI annotations with GAS directives Maintaining the current CFI annotations written in R'lyehian is difficult for most of us. Translate them to something a little closer to English. This will remove the CFI data for kernels built with extremely old versions of binutils. I think this is a fair tradeoff for the ability for mortals to edit the asm. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/ae3ff4ff5278b4bfc1e1dab368823469866d4b71.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index b15b7c0..b52cbfb 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -1,6 +1,10 @@ /* * Code for the vDSO. This version uses the old int $0x80 method. - * +*/ + +#include + +/* * First get the common code for the sigreturn entry points. * This must come first. */ @@ -11,46 +15,10 @@ .type __kernel_vsyscall,@function ALIGN __kernel_vsyscall: -.LSTART_vsyscall: + CFI_STARTPROC int $0x80 ret -.LEND_vsyscall: - .size __kernel_vsyscall,.-.LSTART_vsyscall - .previous - - .section .eh_frame,"a",@progbits -.LSTARTFRAMEDLSI: - .long .LENDCIEDLSI-.LSTARTCIEDLSI -.LSTARTCIEDLSI: - .long 0 /* CIE ID */ - .byte 1 /* Version number */ - .string "zR" /* NUL-terminated augmentation string */ - .uleb128 1 /* Code alignment factor */ - .sleb128 -4 /* Data alignment factor */ - .byte 8 /* Return address register column */ - .uleb128 1 /* Augmentation value length */ - .byte 0x1b /* DW_EH_PE_pcrel|DW_EH_PE_sdata4. */ - .byte 0x0c /* DW_CFA_def_cfa */ - .uleb128 4 - .uleb128 4 - .byte 0x88 /* DW_CFA_offset, column 0x8 */ - .uleb128 1 - .align 4 -.LENDCIEDLSI: - .long .LENDFDEDLSI-.LSTARTFDEDLSI /* Length FDE */ -.LSTARTFDEDLSI: - .long .LSTARTFDEDLSI-.LSTARTFRAMEDLSI /* CIE pointer */ - .long .LSTART_vsyscall-. /* PC-relative start address */ - .long .LEND_vsyscall-.LSTART_vsyscall - .uleb128 0 - .align 4 -.LENDFDEDLSI: - .previous + CFI_ENDPROC - /* - * Pad out the segment to match the size of the sysenter.S version. - */ -VDSO32_vsyscall_eh_frame_size = 0x40 - .section .data,"aw",@progbits - .space VDSO32_vsyscall_eh_frame_size-(.LENDFDEDLSI-.LSTARTFRAMEDLSI), 0 + .size __kernel_vsyscall,.-__kernel_vsyscall .previous -- cgit v0.10.2 From 7bcdea4d050cbe4912854a68b93494203eec8b24 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:00 -0700 Subject: x86/elf/64: Clear more registers in elf_common_init() Before we start calling execve in contexts that honor the full pt_regs, we need to teach it to initialize all registers. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/65a38a9edee61a1158cfd230800c61dbd963dac5.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 2ee05c4..1514753 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -171,11 +171,11 @@ do { \ static inline void elf_common_init(struct thread_struct *t, struct pt_regs *regs, const u16 ds) { - /* Commented-out registers are cleared in stub_execve */ - /*regs->ax = regs->bx =*/ regs->cx = regs->dx = 0; - regs->si = regs->di /*= regs->bp*/ = 0; + /* ax gets execve's return value. */ + /*regs->ax = */ regs->bx = regs->cx = regs->dx = 0; + regs->si = regs->di = regs->bp = 0; regs->r8 = regs->r9 = regs->r10 = regs->r11 = 0; - /*regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0;*/ + regs->r12 = regs->r13 = regs->r14 = regs->r15 = 0; t->fs = t->gs = 0; t->fsindex = t->gsindex = 0; t->ds = t->es = ds; -- cgit v0.10.2 From 8242c6c84a644e5f0f721e4ae2bd542f640c89f9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:01 -0700 Subject: x86/vdso/32: Save extra registers in the INT80 vsyscall path The goal is to integrate the SYSENTER and SYSCALL32 entry paths with the INT80 path. SYSENTER clobbers ESP and EIP. SYSCALL32 clobbers ECX (and, invisibly, R11). SYSRETL (long mode to compat mode) clobbers ECX and, invisibly, R11. SYSEXIT (which we only need for native 32-bit) clobbers ECX and EDX. This means that we'll need to provide ESP to the kernel in a register (I chose ECX, since it's only needed for SYSENTER) and we need to provide the args that normally live in ECX and EDX in memory. The epilogue needs to restore ECX and EDX, since user code relies on regs being preserved. We don't need to do anything special about EIP, since the kernel already knows where we are. The kernel will eventually need to know where int $0x80 lands, so add a vdso_image entry for it. The only user-visible effect of this code is that ptrace-induced changes to ECX and EDX during fast syscalls will be lost. This is already the case for the SYSENTER path. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/b860925adbee2d2627a0671fbfe23a7fd04127f8.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vdso2c.c b/arch/x86/entry/vdso/vdso2c.c index 2637eb1..785d992 100644 --- a/arch/x86/entry/vdso/vdso2c.c +++ b/arch/x86/entry/vdso/vdso2c.c @@ -101,6 +101,7 @@ struct vdso_sym required_syms[] = { {"__kernel_vsyscall", true}, {"__kernel_sigreturn", true}, {"__kernel_rt_sigreturn", true}, + {"int80_landing_pad", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index b52cbfb..d591fe9 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -16,7 +16,30 @@ ALIGN __kernel_vsyscall: CFI_STARTPROC - int $0x80 + /* + * Reshuffle regs so that all of any of the entry instructions + * will preserve enough state. + */ + pushl %edx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET edx, 0 + pushl %ecx + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET ecx, 0 + movl %esp, %ecx + + /* Enter using int $0x80 */ + movl (%esp), %ecx + int $0x80 +GLOBAL(int80_landing_pad) + + /* Restore ECX and EDX in case they were clobbered. */ + popl %ecx + CFI_RESTORE ecx + CFI_ADJUST_CFA_OFFSET -4 + popl %edx + CFI_RESTORE edx + CFI_ADJUST_CFA_OFFSET -4 ret CFI_ENDPROC diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 5bcb1de..756de91 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -26,6 +26,7 @@ struct vdso_image { long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; long sym___kernel_vsyscall; + long sym_int80_landing_pad; }; #ifdef CONFIG_X86_64 -- cgit v0.10.2 From e62a254a1f93fcc7299497a5c7231639400b8c3c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:02 -0700 Subject: x86/entry/64/compat: Disable SYSENTER and SYSCALL32 entries We've disabled the vDSO helpers to call them, so turn off the entries entirely (temporarily) in preparation for cleaning them up. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/8d6e84bf651519289dc532dcc230adfabbd2a3eb.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index bc678f0..06a8966 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -103,6 +103,14 @@ ENTRY(entry_SYSENTER_compat) jnz sysenter_fix_flags sysenter_flags_fixed: + /* Temporary: SYSENTER is disabled. */ +#ifdef CONFIG_CONTEXT_TRACKING + call enter_from_user_mode +#endif + ENABLE_INTERRUPTS(CLBR_NONE) + movl $11, %edi + call do_exit + /* * Re-enable interrupts. IRQ tracing already thinks that IRQs are * on (since we treat user mode as having IRQs on), and the @@ -324,6 +332,11 @@ ENTRY(entry_SYSCALL_compat) * it is too small to ever cause noticeable irq latency. */ SWAPGS_UNSAFE_STACK + + /* Temporary: SYSCALL32 is disabled. */ + movl $-ENOSYS, %eax + USERGS_SYSRET32 + movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp ENABLE_INTERRUPTS(CLBR_NONE) -- cgit v0.10.2 From c5f638ac90d514202155c87aa58730e86d484d9e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:03 -0700 Subject: x86/entry/64/compat: Remove audit optimizations These audit optimizations are messy and hard to maintain. We'll get a similar effect from opportunistic sysret when fast compat system calls are re-implemented. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/0bcca79ac7ff835d0e5a38725298865b01347a82.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 06a8966..75f5fcf 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -16,16 +16,6 @@ #include #include -/* Avoid __ASSEMBLER__'ifying just for this. */ -#include -#define AUDIT_ARCH_I386 (EM_386|__AUDIT_ARCH_LE) -#define __AUDIT_ARCH_LE 0x40000000 - -#ifndef CONFIG_AUDITSYSCALL -# define sysexit_audit ia32_ret_from_sys_call_irqs_off -# define sysretl_audit ia32_ret_from_sys_call_irqs_off -#endif - .section .entry.text, "ax" #ifdef CONFIG_PARAVIRT @@ -148,7 +138,7 @@ sysenter_dispatch: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysexit_audit + jnz ia32_ret_from_sys_call_irqs_off sysexit_from_sys_call: /* * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an @@ -200,81 +190,12 @@ sysexit_from_sys_call: */ USERGS_SYSRET32 -#ifdef CONFIG_AUDITSYSCALL - .macro auditsys_entry_common - /* - * At this point, registers hold syscall args in the 32-bit syscall ABI: - * EAX is syscall number, the 6 args are in EBX,ECX,EDX,ESI,EDI,EBP. - * - * We want to pass them to __audit_syscall_entry(), which is a 64-bit - * C function with 5 parameters, so shuffle them to match what - * the function expects: RDI,RSI,RDX,RCX,R8. - */ - movl %esi, %r8d /* arg5 (R8 ) <= 4th syscall arg (ESI) */ - xchg %ecx, %edx /* arg4 (RCX) <= 3rd syscall arg (EDX) */ - /* arg3 (RDX) <= 2nd syscall arg (ECX) */ - movl %ebx, %esi /* arg2 (RSI) <= 1st syscall arg (EBX) */ - movl %eax, %edi /* arg1 (RDI) <= syscall number (EAX) */ - call __audit_syscall_entry - - /* - * We are going to jump back to the syscall dispatch code. - * Prepare syscall args as required by the 64-bit C ABI. - * Registers clobbered by __audit_syscall_entry() are - * loaded from pt_regs on stack: - */ - movl ORIG_RAX(%rsp), %eax /* syscall number */ - movl %ebx, %edi /* arg1 */ - movl RCX(%rsp), %esi /* arg2 */ - movl RDX(%rsp), %edx /* arg3 */ - movl RSI(%rsp), %ecx /* arg4 */ - movl RDI(%rsp), %r8d /* arg5 */ - .endm - - .macro auditsys_exit exit - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call - movl %eax, %esi /* second arg, syscall return value */ - cmpl $-MAX_ERRNO, %eax /* is it an error ? */ - jbe 1f - movslq %eax, %rsi /* if error sign extend to 64 bits */ -1: setbe %al /* 1 if error, 0 if not */ - movzbl %al, %edi /* zero-extend that into %edi */ - call __audit_syscall_exit - movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %edi - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl %edi, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz \exit - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call_irqs_off - .endm - -sysenter_auditsys: - auditsys_entry_common - movl %ebp, %r9d /* reload 6th syscall arg */ - jmp sysenter_dispatch - -sysexit_audit: - auditsys_exit sysexit_from_sys_call -#endif - sysenter_fix_flags: pushq $X86_EFLAGS_FIXED popfq jmp sysenter_flags_fixed sysenter_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz sysenter_auditsys -#endif SAVE_EXTRA_REGS xorl %eax, %eax /* Do not leak kernel information */ movq %rax, R11(%rsp) @@ -389,7 +310,7 @@ cstar_dispatch: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysretl_audit + jnz ia32_ret_from_sys_call_irqs_off sysretl_from_sys_call: andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) @@ -420,22 +341,7 @@ sysretl_from_sys_call: */ USERGS_SYSRET32 -#ifdef CONFIG_AUDITSYSCALL -cstar_auditsys: - movl %r9d, R9(%rsp) /* register to be clobbered by call */ - auditsys_entry_common - movl R9(%rsp), %r9d /* reload 6th syscall arg */ - jmp cstar_dispatch - -sysretl_audit: - auditsys_exit sysretl_from_sys_call -#endif - cstar_tracesys: -#ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jz cstar_auditsys -#endif xchgl %r9d, %ebp SAVE_EXTRA_REGS xorl %eax, %eax /* Do not leak kernel information */ -- cgit v0.10.2 From 2ec67971faccc21ff18878552ccfe4409088c4c8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:04 -0700 Subject: x86/entry/64/compat: Remove most of the fast system call machinery We now have only one code path that calls through the compat syscall table. This will make it much more pleasant to change the pt_regs vs register calling convention, which we need to do to move the call into C. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/320cda5573cefdc601b955d23fbe8f36c085432d.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 75f5fcf..3216e60 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -101,119 +101,13 @@ sysenter_flags_fixed: movl $11, %edi call do_exit - /* - * Re-enable interrupts. IRQ tracing already thinks that IRQs are - * on (since we treat user mode as having IRQs on), and the - * prologue above is too short for it to be worth adding a - * tracing round trip. - */ - ENABLE_INTERRUPTS(CLBR_NONE) - - /* - * No need to do an access_ok() check here because RBP has been - * 32-bit zero extended: - */ - ASM_STAC -1: movl (%rbp), %ebp - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz sysenter_tracesys - -sysenter_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ -sysenter_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call_irqs_off -sysexit_from_sys_call: - /* - * NB: SYSEXIT is not obviously safe for 64-bit kernels -- an - * NMI between STI and SYSEXIT has poorly specified behavior, - * and and NMI followed by an IRQ with usergs is fatal. So - * we just pretend we're using SYSEXIT but we really use - * SYSRETL instead. - * - * This code path is still called 'sysexit' because it pairs - * with 'sysenter' and it uses the SYSENTER calling convention. - */ - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RIP(%rsp), %ecx /* User %eip */ - movq RAX(%rsp), %rax - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - xorl %edx, %edx /* Do not leak kernel information */ - xorq %r8, %r8 - xorq %r9, %r9 - xorq %r10, %r10 - movl EFLAGS(%rsp), %r11d /* User eflags */ - TRACE_IRQS_ON - - /* - * SYSRETL works even on Intel CPUs. Use it in preference to SYSEXIT, - * since it avoids a dicey window with interrupts enabled. - */ - movl RSP(%rsp), %esp - - /* - * USERGS_SYSRET32 does: - * gsbase = user's gs base - * eip = ecx - * rflags = r11 - * cs = __USER32_CS - * ss = __USER_DS - * - * The prologue set RIP(%rsp) to VDSO32_SYSENTER_RETURN, which does: - * - * pop %ebp - * pop %edx - * pop %ecx - * - * Therefore, we invoke SYSRETL with EDX and R8-R10 zeroed to - * avoid info leaks. R11 ends up with VDSO32_SYSENTER_RETURN's - * address (already known to user code), and R12-R15 are - * callee-saved and therefore don't contain any interesting - * kernel data. - */ - USERGS_SYSRET32 + /* Unreachable. */ + ud2 sysenter_fix_flags: pushq $X86_EFLAGS_FIXED popfq jmp sysenter_flags_fixed - -sysenter_tracesys: - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - jmp sysenter_do_call ENDPROC(entry_SYSENTER_compat) /* @@ -280,142 +174,10 @@ ENTRY(entry_SYSCALL_compat) pushq $-ENOSYS /* pt_regs->ax */ sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ - /* - * No need to do an access_ok check here because r8 has been - * 32-bit zero extended: - */ - ASM_STAC -1: movl (%r8), %r9d - _ASM_EXTABLE(1b, ia32_badarg) - ASM_CLAC - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz cstar_tracesys - -cstar_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - /* r9 already loaded */ /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - -cstar_dispatch: - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_ret_from_sys_call_irqs_off - -sysretl_from_sys_call: - andl $~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl RIP(%rsp), %ecx - movl EFLAGS(%rsp), %r11d - movq RAX(%rsp), %rax - xorq %r10, %r10 - xorq %r9, %r9 - xorq %r8, %r8 - TRACE_IRQS_ON - movl RSP(%rsp), %esp - /* - * 64-bit->32-bit SYSRET restores eip from ecx, - * eflags from r11 (but RF and VM bits are forced to 0), - * cs and ss are loaded from MSRs. - * (Note: 32-bit->32-bit SYSRET is different: since r11 - * does not exist, it merely sets eflags.IF=1). - * - * NB: On AMD CPUs with the X86_BUG_SYSRET_SS_ATTRS bug, the ss - * descriptor is not reinitialized. This means that we must - * avoid SYSRET with SS == NULL, which could happen if we schedule, - * exit the kernel, and re-enter using an interrupt vector. (All - * interrupt entries on x86_64 set SS to NULL.) We prevent that - * from happening by reloading SS in __switch_to. - */ - USERGS_SYSRET32 - -cstar_tracesys: - xchgl %r9d, %ebp - SAVE_EXTRA_REGS - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %r9, R9(%rsp) - movq %rax, R8(%rsp) - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter - movl R9(%rsp), %r9d - - /* Reload arg registers from stack. (see sysenter_tracesys) */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - - RESTORE_EXTRA_REGS - xchgl %ebp, %r9d - jmp cstar_do_call + /* Unreachable. */ + ud2 END(entry_SYSCALL_compat) -ia32_badarg: - /* - * So far, we've entered kernel mode, set AC, turned on IRQs, and - * saved C regs except r8-r11. We haven't done any of the other - * standard entry work, though. We want to bail, but we shouldn't - * treat this as a syscall entry since we don't even know what the - * args are. Instead, treat this as a non-syscall entry, finish - * the entry work, and immediately exit after setting AX = -EFAULT. - * - * We're really just being polite here. Killing the task outright - * would be a reasonable action, too. Given that the only valid - * way to have gotten here is through the vDSO, and we already know - * that the stack pointer is bad, the task isn't going to survive - * for long no matter what we do. - */ - - ASM_CLAC /* undo STAC */ - movq $-EFAULT, RAX(%rsp) /* return -EFAULT if possible */ - - /* Fill in the rest of pt_regs */ - xorl %eax, %eax - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - SAVE_EXTRA_REGS - - /* Turn IRQs back off. */ - DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF - - /* Now finish entering normal kernel mode. */ -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif - - /* And exit again. */ - jmp retint_user - -ia32_ret_from_sys_call_irqs_off: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_NONE) - -ia32_ret_from_sys_call: - xorl %eax, %eax /* Do not leak kernel information */ - movq %rax, R11(%rsp) - movq %rax, R10(%rsp) - movq %rax, R9(%rsp) - movq %rax, R8(%rsp) - jmp int_ret_from_sys_call - /* * Emulated IA32 system calls via int 0x80. * -- cgit v0.10.2 From 8169aff611956ed360e3313e8c718f530f58f6cb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:05 -0700 Subject: x86/entry/64/compat: Set up full pt_regs for all compat syscalls This is conceptually simpler. More importantly, it eliminates the PTREGSCALL and execve stubs, which were not compatible with the C ABI. This means that C code can call through the compat syscall table. The execve stubs are a bit subtle. They did two things: they cleared some registers and they forced slow-path return. Neither is necessary any more: elf_common_init clears the extra registers and start_thread calls force_iret(). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/f95b7f7dfaacf88a8cae85bb06226cae53769287.1444091584.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 7dc2850..83ad958 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -391,20 +391,16 @@ GLOBAL(stub_execveat) jmp return_from_execve END(stub_execveat) -#if defined(CONFIG_X86_X32_ABI) || defined(CONFIG_IA32_EMULATION) +#if defined(CONFIG_X86_X32_ABI) .align 8 GLOBAL(stub_x32_execve) -GLOBAL(stub32_execve) call compat_sys_execve jmp return_from_execve -END(stub32_execve) END(stub_x32_execve) .align 8 GLOBAL(stub_x32_execveat) -GLOBAL(stub32_execveat) call compat_sys_execveat jmp return_from_execve -END(stub32_execveat) END(stub_x32_execveat) #endif diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 3216e60..2c2aac5 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -219,12 +219,18 @@ ENTRY(entry_INT80_compat) pushq %rdx /* pt_regs->dx */ pushq %rcx /* pt_regs->cx */ pushq $-ENOSYS /* pt_regs->ax */ - pushq $0 /* pt_regs->r8 */ - pushq $0 /* pt_regs->r9 */ - pushq $0 /* pt_regs->r10 */ - pushq $0 /* pt_regs->r11 */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r12 /* pt_regs->r12 */ + pushq %r13 /* pt_regs->r13 */ + pushq %r14 /* pt_regs->r14 */ + pushq %r15 /* pt_regs->r15 */ cld - sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) @@ -243,10 +249,10 @@ ia32_do_call: call *ia32_sys_call_table(, %rax, 8) movq %rax, RAX(%rsp) 1: + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call ia32_tracesys: - SAVE_EXTRA_REGS movq %rsp, %rdi /* &pt_regs -> arg1 */ call syscall_trace_enter /* @@ -261,25 +267,11 @@ ia32_tracesys: movl RSI(%rsp), %esi movl RDI(%rsp), %edi movl %eax, %eax /* zero extension */ - RESTORE_EXTRA_REGS jmp ia32_do_call END(entry_INT80_compat) - .macro PTREGSCALL label, func - ALIGN -GLOBAL(\label) - leaq \func(%rip), %rax - jmp ia32_ptregs_common - .endm - - PTREGSCALL stub32_rt_sigreturn, sys32_rt_sigreturn - PTREGSCALL stub32_sigreturn, sys32_sigreturn - PTREGSCALL stub32_fork, sys_fork - PTREGSCALL stub32_vfork, sys_vfork - ALIGN GLOBAL(stub32_clone) - leaq sys_clone(%rip), %rax /* * The 32-bit clone ABI is: clone(..., int tls_val, int *child_tidptr). * The 64-bit clone ABI is: clone(..., int *child_tidptr, int tls_val). @@ -288,12 +280,4 @@ GLOBAL(stub32_clone) * so we need to swap arguments here before calling it: */ xchg %r8, %rcx - jmp ia32_ptregs_common - - ALIGN -ia32_ptregs_common: - SAVE_EXTRA_REGS 8 - call *%rax - RESTORE_EXTRA_REGS 8 - ret -END(ia32_ptregs_common) + jmp sys_clone diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7663c45..caa2c71 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -8,7 +8,7 @@ # 0 i386 restart_syscall sys_restart_syscall 1 i386 exit sys_exit -2 i386 fork sys_fork stub32_fork +2 i386 fork sys_fork sys_fork 3 i386 read sys_read 4 i386 write sys_write 5 i386 open sys_open compat_sys_open @@ -17,7 +17,7 @@ 8 i386 creat sys_creat 9 i386 link sys_link 10 i386 unlink sys_unlink -11 i386 execve sys_execve stub32_execve +11 i386 execve sys_execve compat_sys_execve 12 i386 chdir sys_chdir 13 i386 time sys_time compat_sys_time 14 i386 mknod sys_mknod @@ -125,7 +125,7 @@ 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo 117 i386 ipc sys_ipc compat_sys_ipc 118 i386 fsync sys_fsync -119 i386 sigreturn sys_sigreturn stub32_sigreturn +119 i386 sigreturn sys_sigreturn sys32_sigreturn 120 i386 clone sys_clone stub32_clone 121 i386 setdomainname sys_setdomainname 122 i386 uname sys_newuname @@ -179,7 +179,7 @@ 170 i386 setresgid sys_setresgid16 171 i386 getresgid sys_getresgid16 172 i386 prctl sys_prctl -173 i386 rt_sigreturn sys_rt_sigreturn stub32_rt_sigreturn +173 i386 rt_sigreturn sys_rt_sigreturn sys32_rt_sigreturn 174 i386 rt_sigaction sys_rt_sigaction compat_sys_rt_sigaction 175 i386 rt_sigprocmask sys_rt_sigprocmask 176 i386 rt_sigpending sys_rt_sigpending compat_sys_rt_sigpending @@ -196,7 +196,7 @@ 187 i386 sendfile sys_sendfile compat_sys_sendfile 188 i386 getpmsg 189 i386 putpmsg -190 i386 vfork sys_vfork stub32_vfork +190 i386 vfork sys_vfork sys_vfork 191 i386 ugetrlimit sys_getrlimit compat_sys_getrlimit 192 i386 mmap2 sys_mmap_pgoff 193 i386 truncate64 sys_truncate64 sys32_truncate64 @@ -364,7 +364,7 @@ 355 i386 getrandom sys_getrandom 356 i386 memfd_create sys_memfd_create 357 i386 bpf sys_bpf -358 i386 execveat sys_execveat stub32_execveat +358 i386 execveat sys_execveat compat_sys_execveat 359 i386 socket sys_socket 360 i386 socketpair sys_socketpair 361 i386 bind sys_bind -- cgit v0.10.2 From 034042cc1e2837a584cda0a5e4fc2b0a96b74543 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:06 -0700 Subject: x86/entry/syscalls: Move syscall table declarations into asm/syscalls.h The header was missing some compat declarations. Also make sys_call_ptr_t have a consistent type. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/3166aaff0fb43897998fcb6ef92991533f8c5c6c.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 8ea34f9..429460d 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -4,13 +4,12 @@ #include #include #include +#include #ifdef CONFIG_IA32_EMULATION #define SYM(sym, compat) compat #else #define SYM(sym, compat) sym -#define ia32_sys_call_table sys_call_table -#define __NR_syscall_compat_max __NR_syscall_max #endif #define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; @@ -19,8 +18,6 @@ #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), -typedef asmlinkage void (*sys_call_ptr_t)(void); - extern asmlinkage void sys_ni_syscall(void); __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index d6a756a..f3ff2ef 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,9 +20,19 @@ #include /* for TS_COMPAT */ #include -typedef void (*sys_call_ptr_t)(void); +typedef asmlinkage void (*sys_call_ptr_t)(void); extern const sys_call_ptr_t sys_call_table[]; +#if defined(CONFIG_X86_32) +#define ia32_sys_call_table sys_call_table +#define __NR_syscall_compat_max __NR_syscall_max +#define IA32_NR_syscalls NR_syscalls +#endif + +#if defined(CONFIG_IA32_EMULATION) +extern const sys_call_ptr_t ia32_sys_call_table[]; +#endif + /* * Only the low 32 bits of orig_ax are meaningful, so we return int. * This importantly ignores the high bits on 64-bit, so comparisons diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index bd16d6c..d738e9c 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -7,6 +7,7 @@ #include #include #include +#include #define __NO_STUBS @@ -30,8 +31,6 @@ #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, -typedef asmlinkage void (*sys_call_ptr_t)(void); - extern asmlinkage void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index a75d8700..1ff9a21 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -7,6 +7,7 @@ #include #include #include +#include #define __NO_STUBS @@ -43,8 +44,6 @@ #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, -typedef void (*sys_call_ptr_t)(void); - extern void sys_ni_syscall(void); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { -- cgit v0.10.2 From eb974c62565072e10c1422eb3205f5b611dd99a1 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:07 -0700 Subject: x86/syscalls: Give sys_call_ptr_t a useful type Syscalls are asmlinkage functions (on 32-bit kernels), take six args of type unsigned long, and return long. Note that uml could probably be slightly cleaned up on top of this patch. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/4d3ecc4a169388d47009175408b2961961744e6f.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscall_32.c b/arch/x86/entry/syscall_32.c index 429460d..9a66498 100644 --- a/arch/x86/entry/syscall_32.c +++ b/arch/x86/entry/syscall_32.c @@ -12,13 +12,13 @@ #define SYM(sym, compat) sym #endif -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long SYM(sym, compat)(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), -extern asmlinkage void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); __visible const sys_call_ptr_t ia32_sys_call_table[__NR_syscall_compat_max+1] = { /* diff --git a/arch/x86/entry/syscall_64.c b/arch/x86/entry/syscall_64.c index 4ac730b..41283d2 100644 --- a/arch/x86/entry/syscall_64.c +++ b/arch/x86/entry/syscall_64.c @@ -14,13 +14,13 @@ # define __SYSCALL_X32(nr, sym, compat) /* nothing */ #endif -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [nr] = sym, -extern void sys_ni_syscall(void); +extern long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); asmlinkage const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { /* diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h index f3ff2ef..999b7cd 100644 --- a/arch/x86/include/asm/syscall.h +++ b/arch/x86/include/asm/syscall.h @@ -20,7 +20,9 @@ #include /* for TS_COMPAT */ #include -typedef asmlinkage void (*sys_call_ptr_t)(void); +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); extern const sys_call_ptr_t sys_call_table[]; #if defined(CONFIG_X86_32) diff --git a/arch/x86/um/sys_call_table_32.c b/arch/x86/um/sys_call_table_32.c index d738e9c..439c099 100644 --- a/arch/x86/um/sys_call_table_32.c +++ b/arch/x86/um/sys_call_table_32.c @@ -25,13 +25,13 @@ #define old_mmap sys_old_mmap -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_I386 #define __SYSCALL_I386(nr, sym, compat) [ nr ] = sym, -extern asmlinkage void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 1ff9a21..b74ea6c 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -38,13 +38,13 @@ #define __SYSCALL_COMMON(nr, sym, compat) __SYSCALL_64(nr, sym, compat) #define __SYSCALL_X32(nr, sym, compat) /* Not supported */ -#define __SYSCALL_64(nr, sym, compat) extern asmlinkage void sym(void) ; +#define __SYSCALL_64(nr, sym, compat) extern asmlinkage long sym(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long) ; #include #undef __SYSCALL_64 #define __SYSCALL_64(nr, sym, compat) [ nr ] = sym, -extern void sys_ni_syscall(void); +extern asmlinkage long sys_ni_syscall(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); const sys_call_ptr_t sys_call_table[] ____cacheline_aligned = { /* -- cgit v0.10.2 From bd2d3a3ba67ac580f6e809aac36bf942f5447f91 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:08 -0700 Subject: x86/entry: Add do_syscall_32(), a C function to do 32-bit syscalls System calls are really quite simple. Add a helper to call a 32-bit system call. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a77ed179834c27da436fb4a7fb23c8ee77abc11c.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d94a60c..41d1750 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -318,3 +318,46 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) local_irq_disable(); prepare_exit_to_usermode(regs); } + +#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) +/* + * Does a 32-bit syscall. Called with IRQs off and does all entry and + * exit work. + */ +__visible void do_int80_syscall_32(struct pt_regs *regs) +{ + struct thread_info *ti = pt_regs_to_thread_info(regs); + unsigned int nr = (unsigned int)regs->orig_ax; + +#ifdef CONFIG_IA32_EMULATION + ti->status |= TS_COMPAT; +#endif + + local_irq_enable(); + + if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { + /* + * Subtlety here: if ptrace pokes something larger than + * 2^32-1 into orig_ax, this truncates it. This may or + * may not be necessary, but it matches the old asm + * behavior. + */ + nr = syscall_trace_enter(regs); + } + + if (nr < IA32_NR_syscalls) { + /* + * It's possible that a 32-bit syscall implementation + * takes a 64-bit parameter but nonetheless assumes that + * the high bits are zero. Make sure we zero-extend all + * of the args. + */ + regs->ax = ia32_sys_call_table[nr]( + (unsigned int)regs->bx, (unsigned int)regs->cx, + (unsigned int)regs->dx, (unsigned int)regs->si, + (unsigned int)regs->di, (unsigned int)regs->bp); + } + + syscall_return_slowpath(regs); +} +#endif -- cgit v0.10.2 From ee08c6bd315e70756ad2c47ee6ea708a4a882b55 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:09 -0700 Subject: x86/entry/64/compat: Migrate the body of the syscall entry to C Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a2f0fce68feeba798a24339b5a7ec1ec2dd9eaf7.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 83ad958..53616ca 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -582,7 +582,7 @@ retint_kernel: * At this label, code paths which return to kernel and to user, * which come from interrupts/exception and from syscalls, merge. */ -restore_regs_and_iret: +GLOBAL(restore_regs_and_iret) RESTORE_EXTRA_REGS restore_c_regs_and_iret: RESTORE_C_REGS diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 2c2aac5..63ef9fa 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -202,14 +202,17 @@ END(entry_SYSCALL_compat) ENTRY(entry_INT80_compat) /* * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. */ PARAVIRT_ADJUST_EXCEPTION_FRAME SWAPGS - ENABLE_INTERRUPTS(CLBR_NONE) - /* Zero-extending 32-bit regs, do not remove */ + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ movl %eax, %eax /* Construct struct pt_regs on stack (iret frame is already on stack) */ @@ -232,42 +235,19 @@ ENTRY(entry_INT80_compat) pushq %r15 /* pt_regs->r15 */ cld - orl $TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS) - testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) - jnz ia32_tracesys - -ia32_do_call: - /* 32-bit syscall -> 64-bit C ABI argument conversion */ - movl %edi, %r8d /* arg5 */ - movl %ebp, %r9d /* arg6 */ - xchg %ecx, %esi /* rsi:arg2, rcx:arg4 */ - movl %ebx, %edi /* arg1 */ - movl %edx, %edx /* arg3 (zero extension) */ - cmpq $(IA32_NR_syscalls-1), %rax - ja 1f - - call *ia32_sys_call_table(, %rax, 8) - movq %rax, RAX(%rsp) -1: - RESTORE_EXTRA_REGS - jmp int_ret_from_sys_call - -ia32_tracesys: - movq %rsp, %rdi /* &pt_regs -> arg1 */ - call syscall_trace_enter /* - * Reload arg registers from stack in case ptrace changed them. - * Don't reload %eax because syscall_trace_enter() returned - * the %rax value we should see. But do truncate it to 32 bits. - * If it's -1 to make us punt the syscall, then (u32)-1 is still - * an appropriately invalid value. + * User mode is traced as though IRQs are on, and the interrupt + * gate turned them off. */ - movl RCX(%rsp), %ecx - movl RDX(%rsp), %edx - movl RSI(%rsp), %esi - movl RDI(%rsp), %edi - movl %eax, %eax /* zero extension */ - jmp ia32_do_call + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_int80_syscall_32 + + /* Go back to user mode. */ + TRACE_IRQS_ON + SWAPGS + jmp restore_regs_and_iret END(entry_INT80_compat) ALIGN -- cgit v0.10.2 From 710246df58041106b7de645f4b45770f8a59a269 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:10 -0700 Subject: x86/entry: Add C code for fast system call entries This handles both SYSENTER and SYSCALL. The asm glue will take care of the differences. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/6041a58a9b8ef6d2522ab4350deb1a1945eb563f.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 41d1750..1b2606e 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -24,6 +24,8 @@ #include #include +#include +#include #define CREATE_TRACE_POINTS #include @@ -360,4 +362,45 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) syscall_return_slowpath(regs); } + +__visible void do_fast_syscall_32(struct pt_regs *regs) +{ + /* + * Called using the internal vDSO SYSENTER/SYSCALL32 calling + * convention. Adjust regs so it looks like we entered using int80. + */ + + unsigned long landing_pad = (unsigned long)current->mm->context.vdso + + vdso_image_32.sym_int80_landing_pad; + + /* + * SYSENTER loses EIP, and even SYSCALL32 needs us to skip forward + * so that 'regs->ip -= 2' lands back on an int $0x80 instruction. + * Fix it up. + */ + regs->ip = landing_pad; + + /* + * Fetch ECX from where the vDSO stashed it. + * + * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! + */ + local_irq_enable(); + if (get_user(*(u32 *)®s->cx, + (u32 __user __force *)(unsigned long)(u32)regs->sp)) { + /* User code screwed up. */ + local_irq_disable(); + regs->ax = -EFAULT; +#ifdef CONFIG_CONTEXT_TRACKING + enter_from_user_mode(); +#endif + prepare_exit_to_usermode(regs); + return; + } + local_irq_disable(); + + /* Now this is just like a normal syscall. */ + do_int80_syscall_32(regs); + return; +} #endif -- cgit v0.10.2 From a474e67c913d3ebaf02ba9d7835d5299d226c3ed Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:11 -0700 Subject: x86/vdso/compat: Wire up SYSENTER and SYSCSALL for compat userspace What, you didn't realize that SYSENTER and SYSCALL were actually the same thing? :) Unlike the old code, this actually passes the ptrace_syscall_32 test on AMD systems. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/b74615af58d785aa02d917213ec64e2022a2c796.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 63ef9fa..8f109de5 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -52,15 +52,18 @@ ENTRY(entry_SYSENTER_compat) SWAPGS_UNSAFE_STACK movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %ebp, %ebp + /* + * User tracing code (ptrace or signal handlers) might assume that + * the saved RAX contains a 32-bit number when we're invoking a 32-bit + * syscall. Just in case the high bits are nonzero, zero-extend + * the syscall number. (This could almost certainly be deleted + * with no ill effects.) + */ movl %eax, %eax - movl ASM_THREAD_INFO(TI_sysenter_return, %rsp, 0), %r10d - /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ - pushq %rbp /* pt_regs->sp */ + pushq %rcx /* pt_regs->sp */ /* * Push flags. This is nasty. First, interrupts are currently @@ -70,17 +73,28 @@ ENTRY(entry_SYSENTER_compat) */ pushfq /* pt_regs->flags (except IF = 0) */ orl $X86_EFLAGS_IF, (%rsp) /* Fix saved flags */ + ASM_CLAC /* Clear AC after saving FLAGS */ pushq $__USER32_CS /* pt_regs->cs */ - pushq %r10 /* pt_regs->ip = thread_info->sysenter_return */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->ip = 0 (placeholder) */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rcx /* pt_regs->cx */ + pushq %rcx /* pt_regs->cx (will be overwritten) */ pushq $-ENOSYS /* pt_regs->ax */ + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r8 /* pt_regs->r12 = 0 */ + pushq %r8 /* pt_regs->r13 = 0 */ + pushq %r8 /* pt_regs->r14 = 0 */ + pushq %r8 /* pt_regs->r15 = 0 */ cld - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ /* * Sysenter doesn't filter flags, so we need to clear NT @@ -93,16 +107,15 @@ ENTRY(entry_SYSENTER_compat) jnz sysenter_fix_flags sysenter_flags_fixed: - /* Temporary: SYSENTER is disabled. */ -#ifdef CONFIG_CONTEXT_TRACKING - call enter_from_user_mode -#endif - ENABLE_INTERRUPTS(CLBR_NONE) - movl $11, %edi - call do_exit + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF - /* Unreachable. */ - ud2 + movq %rsp, %rdi + call do_fast_syscall_32 + jmp .Lsyscall_32_done sysenter_fix_flags: pushq $X86_EFLAGS_FIXED @@ -135,26 +148,14 @@ ENDPROC(entry_SYSENTER_compat) * edi arg5 * esp user stack * 0(%esp) arg6 - * - * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. We set up a complete hardware stack frame to share code - * with the int 0x80 path. */ ENTRY(entry_SYSCALL_compat) - /* - * Interrupts are off on entry. - * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, - * it is too small to ever cause noticeable irq latency. - */ + /* Interrupts are off on entry. */ SWAPGS_UNSAFE_STACK - /* Temporary: SYSCALL32 is disabled. */ - movl $-ENOSYS, %eax - USERGS_SYSRET32 - + /* Stash user ESP and switch to the kernel stack. */ movl %esp, %r8d movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - ENABLE_INTERRUPTS(CLBR_NONE) /* Zero-extending 32-bit regs, do not remove */ movl %eax, %eax @@ -169,13 +170,29 @@ ENTRY(entry_SYSCALL_compat) pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ - pushq %rbp /* pt_regs->cx */ - movl %ebp, %ecx + pushq %rcx /* pt_regs->cx (will be overwritten) */ pushq $-ENOSYS /* pt_regs->ax */ - sub $(10*8), %rsp /* pt_regs->r8-11, bp, bx, r12-15 not saved */ + xorq %r8,%r8 + pushq %r8 /* pt_regs->r8 = 0 */ + pushq %r8 /* pt_regs->r9 = 0 */ + pushq %r8 /* pt_regs->r10 = 0 */ + pushq %r8 /* pt_regs->r11 = 0 */ + pushq %rbx /* pt_regs->rbx */ + pushq %rbp /* pt_regs->rbp */ + pushq %r8 /* pt_regs->r12 = 0 */ + pushq %r8 /* pt_regs->r13 = 0 */ + pushq %r8 /* pt_regs->r14 = 0 */ + pushq %r8 /* pt_regs->r15 = 0 */ - /* Unreachable. */ - ud2 + /* + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. + */ + TRACE_IRQS_OFF + + movq %rsp, %rdi + call do_fast_syscall_32 + jmp .Lsyscall_32_done END(entry_SYSCALL_compat) /* @@ -243,6 +260,7 @@ ENTRY(entry_INT80_compat) movq %rsp, %rdi call do_int80_syscall_32 +.Lsyscall_32_done: /* Go back to user mode. */ TRACE_IRQS_ON diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index d591fe9..00157ca 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -3,6 +3,8 @@ */ #include +#include +#include /* * First get the common code for the sigreturn entry points. @@ -28,6 +30,12 @@ __kernel_vsyscall: CFI_REL_OFFSET ecx, 0 movl %esp, %ecx +#ifdef CONFIG_X86_64 + /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ + ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \ + "syscall", X86_FEATURE_SYSCALL32 +#endif + /* Enter using int $0x80 */ movl (%esp), %ecx int $0x80 -- cgit v0.10.2 From 7841b408717d4c3b1b334c8f1fef7f18c98cd2bd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:12 -0700 Subject: x86/entry/compat: Implement opportunistic SYSRETL for compat syscalls If CS, SS and IP are as expected and FLAGS is compatible with SYSRETL, then return from fast compat syscalls (both SYSCALL and SYSENTER) using SYSRETL. Unlike native 64-bit opportunistic SYSRET, this is not invisible to user code: RCX and R8-R15 end up in a different state than shown saved in pt_regs. To compensate, we only do this when returning to the vDSO fast syscall return path. This won't interfere with syscall restart, as we won't use SYSRETL when returning to the INT80 restart instruction. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/aa15e49db33773eb10b73d73466b6d5466d7856a.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 1b2606e..88dc5ba 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -363,7 +363,8 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) syscall_return_slowpath(regs); } -__visible void do_fast_syscall_32(struct pt_regs *regs) +/* Returns 0 to return using IRET or 1 to return using SYSRETL. */ +__visible long do_fast_syscall_32(struct pt_regs *regs) { /* * Called using the internal vDSO SYSENTER/SYSCALL32 calling @@ -395,12 +396,28 @@ __visible void do_fast_syscall_32(struct pt_regs *regs) enter_from_user_mode(); #endif prepare_exit_to_usermode(regs); - return; + return 0; /* Keep it simple: use IRET. */ } local_irq_disable(); /* Now this is just like a normal syscall. */ do_int80_syscall_32(regs); - return; + +#ifdef CONFIG_X86_64 + /* + * Opportunistic SYSRETL: if possible, try to return using SYSRETL. + * SYSRETL is available on all 64-bit CPUs, so we don't need to + * bother with SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + */ + return regs->cs == __USER32_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; +#else + return 0; +#endif } #endif diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 8f109de5..cf9641c 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -115,7 +115,9 @@ sysenter_flags_fixed: movq %rsp, %rdi call do_fast_syscall_32 - jmp .Lsyscall_32_done + testl %eax, %eax + jz .Lsyscall_32_done + jmp sysret32_from_system_call sysenter_fix_flags: pushq $X86_EFLAGS_FIXED @@ -192,7 +194,43 @@ ENTRY(entry_SYSCALL_compat) movq %rsp, %rdi call do_fast_syscall_32 - jmp .Lsyscall_32_done + testl %eax, %eax + jz .Lsyscall_32_done + + /* Opportunistic SYSRET */ +sysret32_from_system_call: + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movq RBX(%rsp), %rbx /* pt_regs->rbx */ + movq RBP(%rsp), %rbp /* pt_regs->rbp */ + movq EFLAGS(%rsp), %r11 /* pt_regs->flags (in r11) */ + movq RIP(%rsp), %rcx /* pt_regs->ip (in rcx) */ + addq $RAX, %rsp /* Skip r8-r15 */ + popq %rax /* pt_regs->rax */ + popq %rdx /* Skip pt_regs->cx */ + popq %rdx /* pt_regs->dx */ + popq %rsi /* pt_regs->si */ + popq %rdi /* pt_regs->di */ + + /* + * USERGS_SYSRET32 does: + * GSBASE = user's GS base + * EIP = ECX + * RFLAGS = R11 + * CS = __USER32_CS + * SS = __USER_DS + * + * ECX will not match pt_regs->cx, but we're returning to a vDSO + * trampoline that will fix up RCX, so this is okay. + * + * R12-R15 are callee-saved, so they contain whatever was in them + * when the system call started, which is already known to user + * code. We zero R8-R10 to avoid info leaks. + */ + xorq %r8, %r8 + xorq %r9, %r9 + xorq %r10, %r10 + movq RSP-ORIG_RAX(%rsp), %rsp + USERGS_SYSRET32 END(entry_SYSCALL_compat) /* -- cgit v0.10.2 From 39e8701f33d65c7f51d749a5d12a1379065e0926 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:13 -0700 Subject: x86/entry/32: Open-code return tracking from fork and kthreads syscall_exit is going away, and return tracing is just a function call now, so open-code the two non-syscall 32-bit users. While we're at it, update the big register layout comment. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a6b3c472fda7cda0e368c3ccd553dea7447dfdd2.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index a08ded4..36b6beb 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -3,7 +3,7 @@ * * entry_32.S contains the system-call and low-level fault and trap handling routines. * - * Stack layout in 'syscall_exit': + * Stack layout while running C code: * ptrace needs to have all registers on the stack. * If the order here is changed, it needs to be * updated in fork.c:copy_process(), signal.c:do_signal(), @@ -211,7 +211,11 @@ ENTRY(ret_from_fork) popl %eax pushl $0x0202 # Reset kernel eflags popfl - jmp syscall_exit + + /* When we fork, we trace the syscall return in the child, too. */ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all END(ret_from_fork) ENTRY(ret_from_kernel_thread) @@ -224,7 +228,15 @@ ENTRY(ret_from_kernel_thread) movl PT_EBP(%esp), %eax call *PT_EBX(%esp) movl $0, PT_EAX(%esp) - jmp syscall_exit + + /* + * Kernel threads return to userspace as if returning from a syscall. + * We should check whether anything actually uses this path and, if so, + * consider switching it over to ret_from_fork. + */ + movl %esp, %eax + call syscall_return_slowpath + jmp restore_all ENDPROC(ret_from_kernel_thread) /* -- cgit v0.10.2 From 150ac78d63afb96360dab448b7b4d33c98c8266c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:14 -0700 Subject: x86/entry/32: Switch INT80 to the new C syscall path Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a7e8d8df96838eae3208dd0441023f3ce7a81831.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 36b6beb..02881e5 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -153,13 +153,13 @@ #endif /* CONFIG_X86_32_LAZY_GS */ -.macro SAVE_ALL +.macro SAVE_ALL pt_regs_ax=%eax cld PUSH_GS pushl %fs pushl %es pushl %ds - pushl %eax + pushl \pt_regs_ax pushl %ebp pushl %edi pushl %esi @@ -370,20 +370,17 @@ ENDPROC(entry_SYSENTER_32) # system call handler stub ENTRY(entry_INT80_32) ASM_CLAC - pushl %eax # save orig_eax - SAVE_ALL - GET_THREAD_INFO(%ebp) - # system call tracing in operation / emulation - testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) - jnz syscall_trace_entry - cmpl $(NR_syscalls), %eax - jae syscall_badsys -syscall_call: - call *sys_call_table(, %eax, 4) -syscall_after_call: - movl %eax, PT_EAX(%esp) # store the return value -syscall_exit: - jmp syscall_exit_work + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, load -ENOSYS into ax */ + + /* + * User mode is traced as though IRQs are on, and the interrupt gate + * turned them off. + */ + TRACE_IRQS_OFF + + movl %esp, %eax + call do_int80_syscall_32 restore_all: TRACE_IRQS_IRET @@ -491,11 +488,6 @@ syscall_fault: jmp resume_userspace END(syscall_fault) -syscall_badsys: - movl $-ENOSYS, %eax - jmp syscall_after_call -END(syscall_badsys) - sysenter_badsys: movl $-ENOSYS, %eax jmp sysenter_after_call -- cgit v0.10.2 From 5f310f739b4cc343f3f087681e41bbc2f0ce902d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:15 -0700 Subject: x86/entry/32: Re-implement SYSENTER using the new C path Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/5b99659e8be70f3dd10cd8970a5c90293d9ad9a7.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 88dc5ba..0ed023d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -363,7 +363,7 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) syscall_return_slowpath(regs); } -/* Returns 0 to return using IRET or 1 to return using SYSRETL. */ +/* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) { /* @@ -417,7 +417,20 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) regs->ip == landing_pad && (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0; #else - return 0; + /* + * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT. + * + * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP, + * because the ECX fixup above will ensure that this is essentially + * never the case. + * + * We don't allow syscalls at all from VM86 mode, but we still + * need to check VM, because we might be returning from sys_vm86. + */ + return static_cpu_has(X86_FEATURE_SEP) && + regs->cs == __USER_CS && regs->ss == __USER_DS && + regs->ip == landing_pad && + (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0; #endif } #endif diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 02881e5..c1c7c63 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -287,76 +287,47 @@ need_resched: END(resume_kernel) #endif -/* - * SYSENTER_RETURN points to after the SYSENTER instruction - * in the vsyscall page. See vsyscall-sysentry.S, which defines - * the symbol. - */ - # SYSENTER call handler stub ENTRY(entry_SYSENTER_32) movl TSS_sysenter_sp0(%esp), %esp sysenter_past_esp: + pushl $__USER_DS /* pt_regs->ss */ + pushl %ecx /* pt_regs->cx */ + pushfl /* pt_regs->flags (except IF = 0) */ + orl $X86_EFLAGS_IF, (%esp) /* Fix IF */ + pushl $__USER_CS /* pt_regs->cs */ + pushl $0 /* pt_regs->ip = 0 (placeholder) */ + pushl %eax /* pt_regs->orig_ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ + /* - * Interrupts are disabled here, but we can't trace it until - * enough kernel state to call TRACE_IRQS_OFF can be called - but - * we immediately enable interrupts at that point anyway. - */ - pushl $__USER_DS - pushl %ebp - pushfl - orl $X86_EFLAGS_IF, (%esp) - pushl $__USER_CS - /* - * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary: TI_sysenter_return - * is relative to thread_info, which is at the bottom of the - * kernel stack page. 4*4 means the 4 words pushed above; - * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; - * and THREAD_SIZE takes us to the bottom. + * User mode is traced as though IRQs are on, and SYSENTER + * turned them off. */ - pushl ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) - - pushl %eax - SAVE_ALL - ENABLE_INTERRUPTS(CLBR_NONE) - -/* - * Load the potential sixth argument from user stack. - * Careful about security. - */ - cmpl $__PAGE_OFFSET-3, %ebp - jae syscall_fault - ASM_STAC -1: movl (%ebp), %ebp - ASM_CLAC - movl %ebp, PT_EBP(%esp) - _ASM_EXTABLE(1b, syscall_fault) - - GET_THREAD_INFO(%ebp) - - testl $_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp) - jnz syscall_trace_entry -sysenter_do_call: - cmpl $(NR_syscalls), %eax - jae sysenter_badsys - call *sys_call_table(, %eax, 4) -sysenter_after_call: - movl %eax, PT_EAX(%esp) - LOCKDEP_SYS_EXIT - DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - movl TI_flags(%ebp), %ecx - testl $_TIF_ALLWORK_MASK, %ecx - jnz syscall_exit_work_irqs_off -sysenter_exit: -/* if something modifies registers it must also disable sysexit */ - movl PT_EIP(%esp), %edx - movl PT_OLDESP(%esp), %ecx - xorl %ebp, %ebp - TRACE_IRQS_ON + + movl %esp, %eax + call do_fast_syscall_32 + testl %eax, %eax + jz .Lsyscall_32_done + +/* Opportunistic SYSEXIT */ + TRACE_IRQS_ON /* User mode traces as IRQs on. */ + movl PT_EIP(%esp), %edx /* pt_regs->ip */ + movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ + popl %ebx /* pt_regs->bx */ + addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ + popl %esi /* pt_regs->si */ + popl %edi /* pt_regs->di */ + popl %ebp /* pt_regs->bp */ + popl %eax /* pt_regs->ax */ 1: mov PT_FS(%esp), %fs PTGS_TO_GS + + /* + * Return back to the vDSO, which will pop ecx and edx. + * Don't bother with DS and ES (they already contain __USER_DS). + */ ENABLE_INTERRUPTS_SYSEXIT .pushsection .fixup, "ax" @@ -371,7 +342,7 @@ ENDPROC(entry_SYSENTER_32) ENTRY(entry_INT80_32) ASM_CLAC pushl %eax /* pt_regs->orig_ax */ - SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest, load -ENOSYS into ax */ + SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* * User mode is traced as though IRQs are on, and the interrupt gate @@ -381,6 +352,7 @@ ENTRY(entry_INT80_32) movl %esp, %eax call do_int80_syscall_32 +.Lsyscall_32_done: restore_all: TRACE_IRQS_IRET @@ -457,42 +429,6 @@ ldt_ss: #endif ENDPROC(entry_INT80_32) - # perform syscall exit tracing - ALIGN -syscall_trace_entry: - movl $-ENOSYS, PT_EAX(%esp) - movl %esp, %eax - call syscall_trace_enter - /* What it returned is what we'll actually use. */ - cmpl $(NR_syscalls), %eax - jnae syscall_call - jmp syscall_exit -END(syscall_trace_entry) - - # perform syscall exit tracing - ALIGN -syscall_exit_work_irqs_off: - TRACE_IRQS_ON - ENABLE_INTERRUPTS(CLBR_ANY) - -syscall_exit_work: - movl %esp, %eax - call syscall_return_slowpath - jmp restore_all -END(syscall_exit_work) - -syscall_fault: - ASM_CLAC - GET_THREAD_INFO(%ebp) - movl $-EFAULT, PT_EAX(%esp) - jmp resume_userspace -END(syscall_fault) - -sysenter_badsys: - movl $-ENOSYS, %eax - jmp sysenter_after_call -END(sysenter_badsys) - .macro FIXUP_ESPFIX_STACK /* * Switch back for ESPFIX stack to the normal zerobased stack diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 00157ca..93bd845 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -34,6 +34,8 @@ __kernel_vsyscall: /* If SYSENTER (Intel) or SYSCALL32 (AMD) is available, use it. */ ALTERNATIVE_2 "", "sysenter", X86_FEATURE_SYSENTER32, \ "syscall", X86_FEATURE_SYSCALL32 +#else + ALTERNATIVE "", "sysenter", X86_FEATURE_SEP #endif /* Enter using int $0x80 */ -- cgit v0.10.2 From 487e3bf4f77699160aa81a414200060a78a67c3d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:16 -0700 Subject: x86/asm: Remove thread_info.sysenter_return It's no longer needed. We could reinstate something like it as an optimization, which would remove two cachelines from the fast syscall entry working set. I benchmarked it, and it makes no difference whatsoever to the performance of cache-hot compat syscalls on Sandy Bridge. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/f08cc0cff30201afe9bb565c47134c0a6c1a96a2.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 8afdc3e..a1ecd21 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -59,7 +59,6 @@ struct thread_info { __u32 cpu; /* current CPU */ int saved_preempt_count; mm_segment_t addr_limit; - void __user *sysenter_return; unsigned int sig_on_uaccess_error:1; unsigned int uaccess_err:1; /* uaccess failed */ }; diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 8e3d22a1..95a18e2 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -54,9 +54,6 @@ void common(void) { OFFSET(IA32_SIGCONTEXT_ip, sigcontext_ia32, ip); BLANK(); - OFFSET(TI_sysenter_return, thread_info, sysenter_return); - - BLANK(); OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext); #endif -- cgit v0.10.2 From 8b13c2552ffc8e54e57598df36707183933e8e8c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:17 -0700 Subject: x86/entry: Remove unnecessary IRQ twiddling in fast 32-bit syscalls This is slightly messy, but it eliminates an unnecessary cli;sti pair. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/22f34b1096694a37326f36c53407b8dd90f37948.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 0ed023d..0d1c842 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -323,10 +323,10 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* - * Does a 32-bit syscall. Called with IRQs off and does all entry and - * exit work. + * Does a 32-bit syscall. Called with IRQs on and does all entry and + * exit work and returns with IRQs off. */ -__visible void do_int80_syscall_32(struct pt_regs *regs) +static void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -335,8 +335,6 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) ti->status |= TS_COMPAT; #endif - local_irq_enable(); - if (READ_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY) { /* * Subtlety here: if ptrace pokes something larger than @@ -363,6 +361,13 @@ __visible void do_int80_syscall_32(struct pt_regs *regs) syscall_return_slowpath(regs); } +/* Handles int $0x80 */ +__visible void do_int80_syscall_32(struct pt_regs *regs) +{ + local_irq_enable(); + do_syscall_32_irqs_on(regs); +} + /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) { @@ -398,10 +403,9 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) prepare_exit_to_usermode(regs); return 0; /* Keep it simple: use IRET. */ } - local_irq_disable(); /* Now this is just like a normal syscall. */ - do_int80_syscall_32(regs); + do_syscall_32_irqs_on(regs); #ifdef CONFIG_X86_64 /* -- cgit v0.10.2 From 460d12453e1afe20416ce9536cfecb31d17a9abd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:18 -0700 Subject: x86/entry: Make irqs_disabled checks in exit code depend on lockdep These checks are quite slow. Disable them in non-lockdep kernels to reduce the performance hit. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/eccff2a154ae6fb50f40228901003a6e9c24f3d0.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 0d1c842..03aacd1 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -219,7 +219,7 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) /* Called with IRQs disabled. */ __visible void prepare_exit_to_usermode(struct pt_regs *regs) { - if (WARN_ON(!irqs_disabled())) + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); lockdep_sys_exit(); @@ -281,8 +281,8 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) CT_WARN_ON(ct_state() != CONTEXT_KERNEL); - if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled", - regs->orig_ax)) + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && + WARN(irqs_disabled(), "syscall %ld left IRQs disabled", regs->orig_ax)) local_irq_enable(); /* -- cgit v0.10.2 From 33c52129f45e06d9ce23e1a3d50bf9fd6770748b Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:19 -0700 Subject: x86/entry: Force inlining of 32-bit syscall code On systems that support fast syscalls, we only really care about the performance of the fast syscall path. Forcibly inline it and add a likely annotation. This saves 4-6 cycles. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/8472036ff1f4b426b4c4c3e3d0b3bf5264407c0c.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 03aacd1..d5eee85 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -324,9 +324,11 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) /* * Does a 32-bit syscall. Called with IRQs on and does all entry and - * exit work and returns with IRQs off. + * exit work and returns with IRQs off. This function is extremely hot + * in workloads that use it, and it's usually called from + * do_fast_syscall_32, so forcibly inline it to improve performance. */ -static void do_syscall_32_irqs_on(struct pt_regs *regs) +static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -345,7 +347,7 @@ static void do_syscall_32_irqs_on(struct pt_regs *regs) nr = syscall_trace_enter(regs); } - if (nr < IA32_NR_syscalls) { + if (likely(nr < IA32_NR_syscalls)) { /* * It's possible that a 32-bit syscall implementation * takes a 64-bit parameter but nonetheless assumes that -- cgit v0.10.2 From c68ca6787bdd6d2df37cf950135aa11e71af358a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:20 -0700 Subject: x86/entry: Micro-optimize compat fast syscall arg fetch We're following a 32-bit pointer, and the uaccess code isn't smart enough to figure out that the access_ok() check isn't needed. This saves about three cycles on a cache-hot fast syscall. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/bdff034e2f23c5eb974c760cf494cb5bddce8f29.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d5eee85..08a945d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -394,8 +394,20 @@ __visible long do_fast_syscall_32(struct pt_regs *regs) * WARNING: We are in CONTEXT_USER and RCU isn't paying attention! */ local_irq_enable(); - if (get_user(*(u32 *)®s->cx, - (u32 __user __force *)(unsigned long)(u32)regs->sp)) { + if ( +#ifdef CONFIG_X86_64 + /* + * Micro-optimization: the pointer we're following is explicitly + * 32 bits, so it can't be out of range. + */ + __get_user(*(u32 *)®s->cx, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#else + get_user(*(u32 *)®s->cx, + (u32 __user __force *)(unsigned long)(u32)regs->sp) +#endif + ) { + /* User code screwed up. */ local_irq_disable(); regs->ax = -EFAULT; -- cgit v0.10.2 From 4aabd140f9cbe0361401a1368bac74df1010abf5 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:21 -0700 Subject: x86/entry: Hide two syscall entry assertions behind CONFIG_DEBUG_ENTRY This shaves a few cycles off the slow paths. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/ce383fa9e129286ce6da6e00b53acd4c9fb5d06a.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 08a945d..778ca70 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -71,7 +71,8 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) unsigned long ret = 0; u32 work; - BUG_ON(regs != task_pt_regs(current)); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); work = ACCESS_ONCE(current_thread_info()->flags) & _TIF_WORK_SYSCALL_ENTRY; @@ -160,7 +161,8 @@ long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, u32 work = ACCESS_ONCE(current_thread_info()->flags) & _TIF_WORK_SYSCALL_ENTRY; - BUG_ON(regs != task_pt_regs(current)); + if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) + BUG_ON(regs != task_pt_regs(current)); /* * If we stepped into a sysenter/syscall insn, it trapped in -- cgit v0.10.2 From dd636071c3d8044c802b7a365e9934724a929530 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:22 -0700 Subject: x86/entry: Use pt_regs_to_thread_info() in syscall entry tracing It generates simpler and faster code than current_thread_info(). Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/a3b6633e7dcb9f673c1b619afae602d29d27d2cf.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 778ca70..d087421 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -30,6 +30,13 @@ #define CREATE_TRACE_POINTS #include +static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) +{ + unsigned long top_of_stack = + (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; + return (struct thread_info *)(top_of_stack - THREAD_SIZE); +} + #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. */ __visible void enter_from_user_mode(void) @@ -68,14 +75,14 @@ static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch) */ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) { + struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned long ret = 0; u32 work; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); - work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; + work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; #ifdef CONFIG_CONTEXT_TRACKING /* @@ -157,9 +164,9 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch) long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch, unsigned long phase1_result) { + struct thread_info *ti = pt_regs_to_thread_info(regs); long ret = 0; - u32 work = ACCESS_ONCE(current_thread_info()->flags) & - _TIF_WORK_SYSCALL_ENTRY; + u32 work = ACCESS_ONCE(ti->flags) & _TIF_WORK_SYSCALL_ENTRY; if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) BUG_ON(regs != task_pt_regs(current)); @@ -211,13 +218,6 @@ long syscall_trace_enter(struct pt_regs *regs) return syscall_trace_enter_phase2(regs, arch, phase1_result); } -static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) -{ - unsigned long top_of_stack = - (unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING; - return (struct thread_info *)(top_of_stack - THREAD_SIZE); -} - /* Called with IRQs disabled. */ __visible void prepare_exit_to_usermode(struct pt_regs *regs) { -- cgit v0.10.2 From 39b48e575e92e31251b74b4b48cea2129cee90bd Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:23 -0700 Subject: x86/entry: Split and inline prepare_exit_to_usermode() GCC is unable to properly optimize functions that have a very short likely case and a longer and register-heavier cold part -- it fails to sink all of the register saving and stack frame setup code into the unlikely part. Help it out with prepare_exit_to_usermode() by splitting it into two parts and inline the hot part. Saves 6-8 cycles for compat syscalls. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/9fc53eda4a5b924070952f12fa4ae3e477640a07.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index d087421..66ccbd6 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -218,14 +218,12 @@ long syscall_trace_enter(struct pt_regs *regs) return syscall_trace_enter_phase2(regs, arch, phase1_result); } -/* Called with IRQs disabled. */ -__visible void prepare_exit_to_usermode(struct pt_regs *regs) -{ - if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) - local_irq_disable(); - - lockdep_sys_exit(); +#define EXIT_TO_USERMODE_LOOP_FLAGS \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED | _TIF_USER_RETURN_NOTIFY) +static void exit_to_usermode_loop(struct pt_regs *regs, u32 cached_flags) +{ /* * In order to return to user mode, we need to have IRQs off with * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY, @@ -235,14 +233,6 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) * work to clear some of the flags can sleep. */ while (true) { - u32 cached_flags = - READ_ONCE(pt_regs_to_thread_info(regs)->flags); - - if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | - _TIF_UPROBE | _TIF_NEED_RESCHED | - _TIF_USER_RETURN_NOTIFY))) - break; - /* We have work to do. */ local_irq_enable(); @@ -266,7 +256,30 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs) /* Disable IRQs and retry */ local_irq_disable(); + + cached_flags = READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (!(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + break; + } +} + +/* Called with IRQs disabled. */ +__visible inline void prepare_exit_to_usermode(struct pt_regs *regs) +{ + u32 cached_flags; + + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) + local_irq_disable(); + + lockdep_sys_exit(); + + cached_flags = + READ_ONCE(pt_regs_to_thread_info(regs)->flags); + + if (unlikely(cached_flags & EXIT_TO_USERMODE_LOOP_FLAGS)) + exit_to_usermode_loop(regs, cached_flags); user_enter(); } -- cgit v0.10.2 From f5e6a9753ac2965564a14e6285a06f44043ed9c8 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 5 Oct 2015 17:48:24 -0700 Subject: x86/entry: Split and inline syscall_return_slowpath() GCC is unable to properly optimize functions that have a very short likely case and a longer and register-heavier cold part -- it fails to sink all of the register saving and stack frame setup code into the unlikely part. Help it out with syscall_return_slowpath() by splitting it into two parts and inline the hot part. Saves 6 cycles for compat syscalls. Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/0f773a894ab15c589ac794c2d34ca6ba9b5335c9.1444091585.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 66ccbd6..b53e04d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -284,15 +284,40 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) user_enter(); } +#define SYSCALL_EXIT_WORK_FLAGS \ + (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ + _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT) + +static void syscall_slow_exit_work(struct pt_regs *regs, u32 cached_flags) +{ + bool step; + + audit_syscall_exit(regs); + + if (cached_flags & _TIF_SYSCALL_TRACEPOINT) + trace_sys_exit(regs, regs->ax); + + /* + * If TIF_SYSCALL_EMU is set, we only get here because of + * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). + * We already reported this syscall instruction in + * syscall_trace_enter(). + */ + step = unlikely( + (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) + == _TIF_SINGLESTEP); + if (step || cached_flags & _TIF_SYSCALL_TRACE) + tracehook_report_syscall_exit(regs, step); +} + /* * Called with IRQs on and fully valid regs. Returns with IRQs off in a * state such that we can immediately switch to user mode. */ -__visible void syscall_return_slowpath(struct pt_regs *regs) +__visible inline void syscall_return_slowpath(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); u32 cached_flags = READ_ONCE(ti->flags); - bool step; CT_WARN_ON(ct_state() != CONTEXT_KERNEL); @@ -304,25 +329,8 @@ __visible void syscall_return_slowpath(struct pt_regs *regs) * First do one-time work. If these work items are enabled, we * want to run them exactly once per syscall exit with IRQs on. */ - if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | - _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) { - audit_syscall_exit(regs); - - if (cached_flags & _TIF_SYSCALL_TRACEPOINT) - trace_sys_exit(regs, regs->ax); - - /* - * If TIF_SYSCALL_EMU is set, we only get here because of - * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). - * We already reported this syscall instruction in - * syscall_trace_enter(). - */ - step = unlikely( - (cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU)) - == _TIF_SINGLESTEP); - if (step || cached_flags & _TIF_SYSCALL_TRACE) - tracehook_report_syscall_exit(regs, step); - } + if (unlikely(cached_flags & SYSCALL_EXIT_WORK_FLAGS)) + syscall_slow_exit_work(regs, cached_flags); #ifdef CONFIG_COMPAT /* -- cgit v0.10.2 From 374a3a3916a70fc6236bc2b8f8ac02548a128a54 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 9 Oct 2015 19:08:59 +0200 Subject: x86/entry/64/compat: Document sysenter_fix_flags's reason for existence The code under the label can normally be inline, without the jumping back and forth but the latter is an optimization. Document that. Signed-off-by: Borislav Petkov Acked-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20151009170859.GA24266@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index cf9641c..92b0b27 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -102,6 +102,12 @@ ENTRY(entry_SYSENTER_compat) * NT was set instead of doing an unconditional popfq. * This needs to happen before enabling interrupts so that * we don't get preempted with NT set. + * + * NB.: sysenter_fix_flags is a label with the code under it moved + * out-of-line as an optimization: NT is unlikely to be set in the + * majority of the cases and instead of polluting the I$ unnecessarily, + * we're keeping that code behind a branch which will predict as + * not-taken and therefore its instructions won't be fetched. */ testl $X86_EFLAGS_NT, EFLAGS(%rsp) jnz sysenter_fix_flags -- cgit v0.10.2 From 893a3ec27e1dae62a904f78d53244001979748a6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Oct 2015 17:32:12 -0700 Subject: selftests/x86: Style fixes for the 'unwind_vdso' test Checkpatch is really quite bad for user code like this, but it caught two legit style issues. Reported-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/3335040bdd40d2bca4b1a28a3f8b165361c801b7.1444696194.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/unwind_vdso.c b/tools/testing/selftests/x86/unwind_vdso.c index 5992ff2..00a26a8 100644 --- a/tools/testing/selftests/x86/unwind_vdso.c +++ b/tools/testing/selftests/x86/unwind_vdso.c @@ -134,7 +134,7 @@ _Unwind_Reason_Code trace_fn(struct _Unwind_Context * ctx, void *opaque) static void sigtrap(int sig, siginfo_t *info, void *ctx_void) { - ucontext_t *ctx = (ucontext_t*)ctx_void; + ucontext_t *ctx = (ucontext_t *)ctx_void; struct unwind_state state; unsigned long ip = ctx->uc_mcontext.gregs[REG_EIP]; @@ -192,7 +192,9 @@ int main() * affected by libc/19006 (https://sourceware.org/PR19006). */ printf("[WARN]\tsyscall(2) didn't enter AT_SYSINFO\n"); - } if (get_eflags() & X86_EFLAGS_TF) { + } + + if (get_eflags() & X86_EFLAGS_TF) { printf("[FAIL]\tTF is still set\n"); nerrs++; } -- cgit v0.10.2 From af22aa7c766d50712b9afeca53e9e4208ce6284c Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Oct 2015 17:32:13 -0700 Subject: x86/asm: Remove the xyz_cfi macros from dwarf2.h They are currently unused, and I don't think that anyone was ever particularly happy with them. They had the unfortunate property that they made it easy to CFI-annotate things without thinking about them -- when pushing, do you want to just update the CFA offset, or do you also want to update the saved location of the register being pushed? Suggested-by: Ingo Molnar Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1447bfbd10bb268b4593b32534ecefa1f4df287e.1444696194.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index 09133ba..b7a1ab8 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -81,97 +81,4 @@ #endif -/* - * An attempt to make CFI annotations more or less - * correct and shorter. It is implied that you know - * what you're doing if you use them. - */ -#ifdef __ASSEMBLY__ -#ifdef CONFIG_X86_64 - .macro pushq_cfi reg - pushq \reg - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro pushq_cfi_reg reg - pushq %\reg - CFI_ADJUST_CFA_OFFSET 8 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popq_cfi reg - popq \reg - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro popq_cfi_reg reg - popq %\reg - CFI_ADJUST_CFA_OFFSET -8 - CFI_RESTORE \reg - .endm - - .macro pushfq_cfi - pushfq - CFI_ADJUST_CFA_OFFSET 8 - .endm - - .macro popfq_cfi - popfq - CFI_ADJUST_CFA_OFFSET -8 - .endm - - .macro movq_cfi reg offset=0 - movq %\reg, \offset(%rsp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movq_cfi_restore offset reg - movq \offset(%rsp), %\reg - CFI_RESTORE \reg - .endm -#else /*!CONFIG_X86_64*/ - .macro pushl_cfi reg - pushl \reg - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro pushl_cfi_reg reg - pushl %\reg - CFI_ADJUST_CFA_OFFSET 4 - CFI_REL_OFFSET \reg, 0 - .endm - - .macro popl_cfi reg - popl \reg - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro popl_cfi_reg reg - popl %\reg - CFI_ADJUST_CFA_OFFSET -4 - CFI_RESTORE \reg - .endm - - .macro pushfl_cfi - pushfl - CFI_ADJUST_CFA_OFFSET 4 - .endm - - .macro popfl_cfi - popfl - CFI_ADJUST_CFA_OFFSET -4 - .endm - - .macro movl_cfi reg offset=0 - movl %\reg, \offset(%esp) - CFI_REL_OFFSET \reg, \offset - .endm - - .macro movl_cfi_restore offset reg - movl \offset(%esp), %\reg - CFI_RESTORE \reg - .endm -#endif /*!CONFIG_X86_64*/ -#endif /*__ASSEMBLY__*/ - #endif /* _ASM_X86_DWARF2_H */ -- cgit v0.10.2 From 612bece654ff6cd43160e201985be826e96b8bcb Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 12 Oct 2015 17:32:14 -0700 Subject: um/x86: Fix build after x86 syscall changes I didn't realize that um didn't include x86's asm/syscall.h. Re-add a missing typedef. Reported-by: Richard Weinberger Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 034042cc1e28 ("x86/entry/syscalls: Move syscall table declarations into asm/syscalls.h") Link: http://lkml.kernel.org/r/8d15b9a88f4fd49e3342757e0a34624ee5ce9220.1444696194.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/um/asm/syscall.h b/arch/x86/um/asm/syscall.h index 9fe77b7..81d6562 100644 --- a/arch/x86/um/asm/syscall.h +++ b/arch/x86/um/asm/syscall.h @@ -3,6 +3,10 @@ #include +typedef asmlinkage long (*sys_call_ptr_t)(unsigned long, unsigned long, + unsigned long, unsigned long, + unsigned long, unsigned long); + static inline int syscall_get_arch(void) { #ifdef CONFIG_X86_32 -- cgit v0.10.2 From 657c1eea0019e80685a84cbb1919794243a187c9 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 16 Oct 2015 15:42:54 -0700 Subject: x86/entry/32: Fix entry_INT80_32() to expect interrupts to be on When I rewrote entry_INT80_32, I thought that int80 was an interrupt gate. It's a trap gate. *facepalm* Thanks to Brian Gerst for pointing out that it's better to change the entry code than to change the gate type. Suggested-by: Brian Gerst Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 150ac78d63af ("x86/entry/32: Switch INT80 to the new C syscall path") Link: http://lkml.kernel.org/r/dc09d9b574a5c1dcca996847875c73f8341ce0ad.1445035014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index b53e04d..a89fdbc 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -351,7 +351,14 @@ __visible inline void syscall_return_slowpath(struct pt_regs *regs) * in workloads that use it, and it's usually called from * do_fast_syscall_32, so forcibly inline it to improve performance. */ -static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) +#ifdef CONFIG_X86_32 +/* 32-bit kernels use a trap gate for INT80, and the asm code calls here. */ +__visible +#else +/* 64-bit kernels use do_syscall_32_irqs_off() instead. */ +static +#endif +__always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) { struct thread_info *ti = pt_regs_to_thread_info(regs); unsigned int nr = (unsigned int)regs->orig_ax; @@ -386,12 +393,14 @@ static __always_inline void do_syscall_32_irqs_on(struct pt_regs *regs) syscall_return_slowpath(regs); } -/* Handles int $0x80 */ -__visible void do_int80_syscall_32(struct pt_regs *regs) +#ifdef CONFIG_X86_64 +/* Handles INT80 on 64-bit kernels */ +__visible void do_syscall_32_irqs_off(struct pt_regs *regs) { local_irq_enable(); do_syscall_32_irqs_on(regs); } +#endif /* Returns 0 to return using IRET or 1 to return using SYSEXIT/SYSRETL. */ __visible long do_fast_syscall_32(struct pt_regs *regs) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index c1c7c63..4f97f49 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -345,13 +345,13 @@ ENTRY(entry_INT80_32) SAVE_ALL pt_regs_ax=$-ENOSYS /* save rest */ /* - * User mode is traced as though IRQs are on, and the interrupt gate - * turned them off. + * User mode is traced as though IRQs are on. Unlike the 64-bit + * case, INT80 is a trap gate on 32-bit kernels, so interrupts + * are already on (unless user code is messing around with iopl). */ - TRACE_IRQS_OFF movl %esp, %eax - call do_int80_syscall_32 + call do_syscall_32_irqs_on .Lsyscall_32_done: restore_all: diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S index 92b0b27..c320183 100644 --- a/arch/x86/entry/entry_64_compat.S +++ b/arch/x86/entry/entry_64_compat.S @@ -303,7 +303,7 @@ ENTRY(entry_INT80_compat) TRACE_IRQS_OFF movq %rsp, %rdi - call do_int80_syscall_32 + call do_syscall_32_irqs_off .Lsyscall_32_done: /* Go back to user mode. */ -- cgit v0.10.2 From 3bd29515d1cad26fa85a1a9b442de8816c1f5c54 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 16 Oct 2015 15:42:55 -0700 Subject: x86/entry/32: Fix FS and GS restore in opportunistic SYSEXIT We either need to restore them before popping and thus changing ESP, or we need to adjust the offsets. The former is simpler. Reported-and-tested-by: Borislav Petkov Signed-off-by: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 5f310f739b4c x86/entry/32: ("Re-implement SYSENTER using the new C path") Link: http://lkml.kernel.org/r/461e5c7d8fa3821529893a4893ac9c4bc37f9e17.1445035014.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4f97f49..3eb572e 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -315,14 +315,14 @@ sysenter_past_esp: TRACE_IRQS_ON /* User mode traces as IRQs on. */ movl PT_EIP(%esp), %edx /* pt_regs->ip */ movl PT_OLDESP(%esp), %ecx /* pt_regs->sp */ +1: mov PT_FS(%esp), %fs + PTGS_TO_GS popl %ebx /* pt_regs->bx */ addl $2*4, %esp /* skip pt_regs->cx and pt_regs->dx */ popl %esi /* pt_regs->si */ popl %edi /* pt_regs->di */ popl %ebp /* pt_regs->bp */ popl %eax /* pt_regs->ax */ -1: mov PT_FS(%esp), %fs - PTGS_TO_GS /* * Return back to the vDSO, which will pop ecx and edx. -- cgit v0.10.2