From b2f680380ddf2f003882e59e00acd6c1952f91fc Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Wed, 9 Mar 2016 15:05:56 -0500 Subject: x86/mm/32: Add support for 64-bit __get_user() on 32-bit kernels The existing __get_user() implementation does not support fetching 64-bit values on 32-bit x86. Implement this in a way that does not generate any incorrect warnings as cautioned by Russell King. Test code available at: http://www.kvack.org/~bcrl/x86_32-get_user.tar . Signed-off-by: Benjamin LaHaise Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index a969ae6..8b3fb76 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -333,7 +333,26 @@ do { \ } while (0) #ifdef CONFIG_X86_32 -#define __get_user_asm_u64(x, ptr, retval, errret) (x) = __get_user_bad() +#define __get_user_asm_u64(x, ptr, retval, errret) \ +({ \ + __typeof__(ptr) __ptr = (ptr); \ + asm volatile(ASM_STAC "\n" \ + "1: movl %2,%%eax\n" \ + "2: movl %3,%%edx\n" \ + "3: " ASM_CLAC "\n" \ + ".section .fixup,\"ax\"\n" \ + "4: mov %4,%0\n" \ + " xorl %%eax,%%eax\n" \ + " xorl %%edx,%%edx\n" \ + " jmp 3b\n" \ + ".previous\n" \ + _ASM_EXTABLE(1b, 4b) \ + _ASM_EXTABLE(2b, 4b) \ + : "=r" (retval), "=A"(x) \ + : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ + "i" (errret), "0" (retval)); \ +}) + #define __get_user_asm_ex_u64(x, ptr) (x) = __get_user_bad() #else #define __get_user_asm_u64(x, ptr, retval, errret) \ @@ -420,7 +439,7 @@ do { \ #define __get_user_nocheck(x, ptr, size) \ ({ \ int __gu_err; \ - unsigned long __gu_val; \ + __inttype(*(ptr)) __gu_val; \ __uaccess_begin(); \ __get_user_size(__gu_val, (ptr), (size), __gu_err, -EFAULT); \ __uaccess_end(); \ -- cgit v0.10.2 From 67d7a982bab6702d84415ea889996fae72a7d3b2 Mon Sep 17 
00:00:00 2001 From: Mathias Krause Date: Tue, 10 May 2016 23:07:02 +0200 Subject: x86/extable: Ensure entries are swapped completely when sorting The x86 exception table sorting was changed in this recent commit: 29934b0fb8ff ("x86/extable: use generic search and sort routines") ... to use the arch independent code in lib/extable.c. However, the patch was mangled somehow on its way into the kernel from the last version posted at: https://lkml.org/lkml/2016/1/27/232 The committed version kind of attempted to incorporate the changes of contemporary commit done in the x86 tree: 548acf19234d ("x86/mm: Expand the exception table logic to allow new handling options") ... as in _completely_ _ignoring_ the x86 specific 'handler' member of struct exception_table_entry. This effectively broke the sorting as entries will only be partly swapped now. Fortunately, the x86 Kconfig selects BUILDTIME_EXTABLE_SORT, so the exception table doesn't need to be sorted at runtime. However, in case that ever changes, we better not break the exception table sorting just because of that. Fix this by providing a swap_ex_entry_fixup() macro that takes care of the 'handler' member. Signed-off-by: Mathias Krause Reviewed-by: Ard Biesheuvel Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Link: http://lkml.kernel.org/r/1462914422-2911-1-git-send-email-minipli@googlemail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8b3fb76..86c48f3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -108,6 +108,14 @@ struct exception_table_entry { #define ARCH_HAS_RELATIVE_EXTABLE +#define swap_ex_entry_fixup(a, b, tmp, delta) \ + do { \ + (a)->fixup = (b)->fixup + (delta); \ + (b)->fixup = (tmp).fixup - (delta); \ + (a)->handler = (b)->handler + (delta); \ + (b)->handler = (tmp).handler - (delta); \ + } while (0) + extern int fixup_exception(struct pt_regs *regs, int trapnr); extern bool ex_has_fault_handler(unsigned long ip); extern int early_fixup_exception(unsigned long *ip); -- cgit v0.10.2 From f5967101e9de12addcda4510dfbac66d7c5779c3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 May 2016 12:56:27 +0200 Subject: x86/hweight: Get rid of the special calling convention People complained about ARCH_HWEIGHT_CFLAGS and how it throws a wrench into kcov, lto, etc, experimentations. Add asm versions for __sw_hweight{32,64}() and do explicit saving and restoring of clobbered registers. This gets rid of the special calling convention. We get to call those functions on !X86_FEATURE_POPCNT CPUs. We still need to hardcode POPCNT and register operands as some old gas versions which we support, do not know about POPCNT. Btw, remove redundant REX prefix from 32-bit POPCNT because alternatives can do padding now. Suggested-by: H. 
Peter Anvin Signed-off-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1464605787-20603-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 0a7b885..729d41d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -294,11 +294,6 @@ config X86_32_LAZY_GS def_bool y depends on X86_32 && !CC_STACKPROTECTOR -config ARCH_HWEIGHT_CFLAGS - string - default "-fcall-saved-ecx -fcall-saved-edx" if X86_32 - default "-fcall-saved-rdi -fcall-saved-rsi -fcall-saved-rdx -fcall-saved-rcx -fcall-saved-r8 -fcall-saved-r9 -fcall-saved-r10 -fcall-saved-r11" if X86_64 - config ARCH_SUPPORTS_UPROBES def_bool y diff --git a/arch/x86/include/asm/arch_hweight.h b/arch/x86/include/asm/arch_hweight.h index 02e799f..e7cd631 100644 --- a/arch/x86/include/asm/arch_hweight.h +++ b/arch/x86/include/asm/arch_hweight.h @@ -4,8 +4,8 @@ #include <asm/cpufeatures.h> #ifdef CONFIG_64BIT -/* popcnt %edi, %eax -- redundant REX prefix for alignment */ -#define POPCNT32 ".byte 0xf3,0x40,0x0f,0xb8,0xc7" +/* popcnt %edi, %eax */ +#define POPCNT32 ".byte 0xf3,0x0f,0xb8,0xc7" /* popcnt %rdi, %rax */ #define POPCNT64 ".byte 0xf3,0x48,0x0f,0xb8,0xc7" #define REG_IN "D" @@ -17,19 +17,15 @@ #define REG_OUT "a" #endif -/* - * __sw_hweightXX are called from within the alternatives below - * and callee-clobbered registers need to be taken care of. See - * ARCH_HWEIGHT_CFLAGS in <arch/x86/Kconfig> for the respective - * compiler switches. 
- */ +#define __HAVE_ARCH_SW_HWEIGHT + static __always_inline unsigned int __arch_hweight32(unsigned int w) { - unsigned int res = 0; + unsigned int res; asm (ALTERNATIVE("call __sw_hweight32", POPCNT32, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } @@ -53,11 +49,11 @@ static inline unsigned long __arch_hweight64(__u64 w) #else static __always_inline unsigned long __arch_hweight64(__u64 w) { - unsigned long res = 0; + unsigned long res; asm (ALTERNATIVE("call __sw_hweight64", POPCNT64, X86_FEATURE_POPCNT) - : "="REG_OUT (res) - : REG_IN (w)); + : "="REG_OUT (res) + : REG_IN (w)); return res; } diff --git a/arch/x86/kernel/i386_ksyms_32.c b/arch/x86/kernel/i386_ksyms_32.c index 64341aa..d40ee8a 100644 --- a/arch/x86/kernel/i386_ksyms_32.c +++ b/arch/x86/kernel/i386_ksyms_32.c @@ -42,3 +42,5 @@ EXPORT_SYMBOL(empty_zero_page); EXPORT_SYMBOL(___preempt_schedule); EXPORT_SYMBOL(___preempt_schedule_notrace); #endif + +EXPORT_SYMBOL(__sw_hweight32); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index cd05942..f1aebfb 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -44,6 +44,9 @@ EXPORT_SYMBOL(clear_page); EXPORT_SYMBOL(csum_partial); +EXPORT_SYMBOL(__sw_hweight32); +EXPORT_SYMBOL(__sw_hweight64); + /* * Export string functions. We normally rely on gcc builtin for most of these, * but gcc sometimes decides not to inline them. 
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 72a5767..ec969cc 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -25,7 +25,7 @@ lib-y += memcpy_$(BITS).o lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o -obj-y += msr.o msr-reg.o msr-reg-export.o +obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o ifeq ($(CONFIG_X86_32),y) obj-y += atomic64_32.o diff --git a/arch/x86/lib/hweight.S b/arch/x86/lib/hweight.S new file mode 100644 index 0000000..02de3d7 --- /dev/null +++ b/arch/x86/lib/hweight.S @@ -0,0 +1,77 @@ +#include <linux/linkage.h> + +#include <asm/asm.h> + +/* + * unsigned int __sw_hweight32(unsigned int w) + * %rdi: w + */ +ENTRY(__sw_hweight32) + +#ifdef CONFIG_X86_64 + movl %edi, %eax # w +#endif + __ASM_SIZE(push,) %__ASM_REG(dx) + movl %eax, %edx # w -> t + shrl %edx # t >>= 1 + andl $0x55555555, %edx # t &= 0x55555555 + subl %edx, %eax # w -= t + + movl %eax, %edx # w -> t + shrl $2, %eax # w_tmp >>= 2 + andl $0x33333333, %edx # t &= 0x33333333 + andl $0x33333333, %eax # w_tmp &= 0x33333333 + addl %edx, %eax # w = w_tmp + t + + movl %eax, %edx # w -> t + shrl $4, %edx # t >>= 4 + addl %edx, %eax # w_tmp += t + andl $0x0f0f0f0f, %eax # w_tmp &= 0x0f0f0f0f + imull $0x01010101, %eax, %eax # w_tmp *= 0x01010101 + shrl $24, %eax # w = w_tmp >> 24 + __ASM_SIZE(pop,) %__ASM_REG(dx) + ret +ENDPROC(__sw_hweight32) + +ENTRY(__sw_hweight64) +#ifdef CONFIG_X86_64 + pushq %rdx + + movq %rdi, %rdx # w -> t + movabsq $0x5555555555555555, %rax + shrq %rdx # t >>= 1 + andq %rdx, %rax # t &= 0x5555555555555555 + movabsq $0x3333333333333333, %rdx + subq %rax, %rdi # w -= t + + movq %rdi, %rax # w -> t + shrq $2, %rdi # w_tmp >>= 2 + andq %rdx, %rax # t &= 0x3333333333333333 + andq %rdi, %rdx # w_tmp &= 0x3333333333333333 + addq %rdx, %rax # w = w_tmp + t + + movq %rax, %rdx # w -> t + shrq $4, %rdx # t >>= 4 + addq %rdx, %rax # w_tmp += t + movabsq $0x0f0f0f0f0f0f0f0f, %rdx + andq %rdx, %rax # w_tmp &= 
0x0f0f0f0f0f0f0f0f + movabsq $0x0101010101010101, %rdx + imulq %rdx, %rax # w_tmp *= 0x0101010101010101 + shrq $56, %rax # w = w_tmp >> 56 + + popq %rdx + ret +#else /* CONFIG_X86_32 */ + /* We're getting an u64 arg in (%eax,%edx): unsigned long hweight64(__u64 w) */ + pushl %ecx + + call __sw_hweight32 + movl %eax, %ecx # stash away result + movl %edx, %eax # second part of input + call __sw_hweight32 + addl %ecx, %eax # result + + popl %ecx + ret +#endif +ENDPROC(__sw_hweight64) diff --git a/lib/Makefile b/lib/Makefile index ff6a7a6..07d06a8 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -15,9 +15,6 @@ KCOV_INSTRUMENT_rbtree.o := n KCOV_INSTRUMENT_list_debug.o := n KCOV_INSTRUMENT_debugobjects.o := n KCOV_INSTRUMENT_dynamic_debug.o := n -# Kernel does not boot if we instrument this file as it uses custom calling -# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS). -KCOV_INSTRUMENT_hweight.o := n lib-y := ctype.o string.o vsprintf.o cmdline.o \ rbtree.o radix-tree.o dump_stack.o timerqueue.o\ @@ -74,8 +71,6 @@ obj-$(CONFIG_HAS_IOMEM) += iomap_copy.o devres.o obj-$(CONFIG_CHECK_SIGNATURE) += check_signature.o obj-$(CONFIG_DEBUG_LOCKING_API_SELFTESTS) += locking-selftest.o -GCOV_PROFILE_hweight.o := n -CFLAGS_hweight.o = $(subst $(quote),,$(CONFIG_ARCH_HWEIGHT_CFLAGS)) obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o obj-$(CONFIG_BTREE) += btree.o diff --git a/lib/hweight.c b/lib/hweight.c index 9a5c1f2..43273a7 100644 --- a/lib/hweight.c +++ b/lib/hweight.c @@ -9,6 +9,7 @@ * The Hamming Weight of a number is the total number of bits set in it. 
*/ +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned int __sw_hweight32(unsigned int w) { #ifdef CONFIG_ARCH_HAS_FAST_MULTIPLIER @@ -25,6 +26,7 @@ unsigned int __sw_hweight32(unsigned int w) #endif } EXPORT_SYMBOL(__sw_hweight32); +#endif unsigned int __sw_hweight16(unsigned int w) { @@ -43,6 +45,7 @@ unsigned int __sw_hweight8(unsigned int w) } EXPORT_SYMBOL(__sw_hweight8); +#ifndef __HAVE_ARCH_SW_HWEIGHT unsigned long __sw_hweight64(__u64 w) { #if BITS_PER_LONG == 32 @@ -65,3 +68,4 @@ unsigned long __sw_hweight64(__u64 w) #endif } EXPORT_SYMBOL(__sw_hweight64); +#endif -- cgit v0.10.2 From 2823d4da5d8a0c222747b24eceb65f5b30717d02 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:37 -0700 Subject: x86, bitops: remove use of "sbb" to return CF Use SETC instead of SBB to return the value of CF from assembly. Using SETcc enables uniformity with other flags-returning pieces of assembly code. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-2-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 7766d1c..b2b797d 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -230,11 +230,11 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) */ static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; asm("bts %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + "setc %0" + : "=qm" (oldbit), ADDR : "Ir" (nr)); return oldbit; } @@ -270,11 +270,11 @@ static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *a */ static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; asm volatile("btr %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + "setc %0" + : "=qm" (oldbit), ADDR : "Ir" (nr)); return 
oldbit; } @@ -282,11 +282,11 @@ static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long /* WARNING: non atomic and it can be reordered! */ static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; asm volatile("btc %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit), ADDR + "setc %0" + : "=qm" (oldbit), ADDR : "Ir" (nr) : "memory"); return oldbit; @@ -313,11 +313,11 @@ static __always_inline int constant_test_bit(long nr, const volatile unsigned lo static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) { - int oldbit; + unsigned char oldbit; asm volatile("bt %2,%1\n\t" - "sbb %0,%0" - : "=r" (oldbit) + "setc %0" + : "=qm" (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e0ba66c..65039e9 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -510,9 +510,9 @@ do { \ /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ ({ \ - int old__; \ - asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ - : "=r" (old__), "+m" (var) \ + unsigned char old__; \ + asm volatile("btr %2,"__percpu_arg(1)"\n\tsetc %0" \ + : "=qm" (old__), "+m" (var) \ : "dIr" (bit)); \ old__; \ }) @@ -532,11 +532,11 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, static inline int x86_this_cpu_variable_test_bit(int nr, const unsigned long __percpu *addr) { - int oldbit; + unsigned char oldbit; asm volatile("bt "__percpu_arg(2)",%1\n\t" - "sbb %0,%0" - : "=r" (oldbit) + "setc %0" + : "=qm" (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 2138c9a..dd1e7d6 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -81,9 +81,9 @@ static 
inline int __const_sigismember(sigset_t *set, int _sig) static inline int __gen_sigismember(sigset_t *set, int _sig) { - int ret; - asm("btl %2,%1\n\tsbbl %0,%0" - : "=r"(ret) : "m"(*set), "Ir"(_sig-1) : "cc"); + unsigned char ret; + asm("btl %2,%1\n\tsetc %0" + : "=qm"(ret) : "m"(*set), "Ir"(_sig-1) : "cc"); return ret; } diff --git a/arch/x86/include/asm/sync_bitops.h b/arch/x86/include/asm/sync_bitops.h index f28a24b..cbf8847 100644 --- a/arch/x86/include/asm/sync_bitops.h +++ b/arch/x86/include/asm/sync_bitops.h @@ -79,10 +79,10 @@ static inline void sync_change_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; bts %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; bts %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } @@ -97,10 +97,10 @@ static inline int sync_test_and_set_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; btr %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; btr %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } @@ -115,10 +115,10 @@ static inline int sync_test_and_clear_bit(long nr, volatile unsigned long *addr) */ static inline int sync_test_and_change_bit(long nr, volatile unsigned long *addr) { - int oldbit; + unsigned char oldbit; - asm volatile("lock; btc %2,%1\n\tsbbl %0,%0" - : "=r" (oldbit), "+m" (ADDR) + asm volatile("lock; btc %2,%1\n\tsetc %0" + : "=qm" (oldbit), "+m" (ADDR) : "Ir" (nr) : "memory"); return oldbit; } diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 3dce1ca..01f30e5 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -440,10 +440,7 @@ static inline unsigned long 
get_vflags(struct kernel_vm86_regs *regs) static inline int is_revectored(int nr, struct revectored_struct *bitmap) { - __asm__ __volatile__("btl %2,%1\n\tsbbl %0,%0" - :"=r" (nr) - :"m" (*bitmap), "r" (nr)); - return nr; + return test_bit(nr, bitmap->__map); } #define val_byte(val, n) (((__u8 *)&val)[n]) -- cgit v0.10.2 From 117780eef7740729e803bdcc0d5f2f48137ea8e3 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:38 -0700 Subject: x86, asm: use bool for bitops and other assembly outputs The gcc people have confirmed that using "bool" when combined with inline assembly always is treated as a byte-sized operand that can be assumed to be 0 or 1, which is exactly what the SET instruction emits. Change the output types and intermediate variables of as many operations as practical to "bool". Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-3-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/boot/bitops.h b/arch/x86/boot/bitops.h index 878e4b9..0d41d68 100644 --- a/arch/x86/boot/bitops.h +++ b/arch/x86/boot/bitops.h @@ -16,14 +16,16 @@ #define BOOT_BITOPS_H #define _LINUX_BITOPS_H /* Inhibit inclusion of <linux/bitops.h> */ -static inline int constant_test_bit(int nr, const void *addr) +#include <linux/types.h> + +static inline bool constant_test_bit(int nr, const void *addr) { const u32 *p = (const u32 *)addr; return ((1UL << (nr & 31)) & (p[nr >> 5])) != 0; } -static inline int variable_test_bit(int nr, const void *addr) +static inline bool variable_test_bit(int nr, const void *addr) { - u8 v; + bool v; const u32 *p = (const u32 *)addr; asm("btl %2,%1; setc %0" : "=qm" (v) : "m" (*p), "Ir" (nr)); diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 9011a88..2edb2d5 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -176,16 +176,16 @@ static inline void wrgs32(u32 v, addr_t addr) } /* Note: these only return true/false, not a 
signed return value! */ -static inline int memcmp_fs(const void *s1, addr_t s2, size_t len) +static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len) { - u8 diff; + bool diff; asm volatile("fs; repe; cmpsb; setnz %0" : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } -static inline int memcmp_gs(const void *s1, addr_t s2, size_t len) +static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len) { - u8 diff; + bool diff; asm volatile("gs; repe; cmpsb; setnz %0" : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; diff --git a/arch/x86/boot/string.c b/arch/x86/boot/string.c index 318b846..cc3bd58 100644 --- a/arch/x86/boot/string.c +++ b/arch/x86/boot/string.c @@ -17,7 +17,7 @@ int memcmp(const void *s1, const void *s2, size_t len) { - u8 diff; + bool diff; asm("repe; cmpsb; setnz %0" : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; diff --git a/arch/x86/include/asm/apm.h b/arch/x86/include/asm/apm.h index 20370c6..93eebc63 100644 --- a/arch/x86/include/asm/apm.h +++ b/arch/x86/include/asm/apm.h @@ -45,11 +45,11 @@ static inline void apm_bios_call_asm(u32 func, u32 ebx_in, u32 ecx_in, : "memory", "cc"); } -static inline u8 apm_bios_call_simple_asm(u32 func, u32 ebx_in, - u32 ecx_in, u32 *eax) +static inline bool apm_bios_call_simple_asm(u32 func, u32 ebx_in, + u32 ecx_in, u32 *eax) { int cx, dx, si; - u8 error; + bool error; /* * N.B. We do NOT need a cld after the BIOS call diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index 69f1366..ab6f599 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -43,7 +43,7 @@ #ifdef CONFIG_ARCH_RANDOM /* Instead of arch_get_random_long() when alternatives haven't run. 
*/ -static inline int rdrand_long(unsigned long *v) +static inline bool rdrand_long(unsigned long *v) { int ok; asm volatile("1: " RDRAND_LONG "\n\t" @@ -53,13 +53,13 @@ static inline int rdrand_long(unsigned long *v) "2:" : "=r" (ok), "=a" (*v) : "0" (RDRAND_RETRY_LOOPS)); - return ok; + return !!ok; } /* A single attempt at RDSEED */ static inline bool rdseed_long(unsigned long *v) { - unsigned char ok; + bool ok; asm volatile(RDSEED_LONG "\n\t" "setc %0" : "=qm" (ok), "=a" (*v)); @@ -67,7 +67,7 @@ static inline bool rdseed_long(unsigned long *v) } #define GET_RANDOM(name, type, rdrand, nop) \ -static inline int name(type *v) \ +static inline bool name(type *v) \ { \ int ok; \ alternative_io("movl $0, %0\n\t" \ @@ -80,13 +80,13 @@ static inline int name(type *v) \ X86_FEATURE_RDRAND, \ ASM_OUTPUT2("=r" (ok), "=a" (*v)), \ "0" (RDRAND_RETRY_LOOPS)); \ - return ok; \ + return !!ok; \ } #define GET_SEED(name, type, rdseed, nop) \ -static inline int name(type *v) \ +static inline bool name(type *v) \ { \ - unsigned char ok; \ + bool ok; \ alternative_io("movb $0, %0\n\t" \ nop, \ rdseed "\n\t" \ @@ -119,7 +119,7 @@ GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); #else -static inline int rdrand_long(unsigned long *v) +static inline bool rdrand_long(unsigned long *v) { return 0; } diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 3e86742..17d8812 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -75,7 +75,7 @@ static __always_inline void atomic_sub(int i, atomic_t *v) * true if the result is zero, or false for all * other cases. */ -static __always_inline int atomic_sub_and_test(int i, atomic_t *v) +static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); } @@ -112,7 +112,7 @@ static __always_inline void atomic_dec(atomic_t *v) * returns true if the result is 0, or false for all other * cases. 
*/ -static __always_inline int atomic_dec_and_test(atomic_t *v) +static __always_inline bool atomic_dec_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); } @@ -125,7 +125,7 @@ static __always_inline int atomic_dec_and_test(atomic_t *v) * and returns true if the result is zero, or false for all * other cases. */ -static __always_inline int atomic_inc_and_test(atomic_t *v) +static __always_inline bool atomic_inc_and_test(atomic_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); } @@ -139,7 +139,7 @@ static __always_inline int atomic_inc_and_test(atomic_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static __always_inline int atomic_add_negative(int i, atomic_t *v) +static __always_inline bool atomic_add_negative(int i, atomic_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); } diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 0373510..4f881d7 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -70,7 +70,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) * true if the result is zero, or false for all * other cases. */ -static inline int atomic64_sub_and_test(long i, atomic64_t *v) +static inline bool atomic64_sub_and_test(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", "e"); } @@ -109,7 +109,7 @@ static __always_inline void atomic64_dec(atomic64_t *v) * returns true if the result is 0, or false for all other * cases. */ -static inline int atomic64_dec_and_test(atomic64_t *v) +static inline bool atomic64_dec_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e"); } @@ -122,7 +122,7 @@ static inline int atomic64_dec_and_test(atomic64_t *v) * and returns true if the result is zero, or false for all * other cases. 
*/ -static inline int atomic64_inc_and_test(atomic64_t *v) +static inline bool atomic64_inc_and_test(atomic64_t *v) { GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e"); } @@ -136,7 +136,7 @@ static inline int atomic64_inc_and_test(atomic64_t *v) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int atomic64_add_negative(long i, atomic64_t *v) +static inline bool atomic64_add_negative(long i, atomic64_t *v) { GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", "s"); } @@ -180,7 +180,7 @@ static inline long atomic64_xchg(atomic64_t *v, long new) * Atomically adds @a to @v, so long as it was not @u. * Returns the old value of @v. */ -static inline int atomic64_add_unless(atomic64_t *v, long a, long u) +static inline bool atomic64_add_unless(atomic64_t *v, long a, long u) { long c, old; c = atomic64_read(v); diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index b2b797d..8cbb7f4 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -201,7 +201,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); } @@ -213,7 +213,7 @@ static __always_inline int test_and_set_bit(long nr, volatile unsigned long *add * * This is the same as test_and_set_bit on x86. */ -static __always_inline int +static __always_inline bool test_and_set_bit_lock(long nr, volatile unsigned long *addr) { return test_and_set_bit(nr, addr); @@ -228,9 +228,9 @@ test_and_set_bit_lock(long nr, volatile unsigned long *addr) * If two examples of this operation race, one can appear to succeed * but actually fail. 
You must protect multiple accesses with a lock. */ -static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *addr) +static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; + bool oldbit; asm("bts %2,%1\n\t" "setc %0" @@ -247,7 +247,7 @@ static __always_inline int __test_and_set_bit(long nr, volatile unsigned long *a * This operation is atomic and cannot be reordered. * It also implies a memory barrier. */ -static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); } @@ -268,9 +268,9 @@ static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *a * accessed from a hypervisor on the same CPU if running in a VM: don't change * this without also updating arch/x86/kernel/kvm.c */ -static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long *addr) +static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; + bool oldbit; asm volatile("btr %2,%1\n\t" "setc %0" @@ -280,9 +280,9 @@ static __always_inline int __test_and_clear_bit(long nr, volatile unsigned long } /* WARNING: non atomic and it can be reordered! */ -static __always_inline int __test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool __test_and_change_bit(long nr, volatile unsigned long *addr) { - unsigned char oldbit; + bool oldbit; asm volatile("btc %2,%1\n\t" "setc %0" @@ -300,20 +300,20 @@ static __always_inline int __test_and_change_bit(long nr, volatile unsigned long * This operation is atomic and cannot be reordered. * It also implies a memory barrier. 
*/ -static __always_inline int test_and_change_bit(long nr, volatile unsigned long *addr) +static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); } -static __always_inline int constant_test_bit(long nr, const volatile unsigned long *addr) +static __always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) { return ((1UL << (nr & (BITS_PER_LONG-1))) & (addr[nr >> _BITOPS_LONG_SHIFT])) != 0; } -static __always_inline int variable_test_bit(long nr, volatile const unsigned long *addr) +static __always_inline bool variable_test_bit(long nr, volatile const unsigned long *addr) { - unsigned char oldbit; + bool oldbit; asm volatile("bt %2,%1\n\t" "setc %0" @@ -329,7 +329,7 @@ static __always_inline int variable_test_bit(long nr, volatile const unsigned lo * @nr: bit number to test * @addr: Address to start counting from */ -static int test_bit(int nr, const volatile unsigned long *addr); +static bool test_bit(int nr, const volatile unsigned long *addr); #endif #define test_bit(nr, addr) \ diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 4ad6560..0cdc65b 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -50,7 +50,7 @@ static inline void local_sub(long i, local_t *l) * true if the result is zero, or false for all * other cases. */ -static inline int local_sub_and_test(long i, local_t *l) +static inline bool local_sub_and_test(long i, local_t *l) { GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", "e"); } @@ -63,7 +63,7 @@ static inline int local_sub_and_test(long i, local_t *l) * returns true if the result is 0, or false for all other * cases. 
*/ -static inline int local_dec_and_test(local_t *l) +static inline bool local_dec_and_test(local_t *l) { GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e"); } @@ -76,7 +76,7 @@ static inline int local_dec_and_test(local_t *l) * and returns true if the result is zero, or false for all * other cases. */ -static inline int local_inc_and_test(local_t *l) +static inline bool local_inc_and_test(local_t *l) { GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e"); } @@ -90,7 +90,7 @@ static inline int local_inc_and_test(local_t *l) * if the result is negative, or false when * result is greater than or equal to zero. */ -static inline int local_add_negative(long i, local_t *l) +static inline bool local_add_negative(long i, local_t *l) { GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", "s"); } diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 65039e9..184d7f3 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -510,14 +510,14 @@ do { \ /* This is not atomic against other CPUs -- CPU preemption needs to be off */ #define x86_test_and_clear_bit_percpu(bit, var) \ ({ \ - unsigned char old__; \ + bool old__; \ asm volatile("btr %2,"__percpu_arg(1)"\n\tsetc %0" \ : "=qm" (old__), "+m" (var) \ : "dIr" (bit)); \ old__; \ }) -static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, +static __always_inline bool x86_this_cpu_constant_test_bit(unsigned int nr, const unsigned long __percpu *addr) { unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG; @@ -529,10 +529,10 @@ static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr, #endif } -static inline int x86_this_cpu_variable_test_bit(int nr, +static inline bool x86_this_cpu_variable_test_bit(int nr, const unsigned long __percpu *addr) { - unsigned char oldbit; + bool oldbit; asm volatile("bt "__percpu_arg(2)",%1\n\t" "setc %0" diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index 
8f7866a..a15b73d 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -23,11 +23,11 @@ cc_label: \ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ - char c; \ + bool c; \ asm volatile (fullop "; set" cc " %1" \ : "+m" (var), "=qm" (c) \ : __VA_ARGS__ : "memory"); \ - return c != 0; \ + return c; \ } while (0) #define GEN_UNARY_RMWcc(op, var, arg0, cc) \ diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 453744c..c508770 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -77,7 +77,7 @@ static inline void __down_read(struct rw_semaphore *sem) /* * trylock for reading -- returns 1 if successful, 0 if contention */ -static inline int __down_read_trylock(struct rw_semaphore *sem) +static inline bool __down_read_trylock(struct rw_semaphore *sem) { long result, tmp; asm volatile("# beginning __down_read_trylock\n\t" @@ -93,7 +93,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem) : "+m" (sem->count), "=&a" (result), "=&r" (tmp) : "i" (RWSEM_ACTIVE_READ_BIAS) : "memory", "cc"); - return result >= 0 ? 1 : 0; + return result >= 0; } /* @@ -134,9 +134,10 @@ static inline int __down_write_killable(struct rw_semaphore *sem) /* * trylock for writing -- returns 1 if successful, 0 if contention */ -static inline int __down_write_trylock(struct rw_semaphore *sem) +static inline bool __down_write_trylock(struct rw_semaphore *sem) { - long result, tmp; + bool result; + long tmp0, tmp1; asm volatile("# beginning __down_write_trylock\n\t" " mov %0,%1\n\t" "1:\n\t" @@ -144,14 +145,14 @@ static inline int __down_write_trylock(struct rw_semaphore *sem) /* was the active mask 0 before? 
*/ " jnz 2f\n\t" " mov %1,%2\n\t" - " add %3,%2\n\t" + " add %4,%2\n\t" LOCK_PREFIX " cmpxchg %2,%0\n\t" " jnz 1b\n\t" "2:\n\t" - " sete %b1\n\t" - " movzbl %b1, %k1\n\t" + " sete %3\n\t" "# ending __down_write_trylock\n\t" - : "+m" (sem->count), "=&a" (result), "=&r" (tmp) + : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1), + "=qm" (result) : "er" (RWSEM_ACTIVE_WRITE_BIAS) : "memory", "cc"); return result; diff --git a/include/linux/random.h b/include/linux/random.h index e47e533..3d6e981 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -95,27 +95,27 @@ static inline void prandom_seed_state(struct rnd_state *state, u64 seed) #ifdef CONFIG_ARCH_RANDOM # include #else -static inline int arch_get_random_long(unsigned long *v) +static inline bool arch_get_random_long(unsigned long *v) { return 0; } -static inline int arch_get_random_int(unsigned int *v) +static inline bool arch_get_random_int(unsigned int *v) { return 0; } -static inline int arch_has_random(void) +static inline bool arch_has_random(void) { return 0; } -static inline int arch_get_random_seed_long(unsigned long *v) +static inline bool arch_get_random_seed_long(unsigned long *v) { return 0; } -static inline int arch_get_random_seed_int(unsigned int *v) +static inline bool arch_get_random_seed_int(unsigned int *v) { return 0; } -static inline int arch_has_random_seed(void) +static inline bool arch_has_random_seed(void) { return 0; } -- cgit v0.10.2 From 18fe58229d80c7f4f138a07e84ba608e1ebd232b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:39 -0700 Subject: x86, asm: change the GEN_*_RMWcc() macros to not quote the condition Change the lexical definition of the GEN_*_RMWcc() macros to not take the condition code as a quoted string. This will help support changing them to use the new __GCC_ASM_FLAG_OUTPUTS__ feature in a subsequent patch. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-4-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 17d8812..7322c15 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -77,7 +77,7 @@ static __always_inline void atomic_sub(int i, atomic_t *v) */ static __always_inline bool atomic_sub_and_test(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); } /** @@ -114,7 +114,7 @@ static __always_inline void atomic_dec(atomic_t *v) */ static __always_inline bool atomic_dec_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); } /** @@ -127,7 +127,7 @@ static __always_inline bool atomic_dec_and_test(atomic_t *v) */ static __always_inline bool atomic_inc_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); } /** @@ -141,7 +141,7 @@ static __always_inline bool atomic_inc_and_test(atomic_t *v) */ static __always_inline bool atomic_add_negative(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s); } /** diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 4f881d7..57bf925 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -72,7 +72,7 @@ static inline void atomic64_sub(long i, atomic64_t *v) */ static inline bool atomic64_sub_and_test(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); } /** @@ 
-111,7 +111,7 @@ static __always_inline void atomic64_dec(atomic64_t *v) */ static inline bool atomic64_dec_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); } /** @@ -124,7 +124,7 @@ static inline bool atomic64_dec_and_test(atomic64_t *v) */ static inline bool atomic64_inc_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", "e"); + GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); } /** @@ -138,7 +138,7 @@ static inline bool atomic64_inc_and_test(atomic64_t *v) */ static inline bool atomic64_add_negative(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(LOCK_PREFIX "addq", v->counter, "er", i, "%0", s); } /** diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 8cbb7f4..ed8f485 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -203,7 +203,7 @@ static __always_inline void change_bit(long nr, volatile unsigned long *addr) */ static __always_inline bool test_and_set_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", c); } /** @@ -249,7 +249,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * */ static __always_inline bool test_and_clear_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", c); } /** @@ -302,7 +302,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon */ static __always_inline bool test_and_change_bit(long nr, volatile unsigned long *addr) { - GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", "c"); + GEN_BINARY_RMWcc(LOCK_PREFIX "btc", *addr, "Ir", nr, "%0", c); } static 
__always_inline bool constant_test_bit(long nr, const volatile unsigned long *addr) diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 0cdc65b..7511978 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -52,7 +52,7 @@ static inline void local_sub(long i, local_t *l) */ static inline bool local_sub_and_test(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", "e"); + GEN_BINARY_RMWcc(_ASM_SUB, l->a.counter, "er", i, "%0", e); } /** @@ -65,7 +65,7 @@ static inline bool local_sub_and_test(long i, local_t *l) */ static inline bool local_dec_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", "e"); + GEN_UNARY_RMWcc(_ASM_DEC, l->a.counter, "%0", e); } /** @@ -78,7 +78,7 @@ static inline bool local_dec_and_test(local_t *l) */ static inline bool local_inc_and_test(local_t *l) { - GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", "e"); + GEN_UNARY_RMWcc(_ASM_INC, l->a.counter, "%0", e); } /** @@ -92,7 +92,7 @@ static inline bool local_inc_and_test(local_t *l) */ static inline bool local_add_negative(long i, local_t *l) { - GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", "s"); + GEN_BINARY_RMWcc(_ASM_ADD, l->a.counter, "er", i, "%0", s); } /** diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index d397deb..17f2186 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -81,7 +81,7 @@ static __always_inline void __preempt_count_sub(int val) */ static __always_inline bool __preempt_count_dec_and_test(void) { - GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), "e"); + GEN_UNARY_RMWcc("decl", __preempt_count, __percpu_arg(0), e); } /* diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index a15b73d..e3264c4 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -5,7 +5,7 @@ #define __GEN_RMWcc(fullop, var, cc, ...) 
\ do { \ - asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ + asm_volatile_goto (fullop "; j" #cc " %l[cc_label]" \ : : "m" (var), ## __VA_ARGS__ \ : "memory" : cc_label); \ return 0; \ @@ -24,7 +24,7 @@ cc_label: \ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ bool c; \ - asm volatile (fullop "; set" cc " %1" \ + asm volatile (fullop "; set" #cc " %1" \ : "+m" (var), "=qm" (c) \ : __VA_ARGS__ : "memory"); \ return c; \ -- cgit v0.10.2 From ff3554b409b82d349f71e9d7082648b7b0a1a5bb Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:40 -0700 Subject: x86, asm: define CC_SET() and CC_OUT() macros The CC_SET() and CC_OUT() macros can be used together to take advantage of the new __GCC_ASM_FLAG_OUTPUTS__ feature in gcc 6+ while remaining backwards compatible. CC_SET() generates a SET instruction on older compilers; CC_OUT() makes sure the output is received in the correct variable. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-5-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/include/asm/asm.h b/arch/x86/include/asm/asm.h index f5063b6..7acb51c 100644 --- a/arch/x86/include/asm/asm.h +++ b/arch/x86/include/asm/asm.h @@ -42,6 +42,18 @@ #define _ASM_SI __ASM_REG(si) #define _ASM_DI __ASM_REG(di) +/* + * Macros to generate condition code outputs from inline assembly, + * The output operand must be type "bool". + */ +#ifdef __GCC_ASM_FLAG_OUTPUTS__ +# define CC_SET(c) "\n\t/* output condition code " #c "*/\n" +# define CC_OUT(c) "=@cc" #c +#else +# define CC_SET(c) "\n\tset" #c " %[_cc_" #c "]\n" +# define CC_OUT(c) [_cc_ ## c] "=qm" +#endif + /* Exception table entry */ #ifdef __ASSEMBLY__ # define _ASM_EXTABLE_HANDLE(from, to, handler) \ -- cgit v0.10.2 From ba741e356c49bfce0adcfa851080666870867f6b Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 8 Jun 2016 12:38:41 -0700 Subject: x86, asm: change GEN_*_RMWcc() to use CC_SET()/CC_OUT() Change the GEN_*_RMWcc() macros to use the CC_SET()/CC_OUT() macros defined in <asm/asm.h>, and disable the use of asm goto if __GCC_ASM_FLAG_OUTPUTS__ is enabled. This allows gcc to receive the flags output directly in gcc 6+. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-6-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h index e3264c4..661dd30 100644 --- a/arch/x86/include/asm/rmwcc.h +++ b/arch/x86/include/asm/rmwcc.h @@ -1,7 +1,9 @@ #ifndef _ASM_X86_RMWcc #define _ASM_X86_RMWcc -#ifdef CC_HAVE_ASM_GOTO +#if !defined(__GCC_ASM_FLAG_OUTPUTS__) && defined(CC_HAVE_ASM_GOTO) + +/* Use asm goto */ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ @@ -19,13 +21,15 @@ cc_label: \ #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ __GEN_RMWcc(op " %1, " arg0, var, cc, vcon (val)) -#else /* !CC_HAVE_ASM_GOTO */ +#else /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ + +/* Use flags output or a set instruction */ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ bool c; \ - asm volatile (fullop "; set" #cc " %1" \ - : "+m" (var), "=qm" (c) \ + asm volatile (fullop ";" CC_SET(cc) \ + : "+m" (var), CC_OUT(cc) (c) \ : __VA_ARGS__ : "memory"); \ return c; \ } while (0) @@ -36,6 +40,6 @@ do { \ #define GEN_BINARY_RMWcc(op, var, vcon, val, arg0, cc) \ __GEN_RMWcc(op " %2, " arg0, var, cc, vcon (val)) -#endif /* CC_HAVE_ASM_GOTO */ +#endif /* defined(__GCC_ASM_FLAG_OUTPUTS__) || !defined(CC_HAVE_ASM_GOTO) */ #endif /* _ASM_X86_RMWcc */ -- cgit v0.10.2 From 86b61240d4c233b440cd29daf0baa440daf4a148 Mon Sep 17 00:00:00 2001 From: "H. 
Peter Anvin" Date: Wed, 8 Jun 2016 12:38:42 -0700 Subject: x86, asm: Use CC_SET()/CC_OUT() in <asm/bitops.h> Remove open-coded uses of set instructions to use CC_SET()/CC_OUT() in <asm/bitops.h>. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-7-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index ed8f485..68557f52 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -233,8 +233,8 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long * bool oldbit; asm("bts %2,%1\n\t" - "setc %0" - : "=qm" (oldbit), ADDR + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); return oldbit; } @@ -273,8 +273,8 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long bool oldbit; asm volatile("btr %2,%1\n\t" - "setc %0" - : "=qm" (oldbit), ADDR + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); return oldbit; } @@ -285,8 +285,8 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon bool oldbit; asm volatile("btc %2,%1\n\t" - "setc %0" - : "=qm" (oldbit), ADDR + CC_SET(c) + : CC_OUT(c) (oldbit), ADDR : "Ir" (nr) : "memory"); return oldbit; @@ -316,8 +316,8 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l bool oldbit; asm volatile("bt %2,%1\n\t" - "setc %0" - : "=qm" (oldbit) + CC_SET(c) + : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; -- cgit v0.10.2 From 64be6d36f5674f3424d1901772f76e21874f4954 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:43 -0700 Subject: x86, asm: Use CC_SET()/CC_OUT() in <asm/percpu.h> Remove open-coded uses of set instructions to use CC_SET()/CC_OUT() in <asm/percpu.h>. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-8-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 184d7f3..e02e3f8 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -511,8 +511,9 @@ do { \ #define x86_test_and_clear_bit_percpu(bit, var) \ ({ \ bool old__; \ - asm volatile("btr %2,"__percpu_arg(1)"\n\tsetc %0" \ - : "=qm" (old__), "+m" (var) \ + asm volatile("btr %2,"__percpu_arg(1)"\n\t" \ + CC_SET(c) \ + : CC_OUT(c) (old__), "+m" (var) \ : "dIr" (bit)); \ old__; \ }) @@ -535,8 +536,8 @@ static inline bool x86_this_cpu_variable_test_bit(int nr, bool oldbit; asm volatile("bt "__percpu_arg(2)",%1\n\t" - "setc %0" - : "=qm" (oldbit) + CC_SET(c) + : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); return oldbit; -- cgit v0.10.2 From 35ccfb7114e2f0f454f264c049b03c31f4c6bbc0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:44 -0700 Subject: x86, asm: Use CC_SET()/CC_OUT() in <asm/rwsem.h> Remove open-coded uses of set instructions to use CC_SET()/CC_OUT() in <asm/rwsem.h>. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-9-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index c508770..1e8be26 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -149,10 +149,10 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem) LOCK_PREFIX " cmpxchg %2,%0\n\t" " jnz 1b\n\t" "2:\n\t" - " sete %3\n\t" + CC_SET(e) "# ending __down_write_trylock\n\t" : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1), - "=qm" (result) + CC_OUT(e) (result) : "er" (RWSEM_ACTIVE_WRITE_BIAS) : "memory", "cc"); return result; -- cgit v0.10.2 From 66928b4eb92dfb6d87c204238057b9278b36452b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:45 -0700 Subject: x86, asm, boot: Use CC_SET()/CC_OUT() in arch/x86/boot/boot.h Remove open-coded uses of set instructions to use CC_SET()/CC_OUT() in arch/x86/boot/boot.h. Signed-off-by: H. 
Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-10-git-send-email-hpa@linux.intel.com Reviewed-by: Andy Lutomirski Reviewed-by: Borislav Petkov Acked-by: Peter Zijlstra (Intel) diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 2edb2d5..7c1495f 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -24,6 +24,7 @@ #include #include #include +#include <asm/asm.h> #include "bitops.h" #include "ctype.h" #include "cpuflags.h" @@ -179,15 +180,15 @@ static inline void wrgs32(u32 v, addr_t addr) static inline bool memcmp_fs(const void *s1, addr_t s2, size_t len) { bool diff; - asm volatile("fs; repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + asm volatile("fs; repe; cmpsb" CC_SET(nz) + : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } static inline bool memcmp_gs(const void *s1, addr_t s2, size_t len) { bool diff; - asm volatile("gs; repe; cmpsb; setnz %0" - : "=qm" (diff), "+D" (s1), "+S" (s2), "+c" (len)); + asm volatile("gs; repe; cmpsb" CC_SET(nz) + : CC_OUT(nz) (diff), "+D" (s1), "+S" (s2), "+c" (len)); return diff; } -- cgit v0.10.2 From 3b290398638ee4e57f1fb2e35c02005cba9a737f Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 8 Jun 2016 12:38:46 -0700 Subject: x86, asm: Use CC_SET()/CC_OUT() and static_cpu_has() in archrandom.h Use CC_SET()/CC_OUT() and static_cpu_has(). This produces code good enough to eliminate ad hoc use of alternatives in <asm/archrandom.h>, greatly simplifying the code. While we are at it, make x86_init_rdrand() compile out completely if we don't need it. Signed-off-by: H. Peter Anvin Link: http://lkml.kernel.org/r/1465414726-197858-11-git-send-email-hpa@linux.intel.com v2: fix a conflict between <linux/random.h> and <asm/archrandom.h> discovered by Ingo Molnar. There are a few places in x86-specific code where we need all of <asm/archrandom.h> even when CONFIG_ARCH_RANDOM is disabled, so <linux/random.h> does not suffice. 
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h index ab6f599..5b0579a 100644 --- a/arch/x86/include/asm/archrandom.h +++ b/arch/x86/include/asm/archrandom.h @@ -25,8 +25,6 @@ #include #include -#include -#include #define RDRAND_RETRY_LOOPS 10 @@ -40,97 +38,91 @@ # define RDSEED_LONG RDSEED_INT #endif -#ifdef CONFIG_ARCH_RANDOM +/* Unconditional execution of RDRAND and RDSEED */ -/* Instead of arch_get_random_long() when alternatives haven't run. */ static inline bool rdrand_long(unsigned long *v) { - int ok; - asm volatile("1: " RDRAND_LONG "\n\t" - "jc 2f\n\t" - "decl %0\n\t" - "jnz 1b\n\t" - "2:" - : "=r" (ok), "=a" (*v) - : "0" (RDRAND_RETRY_LOOPS)); - return !!ok; + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { + asm volatile(RDRAND_LONG "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) + return true; + } while (--retry); + return false; +} + +static inline bool rdrand_int(unsigned int *v) +{ + bool ok; + unsigned int retry = RDRAND_RETRY_LOOPS; + do { + asm volatile(RDRAND_INT "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + if (ok) + return true; + } while (--retry); + return false; } -/* A single attempt at RDSEED */ static inline bool rdseed_long(unsigned long *v) { bool ok; asm volatile(RDSEED_LONG "\n\t" - "setc %0" - : "=qm" (ok), "=a" (*v)); + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); return ok; } -#define GET_RANDOM(name, type, rdrand, nop) \ -static inline bool name(type *v) \ -{ \ - int ok; \ - alternative_io("movl $0, %0\n\t" \ - nop, \ - "\n1: " rdrand "\n\t" \ - "jc 2f\n\t" \ - "decl %0\n\t" \ - "jnz 1b\n\t" \ - "2:", \ - X86_FEATURE_RDRAND, \ - ASM_OUTPUT2("=r" (ok), "=a" (*v)), \ - "0" (RDRAND_RETRY_LOOPS)); \ - return !!ok; \ -} - -#define GET_SEED(name, type, rdseed, nop) \ -static inline bool name(type *v) \ -{ \ - bool ok; \ - alternative_io("movb $0, %0\n\t" \ - nop, \ - rdseed "\n\t" \ - "setc %0", \ - X86_FEATURE_RDSEED, \ - ASM_OUTPUT2("=q" (ok), "=a" (*v))); \ - return ok; 
\ +static inline bool rdseed_int(unsigned int *v) +{ + bool ok; + asm volatile(RDSEED_INT "\n\t" + CC_SET(c) + : CC_OUT(c) (ok), "=a" (*v)); + return ok; } -#ifdef CONFIG_X86_64 - -GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); -GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); - -GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP5); -GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); - -#else - -GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); -GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); - -GET_SEED(arch_get_random_seed_long, unsigned long, RDSEED_LONG, ASM_NOP4); -GET_SEED(arch_get_random_seed_int, unsigned int, RDSEED_INT, ASM_NOP4); - -#endif /* CONFIG_X86_64 */ - +/* Conditional execution based on CPU type */ #define arch_has_random() static_cpu_has(X86_FEATURE_RDRAND) #define arch_has_random_seed() static_cpu_has(X86_FEATURE_RDSEED) -#else +/* + * These are the generic interfaces; they must not be declared if the + * stubs in <linux/random.h> are to be invoked, + * i.e. CONFIG_ARCH_RANDOM is not defined. + */ +#ifdef CONFIG_ARCH_RANDOM -static inline bool rdrand_long(unsigned long *v) +static inline bool arch_get_random_long(unsigned long *v) { - return 0; + return arch_has_random() ? rdrand_long(v) : false; } -static inline bool rdseed_long(unsigned long *v) +static inline bool arch_get_random_int(unsigned int *v) { - return 0; + return arch_has_random() ? rdrand_int(v) : false; } -#endif /* CONFIG_ARCH_RANDOM */ +static inline bool arch_get_random_seed_long(unsigned long *v) +{ + return arch_has_random_seed() ? rdseed_long(v) : false; +} + +static inline bool arch_get_random_seed_int(unsigned int *v) +{ + return arch_has_random_seed() ? 
rdseed_int(v) : false; +} extern void x86_init_rdrand(struct cpuinfo_x86 *c); +#else /* !CONFIG_ARCH_RANDOM */ + +static inline void x86_init_rdrand(struct cpuinfo_x86 *c) { } + +#endif /* !CONFIG_ARCH_RANDOM */ + #endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c index f6f50c4..cfa97ff 100644 --- a/arch/x86/kernel/cpu/rdrand.c +++ b/arch/x86/kernel/cpu/rdrand.c @@ -39,9 +39,9 @@ __setup("nordrand", x86_rdrand_setup); */ #define SANITY_CHECK_LOOPS 8 +#ifdef CONFIG_ARCH_RANDOM void x86_init_rdrand(struct cpuinfo_x86 *c) { -#ifdef CONFIG_ARCH_RANDOM unsigned long tmp; int i; @@ -55,5 +55,5 @@ void x86_init_rdrand(struct cpuinfo_x86 *c) return; } } -#endif } +#endif -- cgit v0.10.2 From 99158f10e91768d34c5004c40c42f802b719bcae Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 24 May 2016 15:48:38 -0700 Subject: x86/xen: Simplify set_aliased_prot() A year ago, via the following commit: aa1acff356bb ("x86/xen: Probe target addresses in set_aliased_prot() before the hypercall") I added an explicit probe to work around a hypercall issue. The code can be simplified by using probe_kernel_read(). No change in functionality. Signed-off-by: Andy Lutomirski Reviewed-by: Andrew Cooper Acked-by: David Vrabel Cc: Andy Lutomirski Cc: Boris Ostrovsky Cc: Borislav Petkov Cc: Brian Gerst Cc: David Vrabel Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Jan Beulich Cc: Kees Cook Cc: Konrad Rzeszutek Wilk Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: xen-devel Link: http://lkml.kernel.org/r/0706f1a2538e481194514197298cca6b5e3f2638.1464129798.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 760789a..0f87db2 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -521,9 +521,7 @@ static void set_aliased_prot(void *v, pgprot_t prot) preempt_disable(); - pagefault_disable(); /* Avoid warnings due to being atomic. 
*/ - __get_user(dummy, (unsigned char __user __force *)v); - pagefault_enable(); + probe_kernel_read(&dummy, v, 1); if (HYPERVISOR_update_va_mapping((unsigned long)v, pte, 0)) BUG(); -- cgit v0.10.2 From f0702555b16d31d61dc758fac6efb994c3fe3ec6 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 9 Jun 2016 13:57:04 -0700 Subject: x86/vdso/32: Assemble sigreturn.S separately sigreturn.S was historically included by the various __kernel_vsyscall implementations due to assumptions about all the 32-bit vDSO images having the sigreturn symbols at the same address. Those assumptions were removed in v3.16, and as of v4.4, there is only a single 32-bit vDSO left. Simplify the build process by assembling sigreturn.S into a normal object file. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d7b6dfde3c7397aa26977320da90448363b5a7e9.1465505753.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 253b72e..68b63fd 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -134,7 +134,7 @@ VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-m,elf_i386 -Wl,-soname=linux-gate.so.1 override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds -targets += vdso32/note.o vdso32/vclock_gettime.o vdso32/system_call.o +targets += vdso32/note.o vdso32/system_call.o vdso32/sigreturn.o targets += vdso32/vclock_gettime.o KBUILD_AFLAGS_32 := $(filter-out -m64,$(KBUILD_AFLAGS)) -DBUILD_VDSO @@ -156,7 +156,8 @@ $(obj)/vdso32.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ $(obj)/vdso32/vclock_gettime.o \ $(obj)/vdso32/note.o \ - $(obj)/vdso32/system_call.o + $(obj)/vdso32/system_call.o \ + $(obj)/vdso32/sigreturn.o $(call if_changed,vdso) # diff --git a/arch/x86/entry/vdso/vdso32/sigreturn.S b/arch/x86/entry/vdso/vdso32/sigreturn.S index d7ec4e2..20633e0 
100644 --- a/arch/x86/entry/vdso/vdso32/sigreturn.S +++ b/arch/x86/entry/vdso/vdso32/sigreturn.S @@ -1,11 +1,3 @@ -/* - * Common code for the sigreturn entry points in vDSO images. - * So far this code is the same for both int80 and sysenter versions. - * This file is #include'd by int80.S et al to define them first thing. - * The kernel assumes that the addresses of these routines are constant - * for all vDSO implementations. - */ - #include #include #include diff --git a/arch/x86/entry/vdso/vdso32/system_call.S b/arch/x86/entry/vdso/vdso32/system_call.S index 0109ac6..ed4bc97 100644 --- a/arch/x86/entry/vdso/vdso32/system_call.S +++ b/arch/x86/entry/vdso/vdso32/system_call.S @@ -2,16 +2,11 @@ * AT_SYSINFO entry point */ +#include #include #include #include -/* - * First get the common code for the sigreturn entry points. - * This must come first. - */ -#include "sigreturn.S" - .text .globl __kernel_vsyscall .type __kernel_vsyscall,@function -- cgit v0.10.2 From a4455082dc6f0b5d51a23523f77600e8ede47c79 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 8 Jun 2016 10:25:33 -0700 Subject: x86/signals: Add missing signal_compat code for x86 features The 32-bit siginfo is a different binary format than the 64-bit one. So, when running 32-bit binaries on 64-bit kernels, we have to convert the kernel's 64-bit version to a 32-bit version that userspace can grok. We've added a few features to siginfo over the past few years and neglected to add them to arch/x86/kernel/signal_compat.c: 1. The si_addr_lsb used in SIGBUS's sent for machine checks 2. The upper/lower bounds for MPX SIGSEGV faults 3. The protection key for pkey faults I caught this with some protection keys unit tests and realized it affected a few more features. This was tested only with my protection keys patch that looks for a proper value in si_pkey. I didn't actually test the machine check or MPX code. 
Signed-off-by: Dave Hansen Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/20160608172533.F8F05637@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 5a3b2c1..a188061 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -40,6 +40,7 @@ typedef s32 compat_long_t; typedef s64 __attribute__((aligned(4))) compat_s64; typedef u32 compat_uint_t; typedef u32 compat_ulong_t; +typedef u32 compat_u32; typedef u64 __attribute__((aligned(4))) compat_u64; typedef u32 compat_uptr_t; @@ -181,6 +182,16 @@ typedef struct compat_siginfo { /* SIGILL, SIGFPE, SIGSEGV, SIGBUS */ struct { unsigned int _addr; /* faulting insn/memory ref. */ + short int _addr_lsb; /* Valid LSB of the reported address. 
*/ + union { + /* used when si_code=SEGV_BNDERR */ + struct { + compat_uptr_t _lower; + compat_uptr_t _upper; + } _addr_bnd; + /* used when si_code=SEGV_PKUERR */ + compat_u32 _pkey; + }; } _sigfault; /* SIGPOLL */ diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index dc3c0b1..5335ad9 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -32,6 +32,22 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) &to->_sifields._pad[0]); switch (from->si_code >> 16) { case __SI_FAULT >> 16: + if (from->si_signo == SIGBUS && + (from->si_code == BUS_MCEERR_AR || + from->si_code == BUS_MCEERR_AO)) + put_user_ex(from->si_addr_lsb, &to->si_addr_lsb); + + if (from->si_signo == SIGSEGV) { + if (from->si_code == SEGV_BNDERR) { + /* Copy the bound values from the kernel siginfo, not addresses inside the user buffer. */ + compat_uptr_t lower = (unsigned long)from->si_lower; + compat_uptr_t upper = (unsigned long)from->si_upper; + put_user_ex(lower, &to->si_lower); + put_user_ex(upper, &to->si_upper); + } + if (from->si_code == SEGV_PKUERR) + put_user_ex(from->si_pkey, &to->si_pkey); + } break; case __SI_SYS >> 16: put_user_ex(from->si_syscall, &to->si_syscall); -- cgit v0.10.2 From 02e8fda2cc00419a11cf38199afea4c0d7172be8 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 8 Jun 2016 10:25:34 -0700 Subject: x86/signals: Add build-time checks to the siginfo compat code There were at least 3 features added to the __SI_FAULT area of the siginfo struct that did not make it to the compat siginfo: 1. The si_addr_lsb used in SIGBUS's sent for machine checks 2. The upper/lower bounds for MPX SIGSEGV faults 3. The protection key for pkey faults There was also some turmoil when I was attempting to add the pkey field because it needs to be a fixed size on 32 and 64-bit and not have any alignment constraints. This patch adds some compile-time checks to the compat code to make it harder to screw this up. 
Basically, the checks are supposed to trip any time someone changes the siginfo structure. 
That sounds bad, but it's what we want. If someone changes siginfo, we want them to also be _forced_ to go look at the compat code. The details are in the comments. Signed-off-by: Dave Hansen Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tony Luck Cc: linux-edac@vger.kernel.org Link: http://lkml.kernel.org/r/20160608172534.C73DAFC3@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index 5335ad9..b44564b 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -1,11 +1,104 @@ #include #include +/* + * The compat_siginfo_t structure and handling code is very easy + * to break in several ways. It must always be updated when new + * updates are made to the main siginfo_t, and + * copy_siginfo_to_user32() must be updated when the + * (arch-independent) copy_siginfo_to_user() is updated. + * + * It is also easy to put a new member in the compat_siginfo_t + * which has implicit alignment which can move internal structure + * alignment around breaking the ABI. This can happen if you, + * for instance, put a plain 64-bit value in there. + */ +static inline void signal_compat_build_tests(void) +{ + int _sifields_offset = offsetof(compat_siginfo_t, _sifields); + + /* + * If adding a new si_code, there is probably new data in + * the siginfo. Make sure folks bumping the si_code + * limits also have to look at this code. Make sure any + * new fields are handled in copy_siginfo_to_user32()! 
+ */ + BUILD_BUG_ON(NSIGILL != 8); + BUILD_BUG_ON(NSIGFPE != 8); + BUILD_BUG_ON(NSIGSEGV != 4); + BUILD_BUG_ON(NSIGBUS != 5); + BUILD_BUG_ON(NSIGTRAP != 4); + BUILD_BUG_ON(NSIGCHLD != 6); + BUILD_BUG_ON(NSIGSYS != 1); + + /* This is part of the ABI and can never change in size: */ + BUILD_BUG_ON(sizeof(compat_siginfo_t) != 128); + /* + * The offsets of all the (unioned) si_fields are fixed + * in the ABI, of course. Make sure none of them ever + * move and are always at the beginning: + */ + BUILD_BUG_ON(offsetof(compat_siginfo_t, _sifields) != 3 * sizeof(int)); +#define CHECK_CSI_OFFSET(name) BUILD_BUG_ON(_sifields_offset != offsetof(compat_siginfo_t, _sifields.name)) + + /* + * Ensure that the size of each si_field never changes. + * If it does, it is a sign that the + * copy_siginfo_to_user32() code below needs to updated + * along with the size in the CHECK_SI_SIZE(). + * + * We repeat this check for both the generic and compat + * siginfos. + * + * Note: it is OK for these to grow as long as the whole + * structure stays within the padding size (checked + * above). 
+ */ +#define CHECK_CSI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((compat_siginfo_t *)0)->_sifields.name)) +#define CHECK_SI_SIZE(name, size) BUILD_BUG_ON(size != sizeof(((siginfo_t *)0)->_sifields.name)) + + CHECK_CSI_OFFSET(_kill); + CHECK_CSI_SIZE (_kill, 2*sizeof(int)); + CHECK_SI_SIZE (_kill, 2*sizeof(int)); + + CHECK_CSI_OFFSET(_timer); + CHECK_CSI_SIZE (_timer, 5*sizeof(int)); + CHECK_SI_SIZE (_timer, 6*sizeof(int)); + + CHECK_CSI_OFFSET(_rt); + CHECK_CSI_SIZE (_rt, 3*sizeof(int)); + CHECK_SI_SIZE (_rt, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld); + CHECK_CSI_SIZE (_sigchld, 5*sizeof(int)); + CHECK_SI_SIZE (_sigchld, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigchld_x32); + CHECK_CSI_SIZE (_sigchld_x32, 7*sizeof(int)); + /* no _sigchld_x32 in the generic siginfo_t */ + + CHECK_CSI_OFFSET(_sigfault); + CHECK_CSI_SIZE (_sigfault, 4*sizeof(int)); + CHECK_SI_SIZE (_sigfault, 8*sizeof(int)); + + CHECK_CSI_OFFSET(_sigpoll); + CHECK_CSI_SIZE (_sigpoll, 2*sizeof(int)); + CHECK_SI_SIZE (_sigpoll, 4*sizeof(int)); + + CHECK_CSI_OFFSET(_sigsys); + CHECK_CSI_SIZE (_sigsys, 3*sizeof(int)); + CHECK_SI_SIZE (_sigsys, 4*sizeof(int)); + + /* any new si_fields should be added here */ +} + int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from) { int err = 0; bool ia32 = test_thread_flag(TIF_IA32); + signal_compat_build_tests(); + if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t))) return -EFAULT; -- cgit v0.10.2 From e754aedc26efde6baef2d7824fbecf998a5510a4 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Wed, 8 Jun 2016 10:25:35 -0700 Subject: x86/mpx, selftests: Add MPX self test I've had this code for a while, but never submitted it upstream. Now that Skylake hardware is out in the wild, folks can actually run this for real. It tests the following: 1. The MPX hardware is enabled by the kernel and doing what it is supposed to 2. The MPX management code is present and enabled in the kernel 3. MPX Signal handling 4. 
The MPX bounds table population code (on-demand population) 5. The MPX bounds table unmapping code (kernel-initiated freeing when unused) This has also caught bugs in the XSAVE code because MPX state is saved/restored with XSAVE. I'm submitting it now because it would have caught the recent issues with the compat_siginfo code not being properly augmented when new siginfo state is added. Signed-off-by: Dave Hansen Cc: Al Viro Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160608172535.5B40B0EE@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index c73425d..abe9c35 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -5,7 +5,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \ - check_initial_reg_state sigreturn ldt_gdt iopl + check_initial_reg_state sigreturn ldt_gdt iopl mpx-mini-test TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ vdso_restorer diff --git a/tools/testing/selftests/x86/mpx-debug.h b/tools/testing/selftests/x86/mpx-debug.h new file mode 100644 index 0000000..9230981 --- /dev/null +++ b/tools/testing/selftests/x86/mpx-debug.h @@ -0,0 +1,14 @@ +#ifndef _MPX_DEBUG_H +#define _MPX_DEBUG_H + +#ifndef DEBUG_LEVEL +#define DEBUG_LEVEL 0 +#endif +#define dprintf_level(level, args...) do { if(level <= DEBUG_LEVEL) printf(args); } while(0) +#define dprintf1(args...) dprintf_level(1, args) +#define dprintf2(args...) dprintf_level(2, args) +#define dprintf3(args...) dprintf_level(3, args) +#define dprintf4(args...) dprintf_level(4, args) +#define dprintf5(args...) 
dprintf_level(5, args) + +#endif /* _MPX_DEBUG_H */ diff --git a/tools/testing/selftests/x86/mpx-dig.c b/tools/testing/selftests/x86/mpx-dig.c new file mode 100644 index 0000000..ce85356 --- /dev/null +++ b/tools/testing/selftests/x86/mpx-dig.c @@ -0,0 +1,498 @@ +/* + * Written by Dave Hansen + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "mpx-debug.h" +#include "mpx-mm.h" +#include "mpx-hw.h" + +unsigned long bounds_dir_global; + +#define mpx_dig_abort() __mpx_dig_abort(__FILE__, __func__, __LINE__) +static void inline __mpx_dig_abort(const char *file, const char *func, int line) +{ + fprintf(stderr, "MPX dig abort @ %s::%d in %s()\n", file, line, func); + printf("MPX dig abort @ %s::%d in %s()\n", file, line, func); + abort(); +} + +/* + * run like this (BDIR finds the probably bounds directory): + * + * BDIR="$(cat /proc/$pid/smaps | grep -B1 2097152 \ + * | head -1 | awk -F- '{print $1}')"; + * ./mpx-dig $pid 0x$BDIR + * + * NOTE: + * assumes that the only 2097152-kb VMA is the bounds dir + */ + +long nr_incore(void *ptr, unsigned long size_bytes) +{ + int i; + long ret = 0; + long vec_len = size_bytes / PAGE_SIZE; + unsigned char *vec = malloc(vec_len); + int incore_ret; + + if (!vec) + mpx_dig_abort(); + + incore_ret = mincore(ptr, size_bytes, vec); + if (incore_ret) { + printf("mincore ret: %d\n", incore_ret); + perror("mincore"); + mpx_dig_abort(); + } + for (i = 0; i < vec_len; i++) + ret += vec[i]; + free(vec); + return ret; +} + +int open_proc(int pid, char *file) +{ + static char buf[100]; + int fd; + + snprintf(&buf[0], sizeof(buf), "/proc/%d/%s", pid, file); + fd = open(&buf[0], O_RDONLY); + if (fd < 0) + perror(buf); + + return fd; +} + +struct vaddr_range { + unsigned long start; + unsigned long end; +}; +struct vaddr_range *ranges; +int nr_ranges_allocated; +int nr_ranges_populated; +int last_range = -1; + +int __pid_load_vaddrs(int pid) +{ + int ret = 0; + int 
proc_maps_fd = open_proc(pid, "maps"); + char linebuf[10000]; + unsigned long start; + unsigned long end; + char rest[1000]; + FILE *f = fdopen(proc_maps_fd, "r"); + + if (!f) + mpx_dig_abort(); + nr_ranges_populated = 0; + while (!feof(f)) { + char *readret = fgets(linebuf, sizeof(linebuf), f); + int parsed; + + if (readret == NULL) { + if (feof(f)) + break; + mpx_dig_abort(); + } + + parsed = sscanf(linebuf, "%lx-%lx%s", &start, &end, rest); + if (parsed != 3) + mpx_dig_abort(); + + dprintf4("result[%d]: %lx-%lx<->%s\n", parsed, start, end, rest); + if (nr_ranges_populated >= nr_ranges_allocated) { + ret = -E2BIG; + break; + } + ranges[nr_ranges_populated].start = start; + ranges[nr_ranges_populated].end = end; + nr_ranges_populated++; + } + last_range = -1; + fclose(f); + /* fclose() also closed proc_maps_fd (the stream was created with fdopen()); closing it again would act on a stale descriptor */ + return ret; +} + +int pid_load_vaddrs(int pid) +{ + int ret; + + dprintf2("%s(%d)\n", __func__, pid); + if (!ranges) { + nr_ranges_allocated = 4; + ranges = malloc(nr_ranges_allocated * sizeof(ranges[0])); + dprintf2("%s(%d) allocated %d ranges @ %p\n", __func__, pid, + nr_ranges_allocated, ranges); + assert(ranges != NULL); + } + do { + ret = __pid_load_vaddrs(pid); + if (!ret) + break; + if (ret == -E2BIG) { + dprintf2("%s(%d) need to realloc\n", __func__, pid); + nr_ranges_allocated *= 2; + ranges = realloc(ranges, + nr_ranges_allocated * sizeof(ranges[0])); + dprintf2("%s(%d) allocated %d ranges @ %p\n", __func__, + pid, nr_ranges_allocated, ranges); + assert(ranges != NULL); + dprintf1("reallocating to hold %d ranges\n", nr_ranges_allocated); + } + } while (1); + + dprintf2("%s(%d) done\n", __func__, pid); + + return ret; +} + +static inline int vaddr_in_range(unsigned long vaddr, struct vaddr_range *r) +{ + if (vaddr < r->start) + return 0; + if (vaddr >= r->end) + return 0; + return 1; +} + +static inline int vaddr_mapped_by_range(unsigned long vaddr) +{ + int i; + + /* last_range is -1 when nothing is cached; index 0 is a valid cached range */ + if (last_range >= 0 && vaddr_in_range(vaddr, &ranges[last_range])) + return 1; + + for (i = 0; i <
nr_ranges_populated; i++) { + struct vaddr_range *r = &ranges[i]; + + /* skip ranges that do not contain vaddr; NOTE(review): this makes the "no mapping" aborts in the callers reachable - confirm the selftest still passes */ + if (!vaddr_in_range(vaddr, r)) + continue; + last_range = i; /* remember the hit for the next lookup */ + return 1; + } + return 0; +} + +const int bt_entry_size_bytes = sizeof(unsigned long) * 4; + +void *read_bounds_table_into_buf(unsigned long table_vaddr) +{ +#ifdef MPX_DIG_STANDALONE + static char bt_buf[MPX_BOUNDS_TABLE_SIZE_BYTES]; + off_t seek_ret = lseek(fd, table_vaddr, SEEK_SET); + if (seek_ret != table_vaddr) + mpx_dig_abort(); + + int read_ret = read(fd, &bt_buf, sizeof(bt_buf)); + if (read_ret != sizeof(bt_buf)) + mpx_dig_abort(); + return &bt_buf; +#else + return (void *)table_vaddr; +#endif +} + +int dump_table(unsigned long table_vaddr, unsigned long base_controlled_vaddr, + unsigned long bde_vaddr) +{ + unsigned long offset_inside_bt; + int nr_entries = 0; + int do_abort = 0; + char *bt_buf; + + dprintf3("%s() base_controlled_vaddr: 0x%012lx bde_vaddr: 0x%012lx\n", + __func__, base_controlled_vaddr, bde_vaddr); + + bt_buf = read_bounds_table_into_buf(table_vaddr); + + dprintf4("%s() read done\n", __func__); + + for (offset_inside_bt = 0; + offset_inside_bt < MPX_BOUNDS_TABLE_SIZE_BYTES; + offset_inside_bt += bt_entry_size_bytes) { + unsigned long bt_entry_index; + unsigned long bt_entry_controls; + unsigned long this_bt_entry_for_vaddr; + unsigned long *bt_entry_buf; + int i; + + dprintf4("%s() offset_inside_bt: 0x%lx of 0x%llx\n", __func__, + offset_inside_bt, MPX_BOUNDS_TABLE_SIZE_BYTES); + bt_entry_buf = (void *)&bt_buf[offset_inside_bt]; + if (!bt_buf) { + printf("null bt_buf\n"); + mpx_dig_abort(); + } + if (!bt_entry_buf) { + printf("null bt_entry_buf\n"); + mpx_dig_abort(); + } + dprintf4("%s() reading *bt_entry_buf @ %p\n", __func__, + bt_entry_buf); + if (!bt_entry_buf[0] && + !bt_entry_buf[1] && + !bt_entry_buf[2] && + !bt_entry_buf[3]) + continue; + + nr_entries++; + + bt_entry_index = offset_inside_bt/bt_entry_size_bytes; + bt_entry_controls = sizeof(void *); + this_bt_entry_for_vaddr = + base_controlled_vaddr
+ bt_entry_index*bt_entry_controls; + /* + * We sign extend vaddr bits 48->63 which effectively + * creates a hole in the virtual address space. + * This calculation corrects for the hole. + */ + if (this_bt_entry_for_vaddr > 0x00007fffffffffffUL) + this_bt_entry_for_vaddr |= 0xffff800000000000; + + if (!vaddr_mapped_by_range(this_bt_entry_for_vaddr)) { + printf("bt_entry_buf: %p\n", bt_entry_buf); + printf("there is a bte for %lx but no mapping\n", + this_bt_entry_for_vaddr); + printf(" bde vaddr: %016lx\n", bde_vaddr); + printf("base_controlled_vaddr: %016lx\n", base_controlled_vaddr); + printf(" table_vaddr: %016lx\n", table_vaddr); + printf(" entry vaddr: %016lx @ offset %lx\n", + table_vaddr + offset_inside_bt, offset_inside_bt); + do_abort = 1; + mpx_dig_abort(); + } + if (DEBUG_LEVEL < 4) + continue; + + printf("table entry[%lx]: ", offset_inside_bt); + /* i counts bytes within the entry; convert it to a word index so the dump stays inside this entry */ + for (i = 0; i < bt_entry_size_bytes; i += sizeof(unsigned long)) + printf("0x%016lx ", bt_entry_buf[i / sizeof(unsigned long)]); + printf("\n"); + } + if (do_abort) + mpx_dig_abort(); + dprintf4("%s() done\n", __func__); + return nr_entries; +} + +int search_bd_buf(char *buf, int len_bytes, unsigned long bd_offset_bytes, + int *nr_populated_bdes) +{ + unsigned long i; + int total_entries = 0; + + dprintf3("%s(%p, %x, %lx, ...)
buf end: %p\n", __func__, buf, + len_bytes, bd_offset_bytes, buf + len_bytes); + + for (i = 0; i < len_bytes; i += sizeof(unsigned long)) { + unsigned long bd_index = (bd_offset_bytes + i) / sizeof(unsigned long); + unsigned long *bounds_dir_entry_ptr = (unsigned long *)&buf[i]; + unsigned long bounds_dir_entry; + unsigned long bd_for_vaddr; + unsigned long bt_start; + unsigned long bt_tail; + int nr_entries; + + dprintf4("%s() loop i: %ld bounds_dir_entry_ptr: %p\n", __func__, i, + bounds_dir_entry_ptr); + + bounds_dir_entry = *bounds_dir_entry_ptr; + if (!bounds_dir_entry) { + dprintf4("no bounds dir at index 0x%lx / 0x%lx " + "start at offset:%lx %lx\n", bd_index, bd_index, + bd_offset_bytes, i); + continue; + } + dprintf3("found bounds_dir_entry: 0x%lx @ " + "index 0x%lx buf ptr: %p\n", bounds_dir_entry, i, + &buf[i]); + /* mask off the enable bit: */ + bounds_dir_entry &= ~0x1; + (*nr_populated_bdes)++; + dprintf4("nr_populated_bdes: %p\n", nr_populated_bdes); + dprintf4("*nr_populated_bdes: %d\n", *nr_populated_bdes); + + bt_start = bounds_dir_entry; + bt_tail = bounds_dir_entry + MPX_BOUNDS_TABLE_SIZE_BYTES - 1; + if (!vaddr_mapped_by_range(bt_start)) { + printf("bounds directory 0x%lx points to nowhere\n", + bounds_dir_entry); + mpx_dig_abort(); + } + if (!vaddr_mapped_by_range(bt_tail)) { + printf("bounds directory end 0x%lx points to nowhere\n", + bt_tail); + mpx_dig_abort(); + } + /* + * Each bounds directory entry controls 1MB of virtual address + * space. This variable is the virtual address in the process + * of the beginning of the area controlled by this bounds_dir. 
+ */ + bd_for_vaddr = bd_index * (1UL<<20); + + nr_entries = dump_table(bounds_dir_entry, bd_for_vaddr, + bounds_dir_global+bd_offset_bytes+i); + total_entries += nr_entries; + dprintf5("dir entry[%4ld @ %p]: 0x%lx %6d entries " + "total this buf: %7d bd_for_vaddrs: 0x%lx -> 0x%lx\n", + bd_index, buf+i, + bounds_dir_entry, nr_entries, total_entries, + bd_for_vaddr, bd_for_vaddr + (1UL<<20)); + } + dprintf3("%s(%p, %x, %lx, ...) done\n", __func__, buf, len_bytes, + bd_offset_bytes); + return total_entries; +} + +int proc_pid_mem_fd = -1; + +void *fill_bounds_dir_buf_other(long byte_offset_inside_bounds_dir, + long buffer_size_bytes, void *buffer) +{ + unsigned long seekto = bounds_dir_global + byte_offset_inside_bounds_dir; + int read_ret; + off_t seek_ret = lseek(proc_pid_mem_fd, seekto, SEEK_SET); + + if (seek_ret != seekto) + mpx_dig_abort(); + + read_ret = read(proc_pid_mem_fd, buffer, buffer_size_bytes); + /* there shouldn't practically be short reads of /proc/$pid/mem */ + if (read_ret != buffer_size_bytes) + mpx_dig_abort(); + + return buffer; +} +void *fill_bounds_dir_buf_self(long byte_offset_inside_bounds_dir, + long buffer_size_bytes, void *buffer) + +{ + unsigned char vec[buffer_size_bytes / PAGE_SIZE]; + char *dig_bounds_dir_ptr = + (void *)(bounds_dir_global + byte_offset_inside_bounds_dir); + /* + * use mincore() to quickly find the areas of the bounds directory + * that have memory and thus will be worth scanning. 
+ */ + int incore_ret; + + int incore = 0; + int i; + + dprintf4("%s() dig_bounds_dir_ptr: %p\n", __func__, dig_bounds_dir_ptr); + + incore_ret = mincore(dig_bounds_dir_ptr, buffer_size_bytes, &vec[0]); + if (incore_ret) { + printf("mincore ret: %d\n", incore_ret); + perror("mincore"); + mpx_dig_abort(); + } + for (i = 0; i < sizeof(vec); i++) + incore += vec[i]; + dprintf4("%s() total incore: %d\n", __func__, incore); + if (!incore) + return NULL; + dprintf3("%s() total incore: %d\n", __func__, incore); + return dig_bounds_dir_ptr; +} + +int inspect_pid(int pid) +{ + static int dig_nr; + long offset_inside_bounds_dir; + char bounds_dir_buf[sizeof(unsigned long) * (1UL << 15)]; + char *dig_bounds_dir_ptr; + int total_entries = 0; + int nr_populated_bdes = 0; + int inspect_self; + + if (getpid() == pid) { + dprintf4("inspecting self\n"); + inspect_self = 1; + } else { + dprintf4("inspecting pid %d\n", pid); + mpx_dig_abort(); + } + + for (offset_inside_bounds_dir = 0; + offset_inside_bounds_dir < MPX_BOUNDS_TABLE_SIZE_BYTES; + offset_inside_bounds_dir += sizeof(bounds_dir_buf)) { + static int bufs_skipped; + int this_entries; + + if (inspect_self) { + dig_bounds_dir_ptr = + fill_bounds_dir_buf_self(offset_inside_bounds_dir, + sizeof(bounds_dir_buf), + &bounds_dir_buf[0]); + } else { + dig_bounds_dir_ptr = + fill_bounds_dir_buf_other(offset_inside_bounds_dir, + sizeof(bounds_dir_buf), + &bounds_dir_buf[0]); + } + if (!dig_bounds_dir_ptr) { + bufs_skipped++; + continue; + } + this_entries = search_bd_buf(dig_bounds_dir_ptr, + sizeof(bounds_dir_buf), + offset_inside_bounds_dir, + &nr_populated_bdes); + total_entries += this_entries; + } + printf("mpx dig (%3d) complete, SUCCESS (%8d / %4d)\n", ++dig_nr, + total_entries, nr_populated_bdes); + return total_entries + nr_populated_bdes; +} + +#ifdef MPX_DIG_REMOTE +int main(int argc, char **argv) +{ + int err; + char *c; + unsigned long bounds_dir_entry; + int pid; + + printf("mpx-dig starting...\n"); + err = 
sscanf(argv[1], "%d", &pid); + printf("parsing: '%s', err: %d\n", argv[1], err); + if (err != 1) + mpx_dig_abort(); + + err = sscanf(argv[2], "%lx", &bounds_dir_global); + printf("parsing: '%s': %d\n", argv[2], err); + if (err != 1) + mpx_dig_abort(); + + proc_pid_mem_fd = open_proc(pid, "mem"); + if (proc_pid_mem_fd < 0) + mpx_dig_abort(); + + inspect_pid(pid); + return 0; +} +#endif + +long inspect_me(struct mpx_bounds_dir *bounds_dir) +{ + int pid = getpid(); + + pid_load_vaddrs(pid); + bounds_dir_global = (unsigned long)bounds_dir; + dprintf4("enter %s() bounds dir: %p\n", __func__, bounds_dir); + return inspect_pid(pid); +} diff --git a/tools/testing/selftests/x86/mpx-hw.h b/tools/testing/selftests/x86/mpx-hw.h new file mode 100644 index 0000000..093c190 --- /dev/null +++ b/tools/testing/selftests/x86/mpx-hw.h @@ -0,0 +1,123 @@ +#ifndef _MPX_HW_H +#define _MPX_HW_H + +#include + +/* Describe the MPX Hardware Layout in here */ + +#define NR_MPX_BOUNDS_REGISTERS 4 + +#ifdef __i386__ + +#define MPX_BOUNDS_TABLE_ENTRY_SIZE_BYTES 16 /* 4 * 32-bits */ +#define MPX_BOUNDS_TABLE_SIZE_BYTES (1ULL << 14) /* 16k */ +#define MPX_BOUNDS_DIR_ENTRY_SIZE_BYTES 4 +#define MPX_BOUNDS_DIR_SIZE_BYTES (1ULL << 22) /* 4MB */ + +#define MPX_BOUNDS_TABLE_BOTTOM_BIT 2 +#define MPX_BOUNDS_TABLE_TOP_BIT 11 +#define MPX_BOUNDS_DIR_BOTTOM_BIT 12 +#define MPX_BOUNDS_DIR_TOP_BIT 31 + +#else + +/* + * Linear Address of "pointer" (LAp) + * 0 -> 2: ignored + * 3 -> 19: index in to bounds table + * 20 -> 47: index in to bounds directory + * 48 -> 63: ignored + */ + +#define MPX_BOUNDS_TABLE_ENTRY_SIZE_BYTES 32 +#define MPX_BOUNDS_TABLE_SIZE_BYTES (1ULL << 22) /* 4MB */ +#define MPX_BOUNDS_DIR_ENTRY_SIZE_BYTES 8 +#define MPX_BOUNDS_DIR_SIZE_BYTES (1ULL << 31) /* 2GB */ + +#define MPX_BOUNDS_TABLE_BOTTOM_BIT 3 +#define MPX_BOUNDS_TABLE_TOP_BIT 19 +#define MPX_BOUNDS_DIR_BOTTOM_BIT 20 +#define MPX_BOUNDS_DIR_TOP_BIT 47 + +#endif + +#define MPX_BOUNDS_DIR_NR_ENTRIES \ + 
(MPX_BOUNDS_DIR_SIZE_BYTES/MPX_BOUNDS_DIR_ENTRY_SIZE_BYTES) +#define MPX_BOUNDS_TABLE_NR_ENTRIES \ + (MPX_BOUNDS_TABLE_SIZE_BYTES/MPX_BOUNDS_TABLE_ENTRY_SIZE_BYTES) + +#define MPX_BOUNDS_TABLE_ENTRY_VALID_BIT 0x1 + +struct mpx_bd_entry { + union { + char x[MPX_BOUNDS_DIR_ENTRY_SIZE_BYTES]; + void *contents[1]; + }; +} __attribute__((packed)); + +struct mpx_bt_entry { + union { + char x[MPX_BOUNDS_TABLE_ENTRY_SIZE_BYTES]; + unsigned long contents[1]; + }; +} __attribute__((packed)); + +struct mpx_bounds_dir { + struct mpx_bd_entry entries[MPX_BOUNDS_DIR_NR_ENTRIES]; +} __attribute__((packed)); + +struct mpx_bounds_table { + struct mpx_bt_entry entries[MPX_BOUNDS_TABLE_NR_ENTRIES]; +} __attribute__((packed)); + +/* Extract bits bottombit..topbit of val; both bit positions are inclusive, so the field is topbit - bottombit + 1 bits wide */ +static inline unsigned long GET_BITS(unsigned long val, int bottombit, int topbit) +{ + int total_nr_bits = topbit - bottombit + 1; + unsigned long mask = (1UL << total_nr_bits)-1; + return (val >> bottombit) & mask; +} + +static inline unsigned long __vaddr_bounds_table_index(void *vaddr) +{ + return GET_BITS((unsigned long)vaddr, MPX_BOUNDS_TABLE_BOTTOM_BIT, + MPX_BOUNDS_TABLE_TOP_BIT); +} + +static inline unsigned long __vaddr_bounds_directory_index(void *vaddr) +{ + return GET_BITS((unsigned long)vaddr, MPX_BOUNDS_DIR_BOTTOM_BIT, + MPX_BOUNDS_DIR_TOP_BIT); +} + +static inline struct mpx_bd_entry *mpx_vaddr_to_bd_entry(void *vaddr, + struct mpx_bounds_dir *bounds_dir) +{ + unsigned long index = __vaddr_bounds_directory_index(vaddr); + return &bounds_dir->entries[index]; +} + +static inline int bd_entry_valid(struct mpx_bd_entry *bounds_dir_entry) +{ + /* read the value stored in the entry; the bare array name would yield the entry's own address */ + unsigned long __bd_entry = (unsigned long)bounds_dir_entry->contents[0]; + return (__bd_entry & MPX_BOUNDS_TABLE_ENTRY_VALID_BIT); +} + +static inline struct mpx_bounds_table * +__bd_entry_to_bounds_table(struct mpx_bd_entry *bounds_dir_entry) +{ + /* read the value stored in the entry, not the address of the entry */ + unsigned long __bd_entry = (unsigned long)bounds_dir_entry->contents[0]; + assert(__bd_entry & MPX_BOUNDS_TABLE_ENTRY_VALID_BIT); + __bd_entry &=
~MPX_BOUNDS_TABLE_ENTRY_VALID_BIT; + return (struct mpx_bounds_table *)__bd_entry; +} + +static inline struct mpx_bt_entry * +mpx_vaddr_to_bt_entry(void *vaddr, struct mpx_bounds_dir *bounds_dir) +{ + struct mpx_bd_entry *bde = mpx_vaddr_to_bd_entry(vaddr, bounds_dir); + struct mpx_bounds_table *bt = __bd_entry_to_bounds_table(bde); + unsigned long index = __vaddr_bounds_table_index(vaddr); + return &bt->entries[index]; +} + +#endif /* _MPX_HW_H */ diff --git a/tools/testing/selftests/x86/mpx-mini-test.c b/tools/testing/selftests/x86/mpx-mini-test.c new file mode 100644 index 0000000..616ee96 --- /dev/null +++ b/tools/testing/selftests/x86/mpx-mini-test.c @@ -0,0 +1,1585 @@ +/* + * mpx-mini-test.c: routines to test Intel MPX (Memory Protection eXtentions) + * + * Written by: + * "Ren, Qiaowei" + * "Wei, Gang" + * "Hansen, Dave" + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2. + */ + +/* + * 2014-12-05: Dave Hansen: fixed all of the compiler warnings, and made sure + * it works on 32-bit. 
+ */ + +int inspect_every_this_many_mallocs = 100; +int zap_all_every_this_many_mallocs = 1000; + +#define _GNU_SOURCE +#define _LARGEFILE64_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "mpx-hw.h" +#include "mpx-debug.h" +#include "mpx-mm.h" + +#ifndef __always_inline /* fallback, used only when no included header already defines it */ +#define __always_inline inline __attribute__((always_inline)) +#endif + +#ifndef TEST_DURATION_SECS +#define TEST_DURATION_SECS 3 +#endif + +void write_int_to(char *prefix, char *file, int int_to_write) +{ + char buf[100]; + int fd = open(file, O_RDWR); + int len; + int ret; + + assert(fd >= 0); + len = snprintf(buf, sizeof(buf), "%s%d", prefix, int_to_write); + assert(len >= 0); + assert(len < sizeof(buf)); + ret = write(fd, buf, len); + assert(ret == len); + ret = close(fd); + assert(!ret); +} + +void write_pid_to(char *prefix, char *file) +{ + write_int_to(prefix, file, getpid()); +} + +void trace_me(void) +{ +/* tracing events dir */ +#define TED "/sys/kernel/debug/tracing/events/" +/* + write_pid_to("common_pid=", TED "signal/filter"); + write_pid_to("common_pid=", TED "exceptions/filter"); + write_int_to("", TED "signal/enable", 1); + write_int_to("", TED "exceptions/enable", 1); +*/ + write_pid_to("", "/sys/kernel/debug/tracing/set_ftrace_pid"); + write_int_to("", "/sys/kernel/debug/tracing/trace", 0); +} + +#define test_failed() __test_failed(__FILE__, __LINE__) +static void __test_failed(char *f, int l) +{ + fprintf(stderr, "abort @ %s::%d\n", f, l); + abort(); +} + +/* Error Printf */ +#define eprintf(args...)
fprintf(stderr, args) + +#ifdef __i386__ + +/* i386 directory size is 4MB */ +#define REG_IP_IDX REG_EIP +#define REX_PREFIX + +#define XSAVE_OFFSET_IN_FPMEM sizeof(struct _libc_fpstate) + +/* + * __cpuid() is from the Linux Kernel: + */ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile( + "push %%ebx;" + "cpuid;" + "mov %%ebx, %1;" + "pop %%ebx" + : "=a" (*eax), + "=g" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +#else /* __i386__ */ + +#define REG_IP_IDX REG_RIP +#define REX_PREFIX "0x48, " + +#define XSAVE_OFFSET_IN_FPMEM 0 + +/* + * __cpuid() is from the Linux Kernel: + */ +static inline void __cpuid(unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + /* ecx is often an input as well as an output. */ + asm volatile( + "cpuid;" + : "=a" (*eax), + "=b" (*ebx), + "=c" (*ecx), + "=d" (*edx) + : "0" (*eax), "2" (*ecx)); +} + +#endif /* !__i386__ */ + +struct xsave_hdr_struct { + uint64_t xstate_bv; + uint64_t reserved1[2]; + uint64_t reserved2[5]; +} __attribute__((packed)); + +struct bndregs_struct { + uint64_t bndregs[8]; +} __attribute__((packed)); + +struct bndcsr_struct { + uint64_t cfg_reg_u; + uint64_t status_reg; +} __attribute__((packed)); + +struct xsave_struct { + uint8_t fpu_sse[512]; + struct xsave_hdr_struct xsave_hdr; + uint8_t ymm[256]; + uint8_t lwp[128]; + struct bndregs_struct bndregs; + struct bndcsr_struct bndcsr; +} __attribute__((packed)); + +uint8_t __attribute__((__aligned__(64))) buffer[4096]; +struct xsave_struct *xsave_buf = (struct xsave_struct *)buffer; + +uint8_t __attribute__((__aligned__(64))) test_buffer[4096]; +struct xsave_struct *xsave_test_buf = (struct xsave_struct *)test_buffer; + +uint64_t num_bnd_chk; + +static __always_inline void xrstor_state(struct xsave_struct *fx, uint64_t mask) +{ + uint32_t lmask = mask; + uint32_t hmask = mask >> 32; + 
+ asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x2f\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); +} + +static __always_inline void xsave_state_1(void *_fx, uint64_t mask) +{ + uint32_t lmask = mask; + uint32_t hmask = mask >> 32; + unsigned char *fx = _fx; + + asm volatile(".byte " REX_PREFIX "0x0f,0xae,0x27\n\t" + : : "D" (fx), "m" (*fx), "a" (lmask), "d" (hmask) + : "memory"); +} + +static inline uint64_t xgetbv(uint32_t index) +{ + uint32_t eax, edx; + + asm volatile(".byte 0x0f,0x01,0xd0" /* xgetbv */ + : "=a" (eax), "=d" (edx) + : "c" (index)); + return eax + ((uint64_t)edx << 32); +} + +static uint64_t read_mpx_status_sig(ucontext_t *uctxt) +{ + memset(buffer, 0, sizeof(buffer)); + memcpy(buffer, + (uint8_t *)uctxt->uc_mcontext.fpregs + XSAVE_OFFSET_IN_FPMEM, + sizeof(struct xsave_struct)); + + return xsave_buf->bndcsr.status_reg; +} + +#include + +static uint8_t *get_next_inst_ip(uint8_t *addr) +{ + uint8_t *ip = addr; + uint8_t sib; + uint8_t rm; + uint8_t mod; + uint8_t base; + uint8_t modrm; + + /* determine the prefix. */ + switch(*ip) { + case 0xf2: + case 0xf3: + case 0x66: + ip++; + break; + } + + /* look for rex prefix */ + if ((*ip & 0x40) == 0x40) + ip++; + + /* Make sure we have a MPX instruction. */ + if (*ip++ != 0x0f) + return addr; + + /* Skip the op code byte. */ + ip++; + + /* Get the modrm byte. */ + modrm = *ip++; + + /* Break it down into parts. */ + rm = modrm & 7; + mod = (modrm >> 6); + + /* Init the parts of the address mode. */ + base = 8; + + /* Is it a mem mode? 
*/ + if (mod != 3) { + /* look for scaled indexed addressing */ + if (rm == 4) { + /* SIB addressing */ + sib = *ip++; + base = sib & 7; + switch (mod) { + case 0: + if (base == 5) + ip += 4; + break; + + case 1: + ip++; + break; + + case 2: + ip += 4; + break; + } + + } else { + /* MODRM addressing */ + switch (mod) { + case 0: + /* DISP32 addressing, no base */ + if (rm == 5) + ip += 4; + break; + + case 1: + ip++; + break; + + case 2: + ip += 4; + break; + } + } + } + return ip; +} + +#ifdef si_lower +static inline void *__si_bounds_lower(siginfo_t *si) +{ + return si->si_lower; +} + +static inline void *__si_bounds_upper(siginfo_t *si) +{ + return si->si_upper; +} +#else +static inline void **__si_bounds_hack(siginfo_t *si) +{ + void *sigfault = &si->_sifields._sigfault; + void *end_sigfault = sigfault + sizeof(si->_sifields._sigfault); + void **__si_lower = end_sigfault; + + return __si_lower; +} + +static inline void *__si_bounds_lower(siginfo_t *si) +{ + return *__si_bounds_hack(si); +} + +static inline void *__si_bounds_upper(siginfo_t *si) +{ + /* the upper bound is read from the pointer-sized slot following the lower bound; adding sizeof(void *) to the lower bound's value would not read it */ + return *(__si_bounds_hack(si) + 1); +} +#endif + +static int br_count; +static int expected_bnd_index = -1; +uint64_t shadow_plb[NR_MPX_BOUNDS_REGISTERS][2]; /* shadow MPX bound registers */ +unsigned long shadow_map[NR_MPX_BOUNDS_REGISTERS]; + +/* + * The kernel is supposed to provide some information about the bounds + * exception in the siginfo. It should match what we have in the bounds + * registers that we are checking against. Just check against the shadow copy + * since it is easily available, and we also check that *it* matches the real + * registers.
+ */ +void check_siginfo_vs_shadow(siginfo_t* si) +{ + int siginfo_ok = 1; + void *shadow_lower; + void *shadow_upper; + + if ((expected_bnd_index < 0) || + (expected_bnd_index >= NR_MPX_BOUNDS_REGISTERS)) { + fprintf(stderr, "ERROR: invalid expected_bnd_index: %d\n", + expected_bnd_index); + exit(6); + } + /* index shadow_plb[] only after the index is known to be in range (it starts out as -1) */ + shadow_lower = (void *)(unsigned long)shadow_plb[expected_bnd_index][0]; + shadow_upper = (void *)(unsigned long)shadow_plb[expected_bnd_index][1]; + if (__si_bounds_lower(si) != shadow_lower) + siginfo_ok = 0; + if (__si_bounds_upper(si) != shadow_upper) + siginfo_ok = 0; + + if (!siginfo_ok) { + fprintf(stderr, "ERROR: siginfo bounds do not match " + "shadow bounds for register %d\n", expected_bnd_index); + exit(7); + } +} + +void handler(int signum, siginfo_t *si, void *vucontext) +{ + int i; + ucontext_t *uctxt = vucontext; + int trapno; + unsigned long ip; + + dprintf1("entered signal handler\n"); + + trapno = uctxt->uc_mcontext.gregs[REG_TRAPNO]; + ip = uctxt->uc_mcontext.gregs[REG_IP_IDX]; + + if (trapno == 5) { + typeof(si->si_addr) *si_addr_ptr = &si->si_addr; + uint64_t status = read_mpx_status_sig(uctxt); + uint64_t br_reason = status & 0x3; + + br_count++; + dprintf1("#BR 0x%jx (total seen: %d)\n", status, br_count); + +#define __SI_FAULT (3 << 16) +#define SEGV_BNDERR (__SI_FAULT|3) /* failed address bound checks */ + + dprintf2("Saw a #BR!
status 0x%jx at %016lx br_reason: %jx\n", + status, ip, br_reason); + dprintf2("si_signo: %d\n", si->si_signo); + dprintf2(" signum: %d\n", signum); + dprintf2("info->si_code == SEGV_BNDERR: %d\n", + (si->si_code == SEGV_BNDERR)); + dprintf2("info->si_code: %d\n", si->si_code); + dprintf2("info->si_lower: %p\n", __si_bounds_lower(si)); + dprintf2("info->si_upper: %p\n", __si_bounds_upper(si)); + + check_siginfo_vs_shadow(si); + + for (i = 0; i < 8; i++) + dprintf3("[%d]: %p\n", i, si_addr_ptr[i]); + switch (br_reason) { + case 0: /* traditional BR */ + fprintf(stderr, + "Undefined status with bound exception:%jx\n", + status); + exit(5); + case 1: /* #BR MPX bounds exception */ + /* these are normal and we expect to see them */ + dprintf1("bounds exception (normal): status 0x%jx at %p si_addr: %p\n", + status, (void *)ip, si->si_addr); + num_bnd_chk++; + uctxt->uc_mcontext.gregs[REG_IP_IDX] = + (greg_t)get_next_inst_ip((uint8_t *)ip); + break; + case 2: + fprintf(stderr, "#BR status == 2, missing bounds table," + "kernel should have handled!!\n"); + exit(4); + break; + default: + fprintf(stderr, "bound check error: status 0x%jx at %p\n", + status, (void *)ip); + num_bnd_chk++; + uctxt->uc_mcontext.gregs[REG_IP_IDX] = + (greg_t)get_next_inst_ip((uint8_t *)ip); + fprintf(stderr, "bound check error: si_addr %p\n", si->si_addr); + exit(3); + } + } else if (trapno == 14) { + eprintf("ERROR: In signal handler, page fault, trapno = %d, ip = %016lx\n", + trapno, ip); + eprintf("si_addr %p\n", si->si_addr); + eprintf("REG_ERR: %lx\n", (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]); + test_failed(); + } else { + eprintf("unexpected trap %d! 
at 0x%lx\n", trapno, ip); + eprintf("si_addr %p\n", si->si_addr); + eprintf("REG_ERR: %lx\n", (unsigned long)uctxt->uc_mcontext.gregs[REG_ERR]); + test_failed(); + } +} + +static inline void cpuid_count(unsigned int op, int count, + unsigned int *eax, unsigned int *ebx, + unsigned int *ecx, unsigned int *edx) +{ + *eax = op; + *ecx = count; + __cpuid(eax, ebx, ecx, edx); +} + +#define XSTATE_CPUID 0x0000000d + +/* + * List of XSAVE features Linux knows about: + */ +enum xfeature_bit { + XSTATE_BIT_FP, + XSTATE_BIT_SSE, + XSTATE_BIT_YMM, + XSTATE_BIT_BNDREGS, + XSTATE_BIT_BNDCSR, + XSTATE_BIT_OPMASK, + XSTATE_BIT_ZMM_Hi256, + XSTATE_BIT_Hi16_ZMM, + + XFEATURES_NR_MAX, +}; + +#define XSTATE_FP (1 << XSTATE_BIT_FP) +#define XSTATE_SSE (1 << XSTATE_BIT_SSE) +#define XSTATE_YMM (1 << XSTATE_BIT_YMM) +#define XSTATE_BNDREGS (1 << XSTATE_BIT_BNDREGS) +#define XSTATE_BNDCSR (1 << XSTATE_BIT_BNDCSR) +#define XSTATE_OPMASK (1 << XSTATE_BIT_OPMASK) +#define XSTATE_ZMM_Hi256 (1 << XSTATE_BIT_ZMM_Hi256) +#define XSTATE_Hi16_ZMM (1 << XSTATE_BIT_Hi16_ZMM) + +#define MPX_XSTATES (XSTATE_BNDREGS | XSTATE_BNDCSR) /* 0x18 */ + +bool one_bit(unsigned int x, int bit) +{ + return !!(x & (1<xsave_hdr.xstate_bv = 0x10; + xsave_buf->bndcsr.cfg_reg_u = (unsigned long)l1base | 1; + xsave_buf->bndcsr.status_reg = 0; + + dprintf2("bf xrstor\n"); + dprintf2("xsave cndcsr: status %jx, configu %jx\n", + xsave_buf->bndcsr.status_reg, xsave_buf->bndcsr.cfg_reg_u); + xrstor_state(xsave_buf, 0x18); + dprintf2("after xrstor\n"); + + xsave_state_1(xsave_buf, 0x18); + + dprintf1("xsave bndcsr: status %jx, configu %jx\n", + xsave_buf->bndcsr.status_reg, xsave_buf->bndcsr.cfg_reg_u); +} + +#include + +struct mpx_bounds_dir *bounds_dir_ptr; + +unsigned long __bd_incore(const char *func, int line) +{ + unsigned long ret = nr_incore(bounds_dir_ptr, MPX_BOUNDS_DIR_SIZE_BYTES); + return ret; +} +#define bd_incore() __bd_incore(__func__, __LINE__) + +void check_clear(void *ptr, unsigned long sz) +{ + unsigned 
long *i; + + for (i = ptr; (void *)i < ptr + sz; i++) { + if (*i) { + dprintf1("%p is NOT clear at %p\n", ptr, i); + assert(0); + } + } + dprintf1("%p is clear for %lx\n", ptr, sz); +} + +void check_clear_bd(void) +{ + check_clear(bounds_dir_ptr, 2UL << 30); +} + +#define USE_MALLOC_FOR_BOUNDS_DIR 1 +bool process_specific_init(void) +{ + unsigned long size; + unsigned long *dir; + /* Guarantee we have the space to align it, add padding: */ + unsigned long pad = getpagesize(); + + size = 2UL << 30; /* 2GB */ + if (sizeof(unsigned long) == 4) + size = 4UL << 20; /* 4MB */ + dprintf1("trying to allocate %ld MB bounds directory\n", (size >> 20)); + + if (USE_MALLOC_FOR_BOUNDS_DIR) { + unsigned long _dir; + + dir = malloc(size + pad); + assert(dir); + _dir = (unsigned long)dir; + _dir += 0xfffUL; + _dir &= ~0xfffUL; + dir = (void *)_dir; + } else { + /* + * This makes debugging easier because the address + * calculations are simpler: + */ + dir = mmap((void *)0x200000000000, size + pad, + PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (dir == (void *)-1) { + perror("unable to allocate bounds directory"); + abort(); + } + check_clear(dir, size); + } + bounds_dir_ptr = (void *)dir; + madvise(bounds_dir_ptr, size, MADV_NOHUGEPAGE); + bd_incore(); + dprintf1("bounds directory: 0x%p -> 0x%p\n", bounds_dir_ptr, + (char *)bounds_dir_ptr + size); + check_clear(dir, size); + enable_mpx(dir); + check_clear(dir, size); + if (prctl(43, 0, 0, 0, 0)) { + printf("no MPX support\n"); + abort(); + return false; + } + return true; +} + +bool process_specific_finish(void) +{ + if (prctl(44)) { + printf("no MPX support\n"); + return false; + } + return true; +} + +void setup_handler() +{ + int r, rs; + struct sigaction newact; + struct sigaction oldact; + + /* #BR is mapped to sigsegv */ + int signum = SIGSEGV; + + newact.sa_handler = 0; /* void(*)(int)*/ + newact.sa_sigaction = handler; /* void (*)(int, siginfo_t*, void *) */ + + /*sigset_t - signals to block while in the 
/*
 * Execute BNDMK so that bnd0 is loaded with bounds built from 'ptr' and
 * 'size'.
 *
 * The instruction is emitted as raw opcode bytes.  Per the disassembly
 * noted below, its memory operand is (%rcx,%rdx,1), which is why 'ptr' is
 * bound to the c register ("c" constraint) and 'size - 1' to the d
 * register ("d" constraint).  The software model that pairs with this
 * helper, mkbnd_shadow(), records the result as lower = ptr and
 * upper = ptr + size - 1.
 */
static __always_inline void mpx_make_bound_helper(unsigned long ptr,
		unsigned long size)
{
	/* F3 0F 1B /r BNDMK bnd, m64 */
	/* f3 0f 1b 04 11 bndmk (%rcx,%rdx,1),%bnd0 */
	asm volatile(".byte 0xf3,0x0f,0x1b,0x04,0x11\n\t"
		     : : "c" (ptr), "d" (size-1)
		     : "memory");
}
/*
 * Source of pseudo-random numbers for the tests.
 *
 * With NOT_SO_RANDOM defined, each call advances a static counter by a
 * fixed step and returns it, giving the same sequence on every run
 * (presumably to make failures reproducible).  Otherwise the value comes
 * from random().
 *
 * 'line' is the caller's line number as passed by the mpx_random() macro;
 * it is not used by the current implementation.
 */
long int __mpx_random(int line)
{
#ifdef NOT_SO_RANDOM
	static long fake = 722122311;

	fake += 563792075;
	/* the original returned the misspelled name "fakse" and did not build */
	return fake;
#else
	return random();
#endif
}
long)ptr + offset - 1; +} + +void check_lowerbound_shadow(uint8_t *ptr, int index) +{ + uint64_t *lower = (uint64_t *)&(shadow_plb[index][0]); + if (*lower > (uint64_t)(unsigned long)ptr) + num_lower_brs++; + else + dprintf1("LowerBoundChk passed:%p\n", ptr); +} + +void check_upperbound_shadow(uint8_t *ptr, int index) +{ + uint64_t upper = *(uint64_t *)&(shadow_plb[index][1]); + if (upper < (uint64_t)(unsigned long)ptr) + num_upper_brs++; + else + dprintf1("UpperBoundChk passed:%p\n", ptr); +} + +__always_inline void movbndreg_shadow(int src, int dest) +{ + shadow_plb[dest][0] = shadow_plb[src][0]; + shadow_plb[dest][1] = shadow_plb[src][1]; +} + +__always_inline void movbnd2mem_shadow(int src, unsigned long *dest) +{ + unsigned long *lower = (unsigned long *)&(shadow_plb[src][0]); + unsigned long *upper = (unsigned long *)&(shadow_plb[src][1]); + *dest = *lower; + *(dest+1) = *upper; +} + +__always_inline void movbnd_from_mem_shadow(unsigned long *src, int dest) +{ + unsigned long *lower = (unsigned long *)&(shadow_plb[dest][0]); + unsigned long *upper = (unsigned long *)&(shadow_plb[dest][1]); + *lower = *src; + *upper = *(src+1); +} + +__always_inline void stdsc_shadow(int index, uint8_t *ptr, uint8_t *ptr_val) +{ + shadow_map[0] = (unsigned long)shadow_plb[index][0]; + shadow_map[1] = (unsigned long)shadow_plb[index][1]; + shadow_map[2] = (unsigned long)ptr_val; + dprintf3("%s(%d, %p, %p) set shadow map[2]: %p\n", __func__, + index, ptr, ptr_val, ptr_val); + /*ptr ignored */ +} + +void lddsc_shadow(int index, uint8_t *ptr, uint8_t *ptr_val) +{ + uint64_t lower = shadow_map[0]; + uint64_t upper = shadow_map[1]; + uint8_t *value = (uint8_t *)shadow_map[2]; + + if (value != ptr_val) { + dprintf2("%s(%d, %p, %p) init shadow bounds[%d] " + "because %p != %p\n", __func__, index, ptr, + ptr_val, index, value, ptr_val); + shadow_plb[index][0] = 0; + shadow_plb[index][1] = ~(unsigned long)0; + } else { + shadow_plb[index][0] = lower; + shadow_plb[index][1] = upper; + } 
/*
 * Test step 1: run a lower-bound and an upper-bound check against bnd0,
 * using the address one byte below 'ptr' and the address 0x1800 bytes
 * above it.
 *
 * expected_bnd_index is set to 0 for the duration of the two checks so
 * that check_siginfo_vs_shadow() knows which shadow register to compare a
 * reported fault against, and is put back to -1 afterwards.
 *
 * 'buf' is not used here; run_helper() calls every mpx_test_helperN with
 * the same (buf, ptr) argument pair.
 */
static __always_inline void mpx_test_helper1(uint8_t *buf, uint8_t *ptr)
{
	/* these are hard-coded to check bnd0 */
	expected_bnd_index = 0;
	mpx_check_lowerbound_helper((unsigned long)(ptr-1));
	mpx_check_upperbound_helper((unsigned long)(ptr+0x1800));
	/* reset this since we do not expect any more bounds exceptions */
	expected_bnd_index = -1;
}
/*
 * For compatibility reasons, MPX will clear the bounds registers
 * when you make function calls (among other things).  We have to
 * preserve the registers in between calls to the "helpers" since
 * they build on each other.
 *
 * Be very careful not to make any function calls inside the
 * helpers, or anywhere else between the xrstor and xsave.
 *
 * run_helper() restores state from xsave_test_buf, runs hardware helper
 * number 'helper_nr', saves the state back into xsave_test_buf and only
 * then runs the matching "_shadow" helper.  It expands a variable named
 * 'flags' from the invoking scope, so it can only be used where such a
 * variable is visible.
 */
#define run_helper(helper_nr, buf, buf_shadow, ptr) do {	\
	xrstor_state(xsave_test_buf, flags);			\
	mpx_test_helper##helper_nr(buf, ptr);			\
	xsave_state(xsave_test_buf, flags);			\
	mpx_test_helper##helper_nr##_shadow(buf_shadow, ptr);	\
} while (0)

/*
 * Run test helper 'nr' (0 through 5) together with its shadow counterpart.
 *
 * Each case is spelled out because run_helper() pastes its first argument
 * into a function name, which needs a literal number.  Any other 'nr' is
 * reported through test_failed().
 */
static void run_helpers(int nr, uint8_t *buf, uint8_t *buf_shadow, uint8_t *ptr)
{
	/* state mask used by run_helper(); 0x18 is the value of MPX_XSTATES */
	uint64_t flags = 0x18;

	dprint_context(xsave_test_buf);
	switch (nr) {
	case 0:
		run_helper(0, buf, buf_shadow, ptr);
		break;
	case 1:
		run_helper(1, buf, buf_shadow, ptr);
		break;
	case 2:
		run_helper(2, buf, buf_shadow, ptr);
		break;
	case 3:
		run_helper(3, buf, buf_shadow, ptr);
		break;
	case 4:
		run_helper(4, buf, buf_shadow, ptr);
		break;
	case 5:
		run_helper(5, buf, buf_shadow, ptr);
		break;
	default:
		test_failed();
		break;
	}
	dprint_context(xsave_test_buf);
}
/*
 * Round 'alignme' down to a multiple of 'align_to' by clearing its low
 * bits.  The mask arithmetic only yields a true multiple when 'align_to'
 * is a power of two.
 */
unsigned long align_down(unsigned long alignme, unsigned long align_to)
{
	unsigned long low_bits = align_to - 1;

	return alignme & ~low_bits;
}

/*
 * Round 'alignme' up to a multiple of 'align_to'; a value that is already
 * aligned is returned unchanged.  As with align_down(), 'align_to' has to
 * be a power of two for the result to be a true multiple.
 */
unsigned long align_up(unsigned long alignme, unsigned long align_to)
{
	unsigned long low_bits = align_to - 1;

	return (alignme + low_bits) & ~low_bits;
}
+ */ +#ifdef __i386__ +long alignment = 4096; +long sz_alignment = 4096; +#else +long alignment = 1 * MB; +long sz_alignment = 1 * MB; +#endif +void *mpx_mini_alloc(unsigned long sz) +{ + unsigned long long tries = 0; + static void *last; + void *ptr; + void *try_at; + + sz = align_up(sz, sz_alignment); + + try_at = last + alignment; + while (1) { + ptr = mmap(try_at, sz, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (ptr == (void *)-1) + return NULL; + if (ptr == try_at) + break; + + munmap(ptr, sz); + try_at += alignment; +#ifdef __i386__ + /* + * This isn't quite correct for 32-bit binaries + * on 64-bit kernels since they can use the + * entire 32-bit address space, but it's close + * enough. + */ + if (try_at > (void *)0xC0000000) +#else + if (try_at > (void *)0x0000800000000000) +#endif + try_at = (void *)0x0; + if (!(++tries % 10000)) + dprintf1("stuck in %s(), tries: %lld\n", __func__, tries); + continue; + } + last = ptr; + dprintf3("mpx_mini_alloc(0x%lx) returning: %p\n", sz, ptr); + return ptr; +} +void mpx_mini_free(void *ptr, long sz) +{ + dprintf2("%s() ptr: %p\n", __func__, ptr); + if ((unsigned long)ptr > 0x100000000000) { + dprintf1("uh oh !!!!!!!!!!!!!!! 
pointer too high: %p\n", ptr); + test_failed(); + } + sz = align_up(sz, sz_alignment); + dprintf3("%s() ptr: %p before munmap\n", __func__, ptr); + munmap(ptr, sz); + dprintf3("%s() ptr: %p DONE\n", __func__, ptr); +} + +#define NR_MALLOCS 100 +struct one_malloc { + char *ptr; + int nr_filled_btes; + unsigned long size; +}; +struct one_malloc mallocs[NR_MALLOCS]; + +void free_one_malloc(int index) +{ + unsigned long free_ptr; + unsigned long mask; + + if (!mallocs[index].ptr) + return; + + mpx_mini_free(mallocs[index].ptr, mallocs[index].size); + dprintf4("freed[%d]: %p\n", index, mallocs[index].ptr); + + free_ptr = (unsigned long)mallocs[index].ptr; + mask = alignment-1; + dprintf4("lowerbits: %lx / %lx mask: %lx\n", free_ptr, + (free_ptr & mask), mask); + assert((free_ptr & mask) == 0); + + mallocs[index].ptr = NULL; +} + +#ifdef __i386__ +#define MPX_BOUNDS_TABLE_COVERS 4096 +#else +#define MPX_BOUNDS_TABLE_COVERS (1 * MB) +#endif +void zap_everything(void) +{ + long after_zap; + long before_zap; + int i; + + before_zap = inspect_me(bounds_dir_ptr); + dprintf1("zapping everything start: %ld\n", before_zap); + for (i = 0; i < NR_MALLOCS; i++) + free_one_malloc(i); + + after_zap = inspect_me(bounds_dir_ptr); + dprintf1("zapping everything done: %ld\n", after_zap); + /* + * We only guarantee to empty the thing out if our allocations are + * exactly aligned on the boundaries of a boudns table. 
+ */ + if ((alignment >= MPX_BOUNDS_TABLE_COVERS) && + (sz_alignment >= MPX_BOUNDS_TABLE_COVERS)) { + if (after_zap != 0) + test_failed(); + + assert(after_zap == 0); + } +} + +void do_one_malloc(void) +{ + static int malloc_counter; + long sz; + int rand_index = (mpx_random() % NR_MALLOCS); + void *ptr = mallocs[rand_index].ptr; + + dprintf3("%s() enter\n", __func__); + + if (ptr) { + dprintf3("freeing one malloc at index: %d\n", rand_index); + free_one_malloc(rand_index); + if (mpx_random() % (NR_MALLOCS*3) == 3) { + int i; + dprintf3("zapping some more\n"); + for (i = rand_index; i < NR_MALLOCS; i++) + free_one_malloc(i); + } + if ((mpx_random() % zap_all_every_this_many_mallocs) == 4) + zap_everything(); + } + + /* 1->~1M */ + sz = (1 + mpx_random() % 1000) * 1000; + ptr = mpx_mini_alloc(sz); + if (!ptr) { + /* + * If we are failing allocations, just assume we + * are out of memory and zap everything. + */ + dprintf3("zapping everything because out of memory\n"); + zap_everything(); + goto out; + } + + dprintf3("malloc: %p size: 0x%lx\n", ptr, sz); + mallocs[rand_index].nr_filled_btes = cover_buf_with_bt_entries(ptr, sz); + mallocs[rand_index].ptr = ptr; + mallocs[rand_index].size = sz; +out: + if ((++malloc_counter) % inspect_every_this_many_mallocs == 0) + inspect_me(bounds_dir_ptr); +} + +void run_timed_test(void (*test_func)(void)) +{ + int done = 0; + long iteration = 0; + static time_t last_print; + time_t now; + time_t start; + + time(&start); + while (!done) { + time(&now); + if ((now - start) > TEST_DURATION_SECS) + done = 1; + + test_func(); + iteration++; + + if ((now - last_print > 1) || done) { + printf("iteration %ld complete, OK so far\n", iteration); + last_print = now; + } + } +} + +void check_bounds_table_frees(void) +{ + printf("executing unmaptest\n"); + inspect_me(bounds_dir_ptr); + run_timed_test(&do_one_malloc); + printf("done with malloc() fun\n"); +} + +void insn_test_failed(int test_nr, int test_round, void *buf, + void *buf_shadow, 
void *ptr) +{ + print_context(xsave_test_buf); + eprintf("ERROR: test %d round %d failed\n", test_nr, test_round); + while (test_nr == 5) { + struct mpx_bt_entry *bte; + struct mpx_bounds_dir *bd = (void *)bounds_dir_ptr; + struct mpx_bd_entry *bde = mpx_vaddr_to_bd_entry(buf, bd); + + printf(" bd: %p\n", bd); + printf("&bde: %p\n", bde); + printf("*bde: %lx\n", *(unsigned long *)bde); + if (!bd_entry_valid(bde)) + break; + + bte = mpx_vaddr_to_bt_entry(buf, bd); + printf(" te: %p\n", bte); + printf("bte[0]: %lx\n", bte->contents[0]); + printf("bte[1]: %lx\n", bte->contents[1]); + printf("bte[2]: %lx\n", bte->contents[2]); + printf("bte[3]: %lx\n", bte->contents[3]); + break; + } + test_failed(); +} + +void check_mpx_insns_and_tables(void) +{ + int successes = 0; + int failures = 0; + int buf_size = (1024*1024); + unsigned long *buf = malloc(buf_size); + const int total_nr_tests = NR_MPX_TEST_FUNCTIONS * TEST_ROUNDS; + int i, j; + + memset(buf, 0, buf_size); + memset(buf_shadow, 0, sizeof(buf_shadow)); + + for (i = 0; i < TEST_ROUNDS; i++) { + uint8_t *ptr = get_random_addr() + 8; + + for (j = 0; j < NR_MPX_TEST_FUNCTIONS; j++) { + if (0 && j != 5) { + successes++; + continue; + } + dprintf2("starting test %d round %d\n", j, i); + dprint_context(xsave_test_buf); + /* + * test5 loads an address from the bounds tables. + * The load will only complete if 'ptr' matches + * the load and the store, so with random addrs, + * the odds of this are very small. Make it + * higher by only moving 'ptr' 1/10 times. 
+ */ + if (random() % 10 <= 0) + ptr = get_random_addr() + 8; + dprintf3("random ptr{%p}\n", ptr); + dprint_context(xsave_test_buf); + run_helpers(j, (void *)buf, (void *)buf_shadow, ptr); + dprint_context(xsave_test_buf); + if (!compare_context(xsave_test_buf)) { + insn_test_failed(j, i, buf, buf_shadow, ptr); + failures++; + goto exit; + } + successes++; + dprint_context(xsave_test_buf); + dprintf2("finished test %d round %d\n", j, i); + dprintf3("\n"); + dprint_context(xsave_test_buf); + } + } + +exit: + dprintf2("\nabout to free:\n"); + free(buf); + dprintf1("successes: %d\n", successes); + dprintf1(" failures: %d\n", failures); + dprintf1(" tests: %d\n", total_nr_tests); + dprintf1(" expected: %jd #BRs\n", num_upper_brs + num_lower_brs); + dprintf1(" saw: %d #BRs\n", br_count); + if (failures) { + eprintf("ERROR: non-zero number of failures\n"); + exit(20); + } + if (successes != total_nr_tests) { + eprintf("ERROR: succeded fewer than number of tries (%d != %d)\n", + successes, total_nr_tests); + exit(21); + } + if (num_upper_brs + num_lower_brs != br_count) { + eprintf("ERROR: unexpected number of #BRs: %jd %jd %d\n", + num_upper_brs, num_lower_brs, br_count); + eprintf("successes: %d\n", successes); + eprintf(" failures: %d\n", failures); + eprintf(" tests: %d\n", total_nr_tests); + eprintf(" expected: %jd #BRs\n", num_upper_brs + num_lower_brs); + eprintf(" saw: %d #BRs\n", br_count); + exit(22); + } +} + +/* + * This is supposed to SIGSEGV nicely once the kernel + * can no longer allocate vaddr space. 
+ */ +void exhaust_vaddr_space(void) +{ + unsigned long ptr; + /* Try to make sure there is no room for a bounds table anywhere */ + unsigned long skip = MPX_BOUNDS_TABLE_SIZE_BYTES - PAGE_SIZE; +#ifdef __i386__ + unsigned long max_vaddr = 0xf7788000UL; +#else + unsigned long max_vaddr = 0x800000000000UL; +#endif + + dprintf1("%s() start\n", __func__); + /* do not start at 0, we aren't allowed to map there */ + for (ptr = PAGE_SIZE; ptr < max_vaddr; ptr += skip) { + void *ptr_ret; + int ret = madvise((void *)ptr, PAGE_SIZE, MADV_NORMAL); + + if (!ret) { + dprintf1("madvise() %lx ret: %d\n", ptr, ret); + continue; + } + ptr_ret = mmap((void *)ptr, PAGE_SIZE, PROT_READ|PROT_WRITE, + MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); + if (ptr_ret != (void *)ptr) { + perror("mmap"); + dprintf1("mmap(%lx) ret: %p\n", ptr, ptr_ret); + break; + } + if (!(ptr & 0xffffff)) + dprintf1("mmap(%lx) ret: %p\n", ptr, ptr_ret); + } + for (ptr = PAGE_SIZE; ptr < max_vaddr; ptr += skip) { + dprintf2("covering 0x%lx with bounds table entries\n", ptr); + cover_buf_with_bt_entries((void *)ptr, PAGE_SIZE); + } + dprintf1("%s() end\n", __func__); + printf("done with vaddr space fun\n"); +} + +void mpx_table_test(void) +{ + printf("starting mpx bounds table test\n"); + run_timed_test(check_mpx_insns_and_tables); + printf("done with mpx bounds table test\n"); +} + +int main(int argc, char **argv) +{ + int unmaptest = 0; + int vaddrexhaust = 0; + int tabletest = 0; + int i; + + check_mpx_support(); + mpx_prepare(); + srandom(11179); + + bd_incore(); + init(); + bd_incore(); + + trace_me(); + + xsave_state((void *)xsave_test_buf, 0x1f); + if (!compare_context(xsave_test_buf)) + printf("Init failed\n"); + + for (i = 1; i < argc; i++) { + if (!strcmp(argv[i], "unmaptest")) + unmaptest = 1; + if (!strcmp(argv[i], "vaddrexhaust")) + vaddrexhaust = 1; + if (!strcmp(argv[i], "tabletest")) + tabletest = 1; + } + if (!(unmaptest || vaddrexhaust || tabletest)) { + unmaptest = 1; + /* vaddrexhaust = 1; */ + 
tabletest = 1; + } + if (unmaptest) + check_bounds_table_frees(); + if (tabletest) + mpx_table_test(); + if (vaddrexhaust) + exhaust_vaddr_space(); + printf("%s completed successfully\n", argv[0]); + exit(0); +} + +#include "mpx-dig.c" diff --git a/tools/testing/selftests/x86/mpx-mm.h b/tools/testing/selftests/x86/mpx-mm.h new file mode 100644 index 0000000..af706a5 --- /dev/null +++ b/tools/testing/selftests/x86/mpx-mm.h @@ -0,0 +1,9 @@ +#ifndef _MPX_MM_H +#define _MPX_MM_H + +#define PAGE_SIZE 4096 +#define MB (1UL<<20) + +extern long nr_incore(void *ptr, unsigned long size_bytes); + +#endif /* _MPX_MM_H */ -- cgit v0.10.2 From 39380b80d72723282f0ea1d1bbf2294eae45013e Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Fri, 8 Jul 2016 11:38:28 +0200 Subject: x86/mm/pat, /dev/mem: Remove superfluous error message Currently it's possible for broken (or malicious) userspace to flood a kernel log indefinitely with messages a-la Program dmidecode tried to access /dev/mem between f0000->100000 because range_is_allowed() in case of CONFIG_STRICT_DEVMEM being turned on dumps this information each and every time devmem_is_allowed() fails. Reportedly userspace that is able to trigger a continuous flow of these messages exists. It would be possible to rate limit this message, but that'd have a questionable value; the administrator wouldn't get information about all the failing accesses, so then the information would be both superfluous and incomplete at the same time :) Returning EPERM (which is what is actually happening) is enough indication for userspace what has happened; no need to log this particular error as some sort of special condition. Signed-off-by: Jiri Kosina Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Kees Cook Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/alpine.LNX.2.00.1607081137020.24757@cbobk.fhfr.pm Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index fb0604f..db00e3e 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -755,11 +755,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) return 1; while (cursor < to) { - if (!devmem_is_allowed(pfn)) { - pr_info("x86/PAT: Program %s tried to access /dev/mem between [mem %#010Lx-%#010Lx], PAT prevents it\n", - current->comm, from, to - 1); + if (!devmem_is_allowed(pfn)) return 0; - } cursor += PAGE_SIZE; pfn++; } diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 71025c2..d633974 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -66,12 +66,8 @@ static inline int range_is_allowed(unsigned long pfn, unsigned long size) u64 cursor = from; while (cursor < to) { - if (!devmem_is_allowed(pfn)) { - printk(KERN_INFO - "Program %s tried to access /dev/mem between %Lx->%Lx.\n", - current->comm, from, to); + if (!devmem_is_allowed(pfn)) return 0; - } cursor += PAGE_SIZE; pfn++; } -- cgit v0.10.2 From b059a453b1cf1c8453c2b2ed373d3147d6264ebd Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 28 Jun 2016 14:35:38 +0300 Subject: x86/vdso: Add mremap hook to vm_special_mapping Add possibility for 32-bit user-space applications to move the vDSO mapping. Previously, when a user-space app called mremap() for the vDSO address, in the syscall return path it would land on the previous address of the vDSOpage, resulting in segmentation violation. Now it lands fine and returns to userspace with a remapped vDSO. This will also fix the context.vdso pointer for 64-bit, which does not affect the user of vDSO after mremap() currently, but this may change in the future. 
As suggested by Andy, return -EINVAL for mremap() that would split the vDSO image: that operation cannot possibly result in a working system so reject it. Renamed and moved the text_mapping structure declaration inside map_vdso(), as it used only there and now it complements the vvar_mapping variable. There is still a problem for remapping the vDSO in glibc applications: the linker relocates addresses for syscalls on the vDSO page, so you need to relink with the new addresses. Without that the next syscall through glibc may fail: Program received signal SIGSEGV, Segmentation fault. #0 0xf7fd9b80 in __kernel_vsyscall () #1 0xf7ec8238 in _exit () from /usr/lib32/libc.so.6 Signed-off-by: Dmitry Safonov Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160628113539.13606-2-dsafonov@virtuozzo.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index ab220ac..3329844 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -97,10 +98,40 @@ static int vdso_fault(const struct vm_special_mapping *sm, return 0; } -static const struct vm_special_mapping text_mapping = { - .name = "[vdso]", - .fault = vdso_fault, -}; +static void vdso_fix_landing(const struct vdso_image *image, + struct vm_area_struct *new_vma) +{ +#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION + if (in_ia32_syscall() && image == &vdso_image_32) { + struct pt_regs *regs = current_pt_regs(); + unsigned long vdso_land = image->sym_int80_landing_pad; + unsigned long old_land_addr = vdso_land + + (unsigned long)current->mm->context.vdso; + + /* Fixing userspace landing - look at do_fast_syscall_32 */ + if (regs->ip == old_land_addr) + regs->ip = 
new_vma->vm_start + vdso_land; + } +#endif +} + +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + const struct vdso_image *image = current->mm->context.vdso_image; + + if (image->size != new_size) + return -EINVAL; + + if (WARN_ON_ONCE(current->mm != new_vma->vm_mm)) + return -EFAULT; + + vdso_fix_landing(image, new_vma); + current->mm->context.vdso = (void __user *)new_vma->vm_start; + + return 0; +} static int vvar_fault(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf) @@ -151,6 +182,12 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) struct vm_area_struct *vma; unsigned long addr, text_start; int ret = 0; + + static const struct vm_special_mapping vdso_mapping = { + .name = "[vdso]", + .fault = vdso_fault, + .mremap = vdso_mremap, + }; static const struct vm_special_mapping vvar_mapping = { .name = "[vvar]", .fault = vvar_fault, @@ -185,7 +222,7 @@ static int map_vdso(const struct vdso_image *image, bool calculate_addr) image->size, VM_READ|VM_EXEC| VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, - &text_mapping); + &vdso_mapping); if (IS_ERR(vma)) { ret = PTR_ERR(vma); diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ca3e517..917f2b6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -594,6 +594,9 @@ struct vm_special_mapping { int (*fault)(const struct vm_special_mapping *sm, struct vm_area_struct *vma, struct vm_fault *vmf); + + int (*mremap)(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma); }; enum tlb_flush_reason { diff --git a/mm/mmap.c b/mm/mmap.c index de2c176..234edff 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2943,9 +2943,19 @@ static const char *special_mapping_name(struct vm_area_struct *vma) return ((struct vm_special_mapping *)vma->vm_private_data)->name; } +static int special_mapping_mremap(struct vm_area_struct 
*new_vma) +{ + struct vm_special_mapping *sm = new_vma->vm_private_data; + + if (sm->mremap) + return sm->mremap(sm, new_vma); + return 0; +} + static const struct vm_operations_struct special_mapping_vmops = { .close = special_mapping_close, .fault = special_mapping_fault, + .mremap = special_mapping_mremap, .name = special_mapping_name, }; -- cgit v0.10.2 From f80fd3a5fff88a9ace7e8cd11d07cf874a63ea9f Mon Sep 17 00:00:00 2001 From: Dmitry Safonov Date: Tue, 28 Jun 2016 14:35:39 +0300 Subject: selftests/x86: Add vDSO mremap() test Should print this on vDSO remapping success (on new kernels): [root@localhost ~]# ./test_mremap_vdso_32 AT_SYSINFO_EHDR is 0xf773f000 [NOTE] Moving vDSO: [f773f000, f7740000] -> [a000000, a001000] [OK] Or print that mremap() for vDSOs is unsupported: [root@localhost ~]# ./test_mremap_vdso_32 AT_SYSINFO_EHDR is 0xf773c000 [NOTE] Moving vDSO: [0xf773c000, 0xf773d000] -> [0xf7737000, 0xf7738000] [FAIL] mremap() of the vDSO does not work on this kernel! Suggested-by: Andy Lutomirski Signed-off-by: Dmitry Safonov Acked-by: Andy Lutomirski Cc: 0x7f454c46@gmail.com Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Shuah Khan Cc: Thomas Gleixner Cc: linux-kselftest@vger.kernel.org Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/20160628113539.13606-3-dsafonov@virtuozzo.com Signed-off-by: Ingo Molnar diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile index c73425d..543a6d0 100644 --- a/tools/testing/selftests/x86/Makefile +++ b/tools/testing/selftests/x86/Makefile @@ -4,7 +4,7 @@ include ../lib.mk .PHONY: all all_32 all_64 warn_32bit_failure clean -TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall \ +TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall test_mremap_vdso \ check_initial_reg_state sigreturn ldt_gdt iopl TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault test_syscall_vdso unwind_vdso \ test_FCMOV test_FCOMI test_FISTTP \ diff --git a/tools/testing/selftests/x86/test_mremap_vdso.c b/tools/testing/selftests/x86/test_mremap_vdso.c new file mode 100644 index 0000000..bf0d687 --- /dev/null +++ b/tools/testing/selftests/x86/test_mremap_vdso.c @@ -0,0 +1,111 @@ +/* + * 32-bit test to check vDSO mremap. + * + * Copyright (c) 2016 Dmitry Safonov + * Suggested-by: Andrew Lutomirski + * + * This program is free software; you can redistribute it and/or modify + * it under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ */ +/* + * Can be built statically: + * gcc -Os -Wall -static -m32 test_mremap_vdso.c + */ +#define _GNU_SOURCE +#include +#include +#include +#include + +#include +#include +#include +#include + +#define PAGE_SIZE 4096 + +static int try_to_remap(void *vdso_addr, unsigned long size) +{ + void *dest_addr, *new_addr; + + /* Searching for memory location where to remap */ + dest_addr = mmap(0, size, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0); + if (dest_addr == MAP_FAILED) { + printf("[WARN]\tmmap failed (%d): %m\n", errno); + return 0; + } + + printf("[NOTE]\tMoving vDSO: [%p, %#lx] -> [%p, %#lx]\n", + vdso_addr, (unsigned long)vdso_addr + size, + dest_addr, (unsigned long)dest_addr + size); + fflush(stdout); + + new_addr = mremap(vdso_addr, size, size, + MREMAP_FIXED|MREMAP_MAYMOVE, dest_addr); + if ((unsigned long)new_addr == (unsigned long)-1) { + munmap(dest_addr, size); + if (errno == EINVAL) { + printf("[NOTE]\tvDSO partial move failed, will try with bigger size\n"); + return -1; /* Retry with larger */ + } + printf("[FAIL]\tmremap failed (%d): %m\n", errno); + return 1; + } + + return 0; + +} + +int main(int argc, char **argv, char **envp) +{ + pid_t child; + + child = fork(); + if (child == -1) { + printf("[WARN]\tfailed to fork (%d): %m\n", errno); + return 1; + } + + if (child == 0) { + unsigned long vdso_size = PAGE_SIZE; + unsigned long auxval; + int ret = -1; + + auxval = getauxval(AT_SYSINFO_EHDR); + printf("\tAT_SYSINFO_EHDR is %#lx\n", auxval); + if (!auxval || auxval == -ENOENT) { + printf("[WARN]\tgetauxval failed\n"); + return 0; + } + + /* Simpler than parsing ELF header */ + while (ret < 0) { + ret = try_to_remap((void *)auxval, vdso_size); + vdso_size += PAGE_SIZE; + } + + /* Glibc is likely to explode now - exit with raw syscall */ + asm volatile ("int $0x80" : : "a" (__NR_exit), "b" (!!ret)); + } else { + int status; + + if (waitpid(child, &status, 0) != child || + !WIFEXITED(status)) { + printf("[FAIL]\tmremap() of the vDSO does not work 
on this kernel!\n"); + return 1; + } else if (WEXITSTATUS(status) != 0) { + printf("[FAIL]\tChild failed with %d\n", + WEXITSTATUS(status)); + return 1; + } + printf("[OK]\n"); + } + + return 0; +} -- cgit v0.10.2 From 9a7e7b571826c4399aa639af4a670642d96d935c Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 8 Jul 2016 16:01:48 +0200 Subject: x86/asm/entry: Make thunk's restore a local label No need to have it appear in objdump output. No functionality change. Signed-off-by: Borislav Petkov Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/20160708141016.GH3808@pd.tnic Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/thunk_64.S b/arch/x86/entry/thunk_64.S index 027aec4..627ecbc 100644 --- a/arch/x86/entry/thunk_64.S +++ b/arch/x86/entry/thunk_64.S @@ -33,7 +33,7 @@ .endif call \func - jmp restore + jmp .L_restore _ASM_NOKPROBE(\name) .endm @@ -54,7 +54,7 @@ #if defined(CONFIG_TRACE_IRQFLAGS) \ || defined(CONFIG_DEBUG_LOCK_ALLOC) \ || defined(CONFIG_PREEMPT) -restore: +.L_restore: popq %r11 popq %r10 popq %r9 @@ -66,5 +66,5 @@ restore: popq %rdi popq %rbp ret - _ASM_NOKPROBE(restore) + _ASM_NOKPROBE(.L_restore) #endif -- cgit v0.10.2 From 2e9d1e150abf88cb63e5d34ca286edbb95b4c53d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 20 Jun 2016 16:58:29 +0200 Subject: x86/entry: Avoid interrupt flag save and restore Thanks to all the work that was done by Andy Lutomirski and others, enter_from_user_mode() and prepare_exit_to_usermode() are now called only with interrupts disabled. Let's provide them a version of user_enter()/user_exit() that skips saving and restoring the interrupt flag. 
On an AMD-based machine I tested this patch on, with force-enabled context tracking, the speed-up in system calls was 90 clock cycles or 6%, measured with the following simple benchmark: #include #include #include #include unsigned long rdtsc() { unsigned long result; asm volatile("rdtsc; shl $32, %%rdx; mov %%eax, %%eax\n" "or %%rdx, %%rax" : "=a" (result) : : "rdx"); return result; } int main() { unsigned long tsc1, tsc2; int pid = getpid(); int i; tsc1 = rdtsc(); for (i = 0; i < 100000000; i++) kill(pid, SIGWINCH); tsc2 = rdtsc(); printf("%ld\n", tsc2 - tsc1); } Signed-off-by: Paolo Bonzini Reviewed-by: Rik van Riel Reviewed-by: Andy Lutomirski Acked-by: Paolo Bonzini Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/1466434712-31440-2-git-send-email-pbonzini@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index ec138e5..618bc61 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -43,7 +43,7 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) __visible void enter_from_user_mode(void) { CT_WARN_ON(ct_state() != CONTEXT_USER); - user_exit(); + user_exit_irqoff(); } #else static inline void enter_from_user_mode(void) {} @@ -274,7 +274,7 @@ __visible inline void prepare_exit_to_usermode(struct pt_regs *regs) ti->status &= ~TS_COMPAT; #endif - user_enter(); + user_enter_irqoff(); } #define SYSCALL_EXIT_WORK_FLAGS \ diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index d259274..d9aef2a 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -31,6 +31,19 @@ static inline void user_exit(void) context_tracking_exit(CONTEXT_USER); } +/* Called with interrupts disabled. 
*/ +static inline void user_enter_irqoff(void) +{ + if (context_tracking_is_enabled()) + __context_tracking_enter(CONTEXT_USER); + +} +static inline void user_exit_irqoff(void) +{ + if (context_tracking_is_enabled()) + __context_tracking_exit(CONTEXT_USER); +} + static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; @@ -69,6 +82,8 @@ static inline enum ctx_state ct_state(void) #else static inline void user_enter(void) { } static inline void user_exit(void) { } +static inline void user_enter_irqoff(void) { } +static inline void user_exit_irqoff(void) { } static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; } -- cgit v0.10.2 From be8a18e2e98e04a5def5887d913b267865562448 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Mon, 20 Jun 2016 16:58:30 +0200 Subject: x86/entry: Inline enter_from_user_mode() This matches what is already done for prepare_exit_to_usermode(), and saves about 60 clock cycles (4% speedup) with the benchmark in the previous commit message. Signed-off-by: Paolo Bonzini Reviewed-by: Rik van Riel Reviewed-by: Andy Lutomirski Acked-by: Paolo Bonzini Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: kvm@vger.kernel.org Link: http://lkml.kernel.org/r/1466434712-31440-3-git-send-email-pbonzini@redhat.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c index 618bc61..9e1e27d 100644 --- a/arch/x86/entry/common.c +++ b/arch/x86/entry/common.c @@ -40,7 +40,7 @@ static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs) #ifdef CONFIG_CONTEXT_TRACKING /* Called on entry from user mode with IRQs off. 
*/ -__visible void enter_from_user_mode(void) +__visible inline void enter_from_user_mode(void) { CT_WARN_ON(ct_state() != CONTEXT_USER); user_exit_irqoff(); -- cgit v0.10.2 From 00839ee3b299303c6a5e26a0a2485427a3afcbbf Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 7 Jul 2016 17:19:11 -0700 Subject: x86/mm: Move swap offset/type up in PTE to work around erratum This erratum can result in Accessed/Dirty getting set by the hardware when we do not expect them to be (on !Present PTEs). Instead of trying to fix them up after this happens, we just allow the bits to get set and try to ignore them. We do this by shifting the layout of the bits we use for swap offset/type in our 64-bit PTEs. It looks like this: bitnrs: | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| names: | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| before: | OFFSET (9-63) |0|X|X| TYPE(1-5) |0| after: | OFFSET (14-63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| Note that D was already a don't care (X) even before. We just move TYPE up and turn its old spot (which could be hit by the A bit) into all don't cares. We take 5 bits away from the offset, but that still leaves us with 50 bits which lets us index into a 62-bit swapfile (4 EiB). I think that's probably fine for the moment. We could theoretically reclaim 5 of the bits (1, 2, 3, 4, 7) but it doesn't gain us anything. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: dave.hansen@intel.com Cc: linux-mm@kvack.org Cc: mhocko@suse.com Link: http://lkml.kernel.org/r/20160708001911.9A3FD2B6@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 2ee7811..7e8ec7a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -140,18 +140,32 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) #define pte_unmap(pte) ((void)(pte))/* NOP */ -/* Encode and de-code a swap entry */ +/* + * Encode and de-code a swap entry + * + * | ... | 11| 10| 9|8|7|6|5| 4| 3|2|1|0| <- bit number + * | ... |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names + * | OFFSET (14->63) | TYPE (9-13) |0|X|X|X| X| X|X|X|0| <- swp entry + * + * G (8) is aliased and used as a PROT_NONE indicator for + * !present ptes. We need to start storing swap entries above + * there. We also need to avoid using A and D because of an + * erratum where they can be incorrectly set by hardware on + * non-present PTEs. 
+ */ +#define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1) #define SWP_TYPE_BITS 5 -#define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1) +/* Place the offset above the type: */ +#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1) #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS) -#define __swp_type(x) (((x).val >> (_PAGE_BIT_PRESENT + 1)) \ +#define __swp_type(x) (((x).val >> (SWP_TYPE_FIRST_BIT)) \ & ((1U << SWP_TYPE_BITS) - 1)) -#define __swp_offset(x) ((x).val >> SWP_OFFSET_SHIFT) +#define __swp_offset(x) ((x).val >> SWP_OFFSET_FIRST_BIT) #define __swp_entry(type, offset) ((swp_entry_t) { \ - ((type) << (_PAGE_BIT_PRESENT + 1)) \ - | ((offset) << SWP_OFFSET_SHIFT) }) + ((type) << (SWP_TYPE_FIRST_BIT)) \ + | ((offset) << SWP_OFFSET_FIRST_BIT) }) #define __pte_to_swp_entry(pte) ((swp_entry_t) { pte_val((pte)) }) #define __swp_entry_to_pte(x) ((pte_t) { .pte = (x).val }) -- cgit v0.10.2 From 97e3c602ccbdd7db54e92fe05675c664c052a466 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 7 Jul 2016 17:19:12 -0700 Subject: x86/mm: Ignore A/D bits in pte/pmd/pud_none() The erratum we are fixing here can lead to stray setting of the A and D bits. That means that a pte that we cleared might suddenly have A/D set. So, stop considering those bits when determining if a pte is pte_none(). The same goes for the other pmd_none() and pud_none(). pgd_none() can be skipped because it is not affected; we do not use PGD entries for anything other than pagetables on affected configurations. This adds a tiny amount of overhead to all pte_none() checks. I doubt we'll be able to measure it anywhere. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. 
Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: dave.hansen@intel.com Cc: linux-mm@kvack.org Cc: mhocko@suse.com Link: http://lkml.kernel.org/r/20160708001912.5216F89C@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 1a27396..2815d26 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -480,7 +480,7 @@ pte_t *populate_extra_pte(unsigned long vaddr); static inline int pte_none(pte_t pte) { - return !pte.pte; + return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK)); } #define __HAVE_ARCH_PTE_SAME @@ -552,7 +552,8 @@ static inline int pmd_none(pmd_t pmd) { /* Only check low word on 32-bit platforms, since it might be out of sync with upper half. */ - return (unsigned long)native_pmd_val(pmd) == 0; + unsigned long val = native_pmd_val(pmd); + return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0; } static inline unsigned long pmd_page_vaddr(pmd_t pmd) @@ -616,7 +617,7 @@ static inline unsigned long pages_to_mb(unsigned long npg) #if CONFIG_PGTABLE_LEVELS > 2 static inline int pud_none(pud_t pud) { - return native_pud_val(pud) == 0; + return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0; } static inline int pud_present(pud_t pud) @@ -694,6 +695,12 @@ static inline int pgd_bad(pgd_t pgd) static inline int pgd_none(pgd_t pgd) { + /* + * There is no need to do a workaround for the KNL stray + * A/D bit erratum here. PGDs only point to page tables + * except on 32-bit non-PAE which is not supported on + * KNL. 
+ */ return !native_pgd_val(pgd); } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 7b5efe2..d14d0a5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -70,6 +70,12 @@ _PAGE_PKEY_BIT2 | \ _PAGE_PKEY_BIT3) +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +#define _PAGE_KNL_ERRATUM_MASK (_PAGE_DIRTY | _PAGE_ACCESSED) +#else +#define _PAGE_KNL_ERRATUM_MASK 0 +#endif + #ifdef CONFIG_KMEMCHECK #define _PAGE_HIDDEN (_AT(pteval_t, 1) << _PAGE_BIT_HIDDEN) #else -- cgit v0.10.2 From e4a84be6f05eab4778732d799f63b3cd15427885 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 7 Jul 2016 17:19:14 -0700 Subject: x86/mm: Disallow running with 32-bit PTEs to work around erratum The Intel(R) Xeon Phi(TM) Processor x200 Family (codename: Knights Landing) has an erratum where a processor thread setting the Accessed or Dirty bits may not do so atomically against its checks for the Present bit. This may cause a thread (which is about to page fault) to set A and/or D, even though the Present bit had already been atomically cleared. These bits are truly "stray". In the case of the Dirty bit, the thread associated with the stray set was *not* allowed to write to the page. This means that we do not have to launder the bit(s); we can simply ignore them. If the PTE is used for storing a swap index or a NUMA migration index, the A bit could be misinterpreted as part of the swap type. The stray bits being set cause a software-cleared PTE to be interpreted as a swap entry. In some cases (like when the swap index ends up being for a non-existent swapfile), the kernel detects the stray value and WARN()s about it, but there is no guarantee that the kernel can always detect it. When we have 64-bit PTEs (64-bit mode or 32-bit PAE), we were able to move the swap PTE format around to avoid these troublesome bits. But, 32-bit non-PAE is tight on bits. 
So, disallow it from running on this hardware. I can't imagine anyone wanting to run 32-bit non-highmem kernels on this hardware, but disallowing them from running entirely is surely the safe thing to do. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: dave.hansen@intel.com Cc: linux-mm@kvack.org Cc: mhocko@suse.com Link: http://lkml.kernel.org/r/20160708001914.D0B50110@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 9011a88..a5ce666 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -294,6 +294,7 @@ static inline int cmdline_find_option_bool(const char *option) /* cpu.c, cpucheck.c */ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr); +int check_knl_erratum(void); int validate_cpu(void); /* early_serial_console.c */ diff --git a/arch/x86/boot/cpu.c b/arch/x86/boot/cpu.c index 29207f6..26240dd 100644 --- a/arch/x86/boot/cpu.c +++ b/arch/x86/boot/cpu.c @@ -93,6 +93,8 @@ int validate_cpu(void) show_cap_strs(err_flags); putchar('\n'); return -1; + } else if (check_knl_erratum()) { + return -1; } else { return 0; } diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 1fd7d57..4ad7d70 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -24,6 +24,7 @@ # include "boot.h" #endif #include +#include #include #include #include @@ -175,6 +176,8 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) puts("WARNING: PAE disabled. Use parameter 'forcepae' to enable at your own risk!\n"); } } + if (!err) + err = check_knl_erratum(); if (err_flags_ptr) *err_flags_ptr = err ? 
err_flags : NULL; @@ -185,3 +188,33 @@ int check_cpu(int *cpu_level_ptr, int *req_level_ptr, u32 **err_flags_ptr) return (cpu.level < req_level || err) ? -1 : 0; } + +int check_knl_erratum(void) +{ + /* + * First check for the affected model/family: + */ + if (!is_intel() || + cpu.family != 6 || + cpu.model != INTEL_FAM6_XEON_PHI_KNL) + return 0; + + /* + * This erratum affects the Accessed/Dirty bits, and can + * cause stray bits to be set in !Present PTEs. We have + * enough bits in our 64-bit PTEs (which we have on real + * 64-bit mode or PAE) to avoid using these troublesome + * bits. But, we do not have enough space in our 32-bit + * PTEs. So, refuse to run on 32-bit non-PAE kernels. + */ + if (IS_ENABLED(CONFIG_X86_64) || IS_ENABLED(CONFIG_X86_PAE)) + return 0; + + puts("This 32-bit kernel can not run on this Xeon Phi x200\n" + "processor due to a processor erratum. Use a 64-bit\n" + "kernel, or enable PAE in this 32-bit kernel.\n\n"); + + return -1; +} + + diff --git a/arch/x86/boot/cpuflags.c b/arch/x86/boot/cpuflags.c index 431fa5f..6687ab9 100644 --- a/arch/x86/boot/cpuflags.c +++ b/arch/x86/boot/cpuflags.c @@ -102,6 +102,7 @@ void get_cpuflags(void) cpuid(0x1, &tfms, &ignored, &cpu.flags[4], &cpu.flags[0]); cpu.level = (tfms >> 8) & 15; + cpu.family = cpu.level; cpu.model = (tfms >> 4) & 15; if (cpu.level >= 6) cpu.model += ((tfms >> 16) & 0xf) << 4; diff --git a/arch/x86/boot/cpuflags.h b/arch/x86/boot/cpuflags.h index 4cb404f..15ad56a 100644 --- a/arch/x86/boot/cpuflags.h +++ b/arch/x86/boot/cpuflags.h @@ -6,6 +6,7 @@ struct cpu_features { int level; /* Family, or 64 for x86-64 */ + int family; /* Family, always */ int model; u32 flags[NCAPINTS]; }; -- cgit v0.10.2 From dcb32d9913b7ed527b135a7e221f8d14b67bb952 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 7 Jul 2016 17:19:15 -0700 Subject: x86/mm: Use pte_none() to test for empty PTE The page table manipulation code seems to have grown a couple of sites that are looking for empty PTEs. 
Just in case one of these entries got a stray bit set, use pte_none() instead of checking for a zero pte_val(). The use of pte_same() makes me a bit nervous. If we were doing a pte_same() check against two cleared entries and one of them had a stray bit set, it might fail the pte_same() check. But, I don't think we ever _do_ pte_same() for cleared entries. It is almost entirely used for checking for races in fault-in paths. Signed-off-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Dave Hansen Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Cc: dave.hansen@intel.com Cc: linux-mm@kvack.org Cc: mhocko@suse.com Link: http://lkml.kernel.org/r/20160708001915.813703D9@viggo.jf.intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bce2e5d..bb88fbc 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -354,7 +354,7 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end, * pagetable pages as RO. So assume someone who pre-setup * these mappings are more intelligent. 
*/ - if (pte_val(*pte)) { + if (!pte_none(*pte)) { if (!after_bootmem) pages++; continue; @@ -396,7 +396,7 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, continue; } - if (pmd_val(*pmd)) { + if (!pmd_none(*pmd)) { if (!pmd_large(*pmd)) { spin_lock(&init_mm.page_table_lock); pte = (pte_t *)pmd_page_vaddr(*pmd); @@ -470,7 +470,7 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, continue; } - if (pud_val(*pud)) { + if (!pud_none(*pud)) { if (!pud_large(*pud)) { pmd = pmd_offset(pud, 0); last_map_addr = phys_pmd_init(pmd, addr, end, @@ -673,7 +673,7 @@ static void __meminit free_pte_table(pte_t *pte_start, pmd_t *pmd) for (i = 0; i < PTRS_PER_PTE; i++) { pte = pte_start + i; - if (pte_val(*pte)) + if (!pte_none(*pte)) return; } @@ -691,7 +691,7 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) for (i = 0; i < PTRS_PER_PMD; i++) { pmd = pmd_start + i; - if (pmd_val(*pmd)) + if (!pmd_none(*pmd)) return; } @@ -710,7 +710,7 @@ static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) for (i = 0; i < PTRS_PER_PUD; i++) { pud = pud_start + i; - if (pud_val(*pud)) + if (!pud_none(*pud)) return false; } diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7a1f7bb..7514215 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1185,7 +1185,7 @@ repeat: return __cpa_process_fault(cpa, address, primary); old_pte = *kpte; - if (!pte_val(old_pte)) + if (pte_none(old_pte)) return __cpa_process_fault(cpa, address, primary); if (level == PG_LEVEL_4K) { diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index 75cc097..e67ae0e6 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -47,7 +47,7 @@ void set_pte_vaddr(unsigned long vaddr, pte_t pteval) return; } pte = pte_offset_kernel(pmd, vaddr); - if (pte_val(pteval)) + if (!pte_none(pteval)) set_pte_at(&init_mm, vaddr, pte, pteval); else pte_clear(&init_mm, vaddr, pte); -- cgit v0.10.2 From 
af2cf278ef4f9289f88504c3e03cb12f76027575 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 Jul 2016 13:22:49 -0700 Subject: x86/mm/hotplug: Don't remove PGD entries in remove_pagetable() So when memory hotplug removes a piece of physical memory from pagetable mappings, it also frees the underlying PGD entry. This complicates PGD management, so don't do this. We can keep the PGD mapped and the PUD table all clear - it's only a single 4K page per 512 GB of memory hotplugged. Signed-off-by: Ingo Molnar Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Waiman Long Cc: linux-mm@kvack.org Link: http://lkml.kernel.org/r/064ff6c7275734537f969e876f6cd0baa954d2cc.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bb88fbc..e14f870 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -702,27 +702,6 @@ static void __meminit free_pmd_table(pmd_t *pmd_start, pud_t *pud) spin_unlock(&init_mm.page_table_lock); } -/* Return true if pgd is changed, otherwise return false. 
*/ -static bool __meminit free_pud_table(pud_t *pud_start, pgd_t *pgd) -{ - pud_t *pud; - int i; - - for (i = 0; i < PTRS_PER_PUD; i++) { - pud = pud_start + i; - if (!pud_none(*pud)) - return false; - } - - /* free a pud table */ - free_pagetable(pgd_page(*pgd), 0); - spin_lock(&init_mm.page_table_lock); - pgd_clear(pgd); - spin_unlock(&init_mm.page_table_lock); - - return true; -} - static void __meminit remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end, bool direct) @@ -913,7 +892,6 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) unsigned long addr; pgd_t *pgd; pud_t *pud; - bool pgd_changed = false; for (addr = start; addr < end; addr = next) { next = pgd_addr_end(addr, end); @@ -924,13 +902,8 @@ remove_pagetable(unsigned long start, unsigned long end, bool direct) pud = (pud_t *)pgd_page_vaddr(*pgd); remove_pud_table(pud, addr, next, direct); - if (free_pud_table(pud, pgd)) - pgd_changed = true; } - if (pgd_changed) - sync_global_pgds(start, end - 1, 1); - flush_tlb_all(); } -- cgit v0.10.2 From 360cb4d15567a7eca07a5f3ade6de308bbfb4e70 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:50 -0700 Subject: x86/mm/cpa: In populate_pgd(), don't set the PGD entry until it's populated This avoids pointless races in which another CPU or task might see a partially populated global PGD entry. These races should normally be harmless, but, if another CPU propagates the entry via vmalloc_fault() and then populate_pgd() fails (due to memory allocation failure, for example), this prevents a use-after-free of the PGD entry. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/bf99df27eac6835f687005364bd1fbd89130946c.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7514215..26aa487 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1104,8 +1104,6 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); if (!pud) return -1; - - set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); @@ -1113,11 +1111,16 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) ret = populate_pud(cpa, addr, pgd_entry, pgprot); if (ret < 0) { - unmap_pgd_range(cpa->pgd, addr, + if (pud) + free_page((unsigned long)pud); + unmap_pud_range(pgd_entry, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } + if (pud) + set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); + cpa->numpages = ret; return 0; } -- cgit v0.10.2 From d92fc69ccac4c0a20679fdbdc81b2010685a6f33 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:51 -0700 Subject: x86/mm: Remove kernel_unmap_pages_in_pgd() and efi_cleanup_page_tables() kernel_unmap_pages_in_pgd() is dangerous: if a PGD entry in init_mm.pgd were to be cleared, callers would need to ensure that the pgd entry hadn't been propagated to any other pgd. Its only caller was efi_cleanup_page_tables(), and that, in turn, was unused, so just delete both functions. This leaves a couple of other helpers unused, so delete them, too. Signed-off-by: Andy Lutomirski Reviewed-by: Matt Fleming Acked-by: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-efi@vger.kernel.org Link: http://lkml.kernel.org/r/77ff20fdde3b75cd393be5559ad8218870520248.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index 78d1e74..45ea38d 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -125,7 +125,6 @@ extern void __init efi_map_region_fixed(efi_memory_desc_t *md); extern void efi_sync_low_kernel_mappings(void); extern int __init efi_alloc_page_tables(void); extern int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages); -extern void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages); extern void __init old_map_region(efi_memory_desc_t *md); extern void __init runtime_code_page_mkexec(void); extern void __init efi_runtime_update_mappings(void); diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index d14d0a5..f1218f5 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -481,8 +481,6 @@ extern pmd_t *lookup_pmd_address(unsigned long address); extern phys_addr_t slow_virt_to_phys(void *__address); extern int kernel_map_pages_in_pgd(pgd_t *pgd, u64 pfn, unsigned long address, unsigned numpages, unsigned long page_flags); -void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, - unsigned numpages); #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_DEFS_H */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 26aa487..26c93c6 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -746,18 +746,6 @@ static bool try_to_free_pmd_page(pmd_t *pmd) return true; } -static bool try_to_free_pud_page(pud_t *pud) -{ - int i; - - for (i = 0; i < PTRS_PER_PUD; i++) - if (!pud_none(pud[i])) - return false; - - free_page((unsigned long)pud); - return true; -} - static bool 
unmap_pte_range(pmd_t *pmd, unsigned long start, unsigned long end) { pte_t *pte = pte_offset_kernel(pmd, start); @@ -871,16 +859,6 @@ static void unmap_pud_range(pgd_t *pgd, unsigned long start, unsigned long end) */ } -static void unmap_pgd_range(pgd_t *root, unsigned long addr, unsigned long end) -{ - pgd_t *pgd_entry = root + pgd_index(addr); - - unmap_pud_range(pgd_entry, addr, end); - - if (try_to_free_pud_page((pud_t *)pgd_page_vaddr(*pgd_entry))) - pgd_clear(pgd_entry); -} - static int alloc_pte_page(pmd_t *pmd) { pte_t *pte = (pte_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); @@ -1994,12 +1972,6 @@ out: return retval; } -void kernel_unmap_pages_in_pgd(pgd_t *root, unsigned long address, - unsigned numpages) -{ - unmap_pgd_range(root, address, address + (numpages << PAGE_SHIFT)); -} - /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here. diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index f93545e..62986e5 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -978,8 +978,6 @@ static void __init __efi_enter_virtual_mode(void) * EFI mixed mode we need all of memory to be accessible when * we pass parameters to the EFI runtime services in the * thunking code. 
- * - * efi_cleanup_page_tables(__pa(new_memmap), 1 << pg_shift); */ free_pages((unsigned long)new_memmap, pg_shift); diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 338402b..cef39b0 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -49,9 +49,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { return 0; } -void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) -{ -} void __init efi_map_region(efi_memory_desc_t *md) { diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index b226b3f..d288dce 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -285,11 +285,6 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) return 0; } -void __init efi_cleanup_page_tables(unsigned long pa_memmap, unsigned num_pages) -{ - kernel_unmap_pages_in_pgd(efi_pgd, pa_memmap, num_pages); -} - static void __init __map_region(efi_memory_desc_t *md, u64 va) { unsigned long flags = _PAGE_RW; -- cgit v0.10.2 From 9a2e9da3e003112399f2863b7b6b911043c01895 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:52 -0700 Subject: x86/dumpstack: Try harder to get a call trace on stack overflow If we overflow the stack, print_context_stack() will abort. Detect this case and rewind back into the valid part of the stack so that we can trace it. Signed-off-by: Andy Lutomirski Reviewed-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/ee1690eb2715ccc5dc187fde94effa4ca0ccbbcd.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index ef8017c..cc88e25 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -87,7 +87,7 @@ static inline int valid_stack_ptr(struct task_struct *task, else return 0; } - return p > t && p < t + THREAD_SIZE - size; + return p >= t && p < t + THREAD_SIZE - size; } unsigned long @@ -98,6 +98,14 @@ print_context_stack(struct task_struct *task, { struct stack_frame *frame = (struct stack_frame *)bp; + /* + * If we overflowed the stack into a guard page, jump back to the + * bottom of the usable stack. + */ + if ((unsigned long)task_stack_page(task) - (unsigned long)stack < + PAGE_SIZE) + stack = (unsigned long *)task_stack_page(task); + while (valid_stack_ptr(task, stack, sizeof(*stack), end)) { unsigned long addr; -- cgit v0.10.2 From 98f30b1207932b6553ea605c99393d8afca12324 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:53 -0700 Subject: x86/dumpstack/64: Handle faults when printing the "Stack: " part of an OOPS If we overflow the stack into a guard page, we'll recursively fault when trying to dump the contents of the guard page. Use probe_kernel_address() so we can recover if this happens. Signed-off-by: Andy Lutomirski Reviewed-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/e626d47a55d7b04dcb1b4d33faa95e8505b217c8.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index d558a8a..2552a1e 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -272,6 +272,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, stack = sp; for (i = 0; i < kstack_depth_to_print; i++) { + unsigned long word; + if (stack >= irq_stack && stack <= irq_stack_end) { if (stack == irq_stack_end) { stack = (unsigned long *) (irq_stack_end[-1]); @@ -281,12 +283,18 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, if (kstack_end(stack)) break; } + + if (probe_kernel_address(stack, word)) + break; + if ((i % STACKSLOTS_PER_LINE) == 0) { if (i != 0) pr_cont("\n"); - printk("%s %016lx", log_lvl, *stack++); + printk("%s %016lx", log_lvl, word); } else - pr_cont(" %016lx", *stack++); + pr_cont(" %016lx", word); + + stack++; touch_nmi_watchdog(); } preempt_enable(); -- cgit v0.10.2 From 46aea3873401836abb7f01200e7946e7d518b359 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:54 -0700 Subject: x86/mm/64: In vmalloc_fault(), use CR3 instead of current->active_mm If we get a vmalloc fault while current->active_mm->pgd doesn't match CR3, we'll crash without this change. I've seen this failure mode on heavily instrumented kernels with virtually mapped stacks. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/4650d7674185f165ed8fdf9ac4c5c35c5c179ba8.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 7d1fa7c..ca44e2e 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -439,7 +439,7 @@ static noinline int vmalloc_fault(unsigned long address) * happen within a race in page table update. In the later * case just flush: */ - pgd = pgd_offset(current->active_mm, address); + pgd = (pgd_t *)__va(read_cr3()) + pgd_index(address); pgd_ref = pgd_offset_k(address); if (pgd_none(*pgd_ref)) return -1; -- cgit v0.10.2 From 2deb4be28077638591fe5fc593b7f8aabc140f42 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:55 -0700 Subject: x86/dumpstack: When OOPSing, rewind the stack before do_exit() If we call do_exit() with a clean stack, we greatly reduce the risk of recursive oopses due to stack overflow in do_exit, and we allow do_exit to work even if we OOPS from an IST stack. The latter gives us a much better chance of surviving long enough after we detect a stack overflow to write out our logs. Signed-off-by: Andy Lutomirski Reviewed-by: Josh Poimboeuf Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/32f73ceb372ec61889598da5e5b145889b9f2e19.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 983e5d3..0b56666 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1153,3 +1153,14 @@ ENTRY(async_page_fault) jmp error_code END(async_page_fault) #endif + +ENTRY(rewind_stack_do_exit) + /* Prevent any naive code from trying to unwind to our caller. 
*/ + xorl %ebp, %ebp + + movl PER_CPU_VAR(cpu_current_top_of_stack), %esi + leal -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%esi), %esp + + call do_exit +1: jmp 1b +END(rewind_stack_do_exit) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 9ee0da1..b846875 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1423,3 +1423,14 @@ ENTRY(ignore_sysret) mov $-ENOSYS, %eax sysret END(ignore_sysret) + +ENTRY(rewind_stack_do_exit) + /* Prevent any naive code from trying to unwind to our caller. */ + xorl %ebp, %ebp + + movq PER_CPU_VAR(cpu_current_top_of_stack), %rax + leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + + call do_exit +1: jmp 1b +END(rewind_stack_do_exit) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index cc88e25..de8242d 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -234,6 +234,8 @@ unsigned long oops_begin(void) EXPORT_SYMBOL_GPL(oops_begin); NOKPROBE_SYMBOL(oops_begin); +void __noreturn rewind_stack_do_exit(int signr); + void oops_end(unsigned long flags, struct pt_regs *regs, int signr) { if (regs && kexec_should_crash(current)) @@ -255,7 +257,13 @@ void oops_end(unsigned long flags, struct pt_regs *regs, int signr) panic("Fatal exception in interrupt"); if (panic_on_oops) panic("Fatal exception"); - do_exit(signr); + + /* + * We're not going to return, but we might be on an IST stack or + * have very little stack space left. Rewind the stack and kill + * the task. + */ + rewind_stack_do_exit(signr); } NOKPROBE_SYMBOL(oops_end); -- cgit v0.10.2 From dfa9a942fd7951c8f333cf3f377dde51ebd21685 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:56 -0700 Subject: x86/uaccess: Move thread_info::uaccess_err and thread_info::sig_on_uaccess_err to thread_struct struct thread_info is a legacy mess. To prepare for its partial removal, move the uaccess control fields out -- they're straightforward. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/d0ac4d01c8e4d4d756264604e47445d5acc7900e.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 174c254..3aba2b0 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -221,8 +221,8 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) * With a real vsyscall, page faults cause SIGSEGV. We want to * preserve that behavior to make writing exploits harder. */ - prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; - current_thread_info()->sig_on_uaccess_error = 1; + prev_sig_on_uaccess_error = current->thread.sig_on_uaccess_error; + current->thread.sig_on_uaccess_error = 1; ret = -EFAULT; switch (vsyscall_nr) { @@ -243,7 +243,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; } - current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + current->thread.sig_on_uaccess_error = prev_sig_on_uaccess_error; check_fault: if (ret == -EFAULT) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 62c6cc3..f53ae57 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -419,6 +419,9 @@ struct thread_struct { /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + unsigned int sig_on_uaccess_error:1; + unsigned int uaccess_err:1; /* uaccess failed */ + /* Floating point and extended processor state */ struct fpu fpu; /* diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 30c133a..7c47bb6 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -58,8 +58,6 @@ struct thread_info { __u32 status; /* 
thread synchronous flags */ __u32 cpu; /* current CPU */ mm_segment_t addr_limit; - unsigned int sig_on_uaccess_error:1; - unsigned int uaccess_err:1; /* uaccess failed */ }; #define INIT_THREAD_INFO(tsk) \ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index d40ec72..8f66e56 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -487,13 +487,13 @@ struct __large_struct { unsigned long buf[100]; }; * uaccess_try and catch */ #define uaccess_try do { \ - current_thread_info()->uaccess_err = 0; \ + current->thread.uaccess_err = 0; \ __uaccess_begin(); \ barrier(); #define uaccess_catch(err) \ __uaccess_end(); \ - (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ + (err) |= (current->thread.uaccess_err ? -EFAULT : 0); \ } while (0) /** diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index 4bb53b8..0f90cc2 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -37,7 +37,7 @@ bool ex_handler_ext(const struct exception_table_entry *fixup, struct pt_regs *regs, int trapnr) { /* Special hack for uaccess_err */ - current_thread_info()->uaccess_err = 1; + current->thread.uaccess_err = 1; regs->ip = ex_fixup_addr(fixup); return true; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ca44e2e..69be03d 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -737,7 +737,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * In this case we need to make sure we're not recursively * faulting through the emulate_vsyscall() logic. 
*/ - if (current_thread_info()->sig_on_uaccess_error && signal) { + if (current->thread.sig_on_uaccess_error && signal) { tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; tsk->thread.cr2 = address; -- cgit v0.10.2 From 2a53ccbc0de1b1950aeedd24680f7eca65c86ff5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 Jul 2016 10:21:11 +0200 Subject: x86/dumpstack: Rename thread_struct::sig_on_uaccess_error to sig_on_uaccess_err Rename it to match the thread_struct::uaccess_err pattern and also because it was too long. Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c index 3aba2b0..75fc719 100644 --- a/arch/x86/entry/vsyscall/vsyscall_64.c +++ b/arch/x86/entry/vsyscall/vsyscall_64.c @@ -96,7 +96,7 @@ static bool write_ok_or_segv(unsigned long ptr, size_t size) { /* * XXX: if access_ok, get_user, and put_user handled - * sig_on_uaccess_error, this could go away. + * sig_on_uaccess_err, this could go away. */ if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { @@ -125,7 +125,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) struct task_struct *tsk; unsigned long caller; int vsyscall_nr, syscall_nr, tmp; - int prev_sig_on_uaccess_error; + int prev_sig_on_uaccess_err; long ret; /* @@ -221,8 +221,8 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) * With a real vsyscall, page faults cause SIGSEGV. We want to * preserve that behavior to make writing exploits harder. 
*/ - prev_sig_on_uaccess_error = current->thread.sig_on_uaccess_error; - current->thread.sig_on_uaccess_error = 1; + prev_sig_on_uaccess_err = current->thread.sig_on_uaccess_err; + current->thread.sig_on_uaccess_err = 1; ret = -EFAULT; switch (vsyscall_nr) { @@ -243,7 +243,7 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) break; } - current->thread.sig_on_uaccess_error = prev_sig_on_uaccess_error; + current->thread.sig_on_uaccess_err = prev_sig_on_uaccess_err; check_fault: if (ret == -EFAULT) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f53ae57..cbdfe5f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -419,7 +419,7 @@ struct thread_struct { /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; - unsigned int sig_on_uaccess_error:1; + unsigned int sig_on_uaccess_err:1; unsigned int uaccess_err:1; /* uaccess failed */ /* Floating point and extended processor state */ diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 69be03d..d22161a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -737,7 +737,7 @@ no_context(struct pt_regs *regs, unsigned long error_code, * In this case we need to make sure we're not recursively * faulting through the emulate_vsyscall() logic. */ - if (current->thread.sig_on_uaccess_error && signal) { + if (current->thread.sig_on_uaccess_err && signal) { tsk->thread.trap_nr = X86_TRAP_PF; tsk->thread.error_code = error_code | PF_USER; tsk->thread.cr2 = address; -- cgit v0.10.2 From 13d4ea097d18b419ad2a2b696063d44bf59acec0 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:57 -0700 Subject: x86/uaccess: Move thread_info::addr_limit to thread_struct struct thread_info is a legacy mess. To prepare for its partial removal, move thread_info::addr_limit out. As an added benefit, this way is simpler. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/15bee834d09402b47ac86f2feccdf6529f9bc5b0.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/checksum_32.h b/arch/x86/include/asm/checksum_32.h index 532f85e..7b53743 100644 --- a/arch/x86/include/asm/checksum_32.h +++ b/arch/x86/include/asm/checksum_32.h @@ -2,8 +2,7 @@ #define _ASM_X86_CHECKSUM_32_H #include - -#include +#include /* * computes the checksum of a memory block at buff, length len, diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index cbdfe5f..89314ed 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -371,6 +371,10 @@ extern unsigned int xstate_size; struct perf_event; +typedef struct { + unsigned long seg; +} mm_segment_t; + struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; @@ -419,6 +423,8 @@ struct thread_struct { /* Max allowed port in the bitmap, in bytes: */ unsigned io_bitmap_max; + mm_segment_t addr_limit; + unsigned int sig_on_uaccess_err:1; unsigned int uaccess_err:1; /* uaccess failed */ @@ -493,11 +499,6 @@ static inline void load_sp0(struct tss_struct *tss, #define set_iopl_mask native_set_iopl_mask #endif /* CONFIG_PARAVIRT */ -typedef struct { - unsigned long seg; -} mm_segment_t; - - /* Free all resources held by a thread. 
*/ extern void release_thread(struct task_struct *); @@ -719,6 +720,7 @@ static inline void spin_lock_prefetch(const void *x) .sp0 = TOP_OF_INIT_STACK, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ + .addr_limit = KERNEL_DS, \ } extern unsigned long thread_saved_pc(struct task_struct *tsk); @@ -768,8 +770,9 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP TASK_SIZE #define STACK_TOP_MAX TASK_SIZE_MAX -#define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK \ +#define INIT_THREAD { \ + .sp0 = TOP_OF_INIT_STACK, \ + .addr_limit = KERNEL_DS, \ } /* diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 7c47bb6..89bff04 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -57,7 +57,6 @@ struct thread_info { __u32 flags; /* low level flags */ __u32 status; /* thread synchronous flags */ __u32 cpu; /* current CPU */ - mm_segment_t addr_limit; }; #define INIT_THREAD_INFO(tsk) \ @@ -65,7 +64,6 @@ struct thread_info { .task = &tsk, \ .flags = 0, \ .cpu = 0, \ - .addr_limit = KERNEL_DS, \ } #define init_thread_info (init_thread_union.thread_info) @@ -184,11 +182,6 @@ static inline unsigned long current_stack_pointer(void) # define cpu_current_top_of_stack (cpu_tss + TSS_sp0) #endif -/* Load thread_info address into "reg" */ -#define GET_THREAD_INFO(reg) \ - _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE),reg ; - /* * ASM operand which evaluates to a 'thread_info' address of * the current task, if it is known that "reg" is exactly "off" diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8f66e56..c03bfb6 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -29,12 +29,12 @@ #define USER_DS MAKE_MM_SEG(TASK_SIZE_MAX) #define get_ds() (KERNEL_DS) -#define get_fs() (current_thread_info()->addr_limit) -#define set_fs(x) (current_thread_info()->addr_limit = (x)) +#define 
get_fs() (current->thread.addr_limit) +#define set_fs(x) (current->thread.addr_limit = (x)) #define segment_eq(a, b) ((a).seg == (b).seg) -#define user_addr_max() (current_thread_info()->addr_limit.seg) +#define user_addr_max() (current->thread.addr_limit.seg) #define __addr_ok(addr) \ ((unsigned long __force)(addr) < user_addr_max()) diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index 674134e..2bd5c6f 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -31,7 +31,9 @@ void common(void) { BLANK(); OFFSET(TI_flags, thread_info, flags); OFFSET(TI_status, thread_info, status); - OFFSET(TI_addr_limit, thread_info, addr_limit); + + BLANK(); + OFFSET(TASK_addr_limit, task_struct, thread.addr_limit); BLANK(); OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx); diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 2b0ef26..bf603eb 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -17,11 +17,11 @@ /* Standard copy_to_user with segment limit checking */ ENTRY(_copy_to_user) - GET_THREAD_INFO(%rax) + mov PER_CPU_VAR(current_task), %rax movq %rdi,%rcx addq %rdx,%rcx jc bad_to_user - cmpq TI_addr_limit(%rax),%rcx + cmpq TASK_addr_limit(%rax),%rcx ja bad_to_user ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ "jmp copy_user_generic_string", \ @@ -32,11 +32,11 @@ ENDPROC(_copy_to_user) /* Standard copy_from_user with segment limit checking */ ENTRY(_copy_from_user) - GET_THREAD_INFO(%rax) + mov PER_CPU_VAR(current_task), %rax movq %rsi,%rcx addq %rdx,%rcx jc bad_from_user - cmpq TI_addr_limit(%rax),%rcx + cmpq TASK_addr_limit(%rax),%rcx ja bad_from_user ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ "jmp copy_user_generic_string", \ diff --git a/arch/x86/lib/csum-wrappers_64.c b/arch/x86/lib/csum-wrappers_64.c index 28a6654..b6fcb9a 100644 --- a/arch/x86/lib/csum-wrappers_64.c +++ b/arch/x86/lib/csum-wrappers_64.c @@ -6,6 +6,7 @@ */ #include #include +#include 
#include /** diff --git a/arch/x86/lib/getuser.S b/arch/x86/lib/getuser.S index 46668cd..0ef5128 100644 --- a/arch/x86/lib/getuser.S +++ b/arch/x86/lib/getuser.S @@ -35,8 +35,8 @@ .text ENTRY(__get_user_1) - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 1: movzbl (%_ASM_AX),%edx @@ -48,8 +48,8 @@ ENDPROC(__get_user_1) ENTRY(__get_user_2) add $1,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 2: movzwl -1(%_ASM_AX),%edx @@ -61,8 +61,8 @@ ENDPROC(__get_user_2) ENTRY(__get_user_4) add $3,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 3: movl -3(%_ASM_AX),%edx @@ -75,8 +75,8 @@ ENTRY(__get_user_8) #ifdef CONFIG_X86_64 add $7,%_ASM_AX jc bad_get_user - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user ASM_STAC 4: movq -7(%_ASM_AX),%rdx @@ -86,8 +86,8 @@ ENTRY(__get_user_8) #else add $7,%_ASM_AX jc bad_get_user_8 - GET_THREAD_INFO(%_ASM_DX) - cmp TI_addr_limit(%_ASM_DX),%_ASM_AX + mov PER_CPU_VAR(current_task), %_ASM_DX + cmp TASK_addr_limit(%_ASM_DX),%_ASM_AX jae bad_get_user_8 ASM_STAC 4: movl -7(%_ASM_AX),%edx diff --git a/arch/x86/lib/putuser.S b/arch/x86/lib/putuser.S index e0817a1..c891ece 100644 --- a/arch/x86/lib/putuser.S +++ b/arch/x86/lib/putuser.S @@ -29,14 +29,14 @@ * as they get called from within inline assembly. 
*/ -#define ENTER GET_THREAD_INFO(%_ASM_BX) +#define ENTER mov PER_CPU_VAR(current_task), %_ASM_BX #define EXIT ASM_CLAC ; \ ret .text ENTRY(__put_user_1) ENTER - cmp TI_addr_limit(%_ASM_BX),%_ASM_CX + cmp TASK_addr_limit(%_ASM_BX),%_ASM_CX jae bad_put_user ASM_STAC 1: movb %al,(%_ASM_CX) @@ -46,7 +46,7 @@ ENDPROC(__put_user_1) ENTRY(__put_user_2) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $1,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user @@ -58,7 +58,7 @@ ENDPROC(__put_user_2) ENTRY(__put_user_4) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $3,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user @@ -70,7 +70,7 @@ ENDPROC(__put_user_4) ENTRY(__put_user_8) ENTER - mov TI_addr_limit(%_ASM_BX),%_ASM_BX + mov TASK_addr_limit(%_ASM_BX),%_ASM_BX sub $7,%_ASM_BX cmp %_ASM_BX,%_ASM_CX jae bad_put_user diff --git a/arch/x86/lib/usercopy_64.c b/arch/x86/lib/usercopy_64.c index 0a42327..9f760cd 100644 --- a/arch/x86/lib/usercopy_64.c +++ b/arch/x86/lib/usercopy_64.c @@ -6,7 +6,7 @@ * Copyright 2002 Andi Kleen */ #include -#include +#include /* * Zero Userspace diff --git a/drivers/pnp/isapnp/proc.c b/drivers/pnp/isapnp/proc.c index 5edee64..262285e 100644 --- a/drivers/pnp/isapnp/proc.c +++ b/drivers/pnp/isapnp/proc.c @@ -21,7 +21,7 @@ #include #include #include -#include +#include extern struct pnp_protocol isapnp_protocol; diff --git a/lib/bitmap.c b/lib/bitmap.c index c66da50..eca8808 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -14,9 +14,9 @@ #include #include #include +#include #include -#include /* * bitmaps provide an array of bits, implemented using an an -- cgit v0.10.2 From fb59831b496a5bb7d0a06c7e702d88d1757edfca Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:58 -0700 Subject: x86/smp: Remove stack_smp_processor_id() It serves no purpose -- raw_smp_processor_id() works fine. This change will be needed to move thread_info off the stack. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/a2bf4f07fbc30fb32f9f7f3f8f94ad3580823847.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h index 678637a..59d34c5 100644 --- a/arch/x86/include/asm/cpu.h +++ b/arch/x86/include/asm/cpu.h @@ -17,7 +17,6 @@ static inline void prefill_possible_map(void) {} #define cpu_physical_id(cpu) boot_cpu_physical_apicid #define safe_smp_processor_id() 0 -#define stack_smp_processor_id() 0 #endif /* CONFIG_SMP */ diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h index 66b0573..0576b61 100644 --- a/arch/x86/include/asm/smp.h +++ b/arch/x86/include/asm/smp.h @@ -172,12 +172,6 @@ extern int safe_smp_processor_id(void); #elif defined(CONFIG_X86_64_SMP) #define raw_smp_processor_id() (this_cpu_read(cpu_number)) -#define stack_smp_processor_id() \ -({ \ - struct thread_info *ti; \ - __asm__("andq %%rsp,%0; ":"=r" (ti) : "0" (CURRENT_MASK)); \ - ti->cpu; \ -}) #define safe_smp_processor_id() smp_processor_id() #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0fe6953f..d22a7b9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1452,7 +1452,7 @@ void cpu_init(void) struct task_struct *me; struct tss_struct *t; unsigned long v; - int cpu = stack_smp_processor_id(); + int cpu = raw_smp_processor_id(); int i; wait_for_master_cpu(cpu); -- cgit v0.10.2 From eb43e8f85fffc1ba535e0362a872101dfe48abe3 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Thu, 14 Jul 2016 13:22:59 -0700 Subject: x86/smp: Remove unnecessary initialization of thread_info::cpu It's statically initialized to zero -- no need to dynamically initialize it to zero as well. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/6cf6314dce3051371a913ee19d1b88e29c68c560.1468527351.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index fafe8b9..0e91dbe 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1285,7 +1285,6 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) cpumask_copy(cpu_callin_mask, cpumask_of(0)); mb(); - current_thread_info()->cpu = 0; /* needed? */ for_each_possible_cpu(i) { zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); -- cgit v0.10.2 From 3ebfd81f7fb3e81a754e37283b7f38c62244641a Mon Sep 17 00:00:00 2001 From: "H.J. Lu" Date: Thu, 14 Jul 2016 12:31:53 -0700 Subject: x86/syscalls: Add compat_sys_preadv64v2/compat_sys_pwritev64v2 Don't use the same syscall numbers for 2 different syscalls: 534 x32 preadv compat_sys_preadv64 535 x32 pwritev compat_sys_pwritev64 534 x32 preadv2 compat_sys_preadv2 535 x32 pwritev2 compat_sys_pwritev2 Add compat_sys_preadv64v2() and compat_sys_pwritev64v2() so that 64-bit offset is passed in one 64-bit register on x32, similar to compat_sys_preadv64() and compat_sys_pwritev64(). Signed-off-by: H.J. Lu Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Christoph Hellwig Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/CAMe9rOovCMf-RQfx_n1U_Tu_DX1BYkjtFr%3DQ4-_PFVSj9BCzUA@mail.gmail.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 555263e..e9ce9c7 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -374,5 +374,5 @@ 543 x32 io_setup compat_sys_io_setup 544 x32 io_submit compat_sys_io_submit 545 x32 execveat compat_sys_execveat/ptregs -534 x32 preadv2 compat_sys_preadv2 -535 x32 pwritev2 compat_sys_pwritev2 +546 x32 preadv2 compat_sys_preadv64v2 +547 x32 pwritev2 compat_sys_pwritev64v2 diff --git a/arch/x86/include/asm/unistd.h b/arch/x86/include/asm/unistd.h index 2b19caa..32712a9 100644 --- a/arch/x86/include/asm/unistd.h +++ b/arch/x86/include/asm/unistd.h @@ -26,6 +26,8 @@ # define __ARCH_WANT_COMPAT_SYS_GETDENTS64 # define __ARCH_WANT_COMPAT_SYS_PREADV64 # define __ARCH_WANT_COMPAT_SYS_PWRITEV64 +# define __ARCH_WANT_COMPAT_SYS_PREADV64V2 +# define __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 # endif diff --git a/fs/read_write.c b/fs/read_write.c index 933b53a..66215a7 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1168,6 +1168,15 @@ COMPAT_SYSCALL_DEFINE5(preadv, compat_ulong_t, fd, return do_compat_preadv64(fd, vec, vlen, pos, 0); } +#ifdef __ARCH_WANT_COMPAT_SYS_PREADV64V2 +COMPAT_SYSCALL_DEFINE5(preadv64v2, unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos, int, flags) +{ + return do_compat_preadv64(fd, vec, vlen, pos, flags); +} +#endif + COMPAT_SYSCALL_DEFINE6(preadv2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, @@ -1265,6 +1274,15 @@ COMPAT_SYSCALL_DEFINE5(pwritev, compat_ulong_t, fd, return do_compat_pwritev64(fd, vec, vlen, pos, 0); } +#ifdef __ARCH_WANT_COMPAT_SYS_PWRITEV64V2 +COMPAT_SYSCALL_DEFINE5(pwritev64v2, 
unsigned long, fd, + const struct compat_iovec __user *,vec, + unsigned long, vlen, loff_t, pos, int, flags) +{ + return do_compat_pwritev64(fd, vec, vlen, pos, flags); +} +#endif + COMPAT_SYSCALL_DEFINE6(pwritev2, compat_ulong_t, fd, const struct compat_iovec __user *,vec, compat_ulong_t, vlen, u32, pos_low, u32, pos_high, int, flags) -- cgit v0.10.2 From 530dd8d4b9daf77e3e5d145a26210d91ced954c7 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 22 Jul 2016 21:58:08 -0700 Subject: x86/mm/cpa: Fix populate_pgd(): Stop trying to deallocate failed PUDs Valdis Kletnieks bisected a boot failure back to this recent commit: 360cb4d15567 ("x86/mm/cpa: In populate_pgd(), don't set the PGD entry until it's populated") I broke the case where a PUD table got allocated -- populate_pud() would wander off a pgd_none entry and get lost. I'm not sure how this survived my testing. Fix the original issue in a much simpler way. The problem was that, if we allocated a PUD table, failed to populate it, and freed it, another CPU could potentially keep using the PGD entry we installed (either by copying it via vmalloc_fault or by speculatively caching it). There's a straightforward fix: simply leave the top-level entry in place if this happens. This can't waste any significant amount of memory -- there are at most 256 entries like this systemwide and, as a practical matter, if we hit this failure path repeatedly, we're likely to reuse the same page anyway. For context, this is a reversion with this hunk added in: if (ret < 0) { + /* + * Leave the PUD page in place in case some other CPU or thread + * already found it, but remove any useless entries we just + * added to it. + */ - unmap_pgd_range(cpa->pgd, addr, + unmap_pud_range(pgd_entry, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } This effectively open-codes what the now-deleted unmap_pgd_range() function used to do except that unmap_pgd_range() used to try to free the page as well. 
Reported-by: Valdis Kletnieks Signed-off-by: Andy Lutomirski Cc: Andrew Morton Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Luis R. Rodriguez Cc: Mike Krinkin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Toshi Kani Link: http://lkml.kernel.org/r/21cbc2822aa18aa812c0215f4231dbf5f65afa7f.1469249789.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 26c93c6..2bc6ea1 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1082,6 +1082,8 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pud = (pud_t *)get_zeroed_page(GFP_KERNEL | __GFP_NOTRACK); if (!pud) return -1; + + set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); } pgprot_val(pgprot) &= ~pgprot_val(cpa->mask_clr); @@ -1089,16 +1091,11 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) ret = populate_pud(cpa, addr, pgd_entry, pgprot); if (ret < 0) { - if (pud) - free_page((unsigned long)pud); unmap_pud_range(pgd_entry, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; } - if (pud) - set_pgd(pgd_entry, __pgd(__pa(pud) | _KERNPG_TABLE)); - cpa->numpages = ret; return 0; } -- cgit v0.10.2 From 55920d31f1e3fea06702c74271dd56c4fc9b70ca Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 23 Jul 2016 09:59:28 -0700 Subject: x86/mm/cpa: Add missing comment in populate_pgd() In commit: 530dd8d4b9da ("x86/mm/cpa: Fix populate_pgd(): Stop trying to deallocate failed PUDs") I intended to add this comment, but I failed at using git. 
Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/242baf8612394f4e31216f96d13c4d2e9b90d1b7.1469293159.git.luto@kernel.org Signed-off-by: Ingo Molnar diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 2bc6ea1..47870a5 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1091,6 +1091,11 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) ret = populate_pud(cpa, addr, pgd_entry, pgprot); if (ret < 0) { + /* + * Leave the PUD page in place in case some other CPU or thread + * already found it, but remove any useless entries we just + * added to it. + */ unmap_pud_range(pgd_entry, addr, addr + (cpa->numpages << PAGE_SHIFT)); return ret; -- cgit v0.10.2