From c73e36b775a777abd67a1e15481923fcbd2040e1 Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Fri, 3 Jul 2015 22:19:02 +0200
Subject: x86/asm/entry/32: Replace RESTORE_RSI_RDI with open-coded 32-bit
 reads

This doesn't change much, but uses shorter 32-bit insns:

        -48 8b 74 24 68         mov    0x68(%rsp),%rsi
        -48 8b 7c 24 70         mov    0x70(%rsp),%rdi
        -48 8b 54 24 60         mov    0x60(%rsp),%rdx
        +8b 54 24 60            mov    0x60(%rsp),%edx
        +8b 74 24 68            mov    0x68(%rsp),%esi
        +8b 7c 24 70            mov    0x70(%rsp),%edi

and does the loads in pt_regs order.

Since these are the only uses of RESTORE_RSI_RDI[_RDX], drop
these macros.

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Alexei Starovoitov <ast@plumgrid.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Will Drewry <wad@chromium.org>
Link: http://lkml.kernel.org/r/1435954742-2545-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index f4e6308..519207f 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -193,12 +193,6 @@ For 32-bit we have the following conventions - kernel is built with
 	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
 	RESTORE_C_REGS_HELPER 1,0,0,1,1
 	.endm
-	.macro RESTORE_RSI_RDI
-	RESTORE_C_REGS_HELPER 0,0,0,0,0
-	.endm
-	.macro RESTORE_RSI_RDI_RDX
-	RESTORE_C_REGS_HELPER 0,0,0,0,1
-	.endm
 
 	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
 	subq $-(15*8+\addskip), %rsp
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index bb187a6..b868cfc 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -140,7 +140,8 @@ sysexit_from_sys_call:
 	 */
 	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
 	movl	RIP(%rsp), %ecx		/* User %eip */
-	RESTORE_RSI_RDI
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
 	xorl	%edx, %edx		/* Do not leak kernel information */
 	xorq	%r8, %r8
 	xorq	%r9, %r9
@@ -366,7 +367,9 @@ cstar_dispatch:
 
 sysretl_from_sys_call:
 	andl	$~TS_COMPAT, ASM_THREAD_INFO(TI_status, %rsp, SIZEOF_PTREGS)
-	RESTORE_RSI_RDI_RDX
+	movl	RDX(%rsp), %edx
+	movl	RSI(%rsp), %esi
+	movl	RDI(%rsp), %edi
 	movl	RIP(%rsp), %ecx
 	movl	EFLAGS(%rsp), %r11d
 	xorq	%r10, %r10
-- 
cgit v0.10.2


From c6e5ca35c4685cd920b1d5279dbc9f4483d7dfd4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:43:55 +0200
Subject: x86/asm/tsc: Inline native_read_tsc() and remove __native_read_tsc()

In the following commit:

  cdc7957d1954 ("x86: move native_read_tsc() offline")

... native_read_tsc() was moved out of line, presumably for some
now-obsolete vDSO-related reason. Undo it.

The entire rdtsc, shl, or sequence is only 11 bytes, and calls
via rdtscl() and similar helpers were already inlined.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/d05ffe2aaf8468ca475ebc00efad7b2fa174af19.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 9793322..972b488 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -186,7 +186,7 @@ notrace static cycle_t vread_tsc(void)
 	 * but no one has ever seen it happen.
 	 */
 	rdtsc_barrier();
-	ret = (cycle_t)__native_read_tsc();
+	ret = (cycle_t)native_read_tsc();
 
 	last = gtod->cycle_last;
 
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index e6a707e..8871147 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -106,12 +106,10 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
 	return err;
 }
 
-extern unsigned long long native_read_tsc(void);
-
 extern int rdmsr_safe_regs(u32 regs[8]);
 extern int wrmsr_safe_regs(u32 regs[8]);
 
-static __always_inline unsigned long long __native_read_tsc(void)
+static __always_inline unsigned long long native_read_tsc(void)
 {
 	DECLARE_ARGS(val, low, high);
 
@@ -181,10 +179,10 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 }
 
 #define rdtscl(low)						\
-	((low) = (u32)__native_read_tsc())
+	((low) = (u32)native_read_tsc())
 
 #define rdtscll(val)						\
-	((val) = __native_read_tsc())
+	((val) = native_read_tsc())
 
 #define rdpmc(counter, low, high)			\
 do {							\
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 628954c..2bd69d6 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 static __always_inline
 u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
 {
-	u64 delta = __native_read_tsc() - src->tsc_timestamp;
+	u64 delta = native_read_tsc() - src->tsc_timestamp;
 	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
 				   src->tsc_shift);
 }
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index c2e00bb..bc5fa2a 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)
 	 * on during the bootup the random pool has true entropy too.
 	 */
 	get_random_bytes(&canary, sizeof(canary));
-	tsc = __native_read_tsc();
+	tsc = native_read_tsc();
 	canary += tsc + (tsc << 32UL);
 
 	current->stack_canary = canary;
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 94605c0..fd11128 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -42,7 +42,7 @@ static __always_inline cycles_t vget_cycles(void)
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	return (cycles_t)__native_read_tsc();
+	return (cycles_t)native_read_tsc();
 }
 
 extern void tsc_init(void);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index ede92c3..9fe111c 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
 	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
-	t1 = __native_read_tsc();
+	t1 = native_read_tsc();
 
 	do {
 		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
-	t2 = __native_read_tsc();
+	t2 = native_read_tsc();
 
 	shift = 5;
 	if (unlikely(loop >> shift == 0)) {
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 5054497..e7710cd 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -308,12 +308,6 @@ unsigned long long
 sched_clock(void) __attribute__((alias("native_sched_clock")));
 #endif
 
-unsigned long long native_read_tsc(void)
-{
-	return __native_read_tsc();
-}
-EXPORT_SYMBOL(native_read_tsc);
-
 int check_tsc_unstable(void)
 {
 	return tsc_unstable;
-- 
cgit v0.10.2


From 881d7bf843d7139c6dfbffdec4903b3354423c49 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:43:56 +0200
Subject: x86/asm/tsc, kvm: Remove vget_cycles()

The only caller was KVM's read_tsc(). The only difference
between vget_cycles() and native_read_tsc() was that
vget_cycles() returned zero instead of crashing on TSC-less
systems. KVM already checks vclock_mode() before calling that
function, so the extra check is unnecessary. Also, KVM
(host-side) requires the TSC to exist.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Paolo Bonzini <pbonzini@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/20615df14ae2eb713ea7a5f5123c1dc4c7ca993d.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index fd11128..3da1cc1 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -32,19 +32,6 @@ static inline cycles_t get_cycles(void)
 	return ret;
 }
 
-static __always_inline cycles_t vget_cycles(void)
-{
-	/*
-	 * We only do VDSOs on TSC capable CPUs, so this shouldn't
-	 * access boot_cpu_data (which is not VDSO-safe):
-	 */
-#ifndef CONFIG_X86_TSC
-	if (!cpu_has_tsc)
-		return 0;
-#endif
-	return (cycles_t)native_read_tsc();
-}
-
 extern void tsc_init(void);
 extern void mark_tsc_unstable(char *reason);
 extern int unsynchronized_tsc(void);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bbaf44e..f771058 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1455,7 +1455,7 @@ static cycle_t read_tsc(void)
 	 * but no one has ever seen it happen.
 	 */
 	rdtsc_barrier();
-	ret = (cycle_t)vget_cycles();
+	ret = (cycle_t)native_read_tsc();
 
 	last = pvclock_gtod_data.clock.cycle_last;
 
-- 
cgit v0.10.2


From 9261e050b686c9fe229cd9918d997b3caaf20e34 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:43:57 +0200
Subject: x86/asm/tsc, x86/paravirt: Remove read_tsc() and read_tscp() paravirt
 hooks

We've had ->read_tsc() and ->read_tscp() paravirt hooks since
the very beginning of paravirt, i.e.,

  d3561b7fa0fb ("[PATCH] paravirt: header and stubs for paravirtualisation").

AFAICT, the only paravirt guest implementation that ever
replaced these calls was vmware, and it's gone. Arguably even
vmware shouldn't have hooked RDTSC -- we fully support systems
that don't have a TSC at all, so there's no point for a paravirt
implementation to pretend that we have a TSC but to replace it.

I also doubt that these hooks actually worked. Calls to rdtscl()
and rdtscll(), which respected the hooks, were used seemingly
interchangeably with native_read_tsc(), which did not.

Just remove them. If anyone ever needs them again, they can try
to make a case for why they need them.

Before, on a paravirt config:
  text    	data     bss     dec     hex filename
  12618257      1816384 1093632 15528273 ecf151 vmlinux

After:
  text		data     bss     dec     hex filename
  12617207      1816384 1093632 15527223 eced37 vmlinux

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: virtualization@lists.linux-foundation.org
Link: http://lkml.kernel.org/r/d08a2600fb298af163681e5efd8e599d889a5b97.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 8871147..d1afac7 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -178,12 +178,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-#define rdtscl(low)						\
-	((low) = (u32)native_read_tsc())
-
-#define rdtscll(val)						\
-	((val) = native_read_tsc())
-
 #define rdpmc(counter, low, high)			\
 do {							\
 	u64 _l = native_read_pmc((counter));		\
@@ -193,6 +187,14 @@ do {							\
 
 #define rdpmcl(counter, val) ((val) = native_read_pmc(counter))
 
+#endif	/* !CONFIG_PARAVIRT */
+
+#define rdtscl(low)						\
+	((low) = (u32)native_read_tsc())
+
+#define rdtscll(val)						\
+	((val) = native_read_tsc())
+
 #define rdtscp(low, high, aux)					\
 do {                                                            \
 	unsigned long long _val = native_read_tscp(&(aux));     \
@@ -202,8 +204,6 @@ do {                                                            \
 
 #define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
 
-#endif	/* !CONFIG_PARAVIRT */
-
 /*
  * 64-bit version of wrmsr_safe():
  */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d143bfa..c2be037 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -174,19 +174,6 @@ static inline int rdmsrl_safe(unsigned msr, unsigned long long *p)
 	return err;
 }
 
-static inline u64 paravirt_read_tsc(void)
-{
-	return PVOP_CALL0(u64, pv_cpu_ops.read_tsc);
-}
-
-#define rdtscl(low)				\
-do {						\
-	u64 _l = paravirt_read_tsc();		\
-	low = (int)_l;				\
-} while (0)
-
-#define rdtscll(val) (val = paravirt_read_tsc())
-
 static inline unsigned long long paravirt_sched_clock(void)
 {
 	return PVOP_CALL0(unsigned long long, pv_time_ops.sched_clock);
@@ -215,27 +202,6 @@ do {						\
 
 #define rdpmcl(counter, val) ((val) = paravirt_read_pmc(counter))
 
-static inline unsigned long long paravirt_rdtscp(unsigned int *aux)
-{
-	return PVOP_CALL1(u64, pv_cpu_ops.read_tscp, aux);
-}
-
-#define rdtscp(low, high, aux)				\
-do {							\
-	int __aux;					\
-	unsigned long __val = paravirt_rdtscp(&__aux);	\
-	(low) = (u32)__val;				\
-	(high) = (u32)(__val >> 32);			\
-	(aux) = __aux;					\
-} while (0)
-
-#define rdtscpll(val, aux)				\
-do {							\
-	unsigned long __aux; 				\
-	val = paravirt_rdtscp(&__aux);			\
-	(aux) = __aux;					\
-} while (0)
-
 static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
 {
 	PVOP_VCALL2(pv_cpu_ops.alloc_ldt, ldt, entries);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index a6b8f9f..ce029e4 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -156,9 +156,7 @@ struct pv_cpu_ops {
 	u64 (*read_msr)(unsigned int msr, int *err);
 	int (*write_msr)(unsigned int msr, unsigned low, unsigned high);
 
-	u64 (*read_tsc)(void);
 	u64 (*read_pmc)(int counter);
-	unsigned long long (*read_tscp)(unsigned int *aux);
 
 #ifdef CONFIG_X86_32
 	/*
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 58bcfb6..f68e48f 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -351,9 +351,7 @@ __visible struct pv_cpu_ops pv_cpu_ops = {
 	.wbinvd = native_wbinvd,
 	.read_msr = native_read_msr_safe,
 	.write_msr = native_write_msr_safe,
-	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
-	.read_tscp = native_read_tscp,
 	.load_tr_desc = native_load_tr_desc,
 	.set_ldt = native_set_ldt,
 	.load_gdt = native_load_gdt,
diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c
index e1b0136..c89f50a 100644
--- a/arch/x86/kernel/paravirt_patch_32.c
+++ b/arch/x86/kernel/paravirt_patch_32.c
@@ -10,7 +10,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "mov %cr2, %eax");
 DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3");
 DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax");
 DEF_NATIVE(pv_cpu_ops, clts, "clts");
-DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc");
 
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)");
@@ -52,7 +51,6 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf,
 		PATCH_SITE(pv_mmu_ops, read_cr3);
 		PATCH_SITE(pv_mmu_ops, write_cr3);
 		PATCH_SITE(pv_cpu_ops, clts);
-		PATCH_SITE(pv_cpu_ops, read_tsc);
 #if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS)
 		case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock):
 			if (pv_is_native_spin_unlock()) {
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 0b95c9b..32136bf 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1175,11 +1175,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
 	.read_msr = xen_read_msr_safe,
 	.write_msr = xen_write_msr_safe,
 
-	.read_tsc = native_read_tsc,
 	.read_pmc = native_read_pmc,
 
-	.read_tscp = native_read_tscp,
-
 	.iret = xen_iret,
 #ifdef CONFIG_X86_64
 	.usergs_sysret32 = xen_sysret32,
-- 
cgit v0.10.2


From 87be28aaf1458445d5f648688c2eec0f13b8f3b9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:43:58 +0200
Subject: x86/asm/tsc: Replace rdtscll() with native_read_tsc()

Now that the ->read_tsc() paravirt hook is gone, rdtscll() is
just a wrapper around native_read_tsc(). Unwrap it.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/d2449ae62c1b1fb90195bcfb19ef4a35883a04dc.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index d7b1f65..ea33236 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
 
 	if (has_cpuflag(X86_FEATURE_TSC)) {
 		debug_putstr(" RDTSC");
-		rdtscll(raw);
+		raw = native_read_tsc();
 
 		random ^= raw;
 		use_i8254 = false;
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index d1afac7..7273b74 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -192,9 +192,6 @@ do {							\
 #define rdtscl(low)						\
 	((low) = (u32)native_read_tsc())
 
-#define rdtscll(val)						\
-	((val) = native_read_tsc())
-
 #define rdtscp(low, high, aux)					\
 do {                                                            \
 	unsigned long long _val = native_read_tscp(&(aux));     \
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 3da1cc1..b488390 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -21,15 +21,12 @@ extern void disable_TSC(void);
 
 static inline cycles_t get_cycles(void)
 {
-	unsigned long long ret = 0;
-
 #ifndef CONFIG_X86_TSC
 	if (!cpu_has_tsc)
 		return 0;
 #endif
-	rdtscll(ret);
 
-	return ret;
+	return native_read_tsc();
 }
 
 extern void tsc_init(void);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 9fe111c..25efa53 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
 
 	/* Verify whether apbt counter works */
 	t1 = dw_apb_clocksource_read(clocksource_apbt);
-	rdtscll(start);
+	start = native_read_tsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = native_read_tsc();
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index dcb5285..51af1ed 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;
 
-	rdtscll(tsc);
+	tsc = native_read_tsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
 }
@@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	unsigned long pm = acpi_pm_read_early();
 
 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = native_read_tsc();
 
 	switch (lapic_cal_loops++) {
 	case 0:
@@ -1209,7 +1209,7 @@ void setup_local_APIC(void)
 	long long max_loops = cpu_khz ? cpu_khz : 1000000;
 
 	if (cpu_has_tsc)
-		rdtscll(tsc);
+		tsc = native_read_tsc();
 
 	if (disable_apic) {
 		disable_ioapic_support();
@@ -1293,7 +1293,7 @@ void setup_local_APIC(void)
 		}
 		if (queued) {
 			if (cpu_has_tsc && cpu_khz) {
-				rdtscll(ntsc);
+				ntsc = native_read_tsc();
 				max_loops = (cpu_khz << 10) - (ntsc - tsc);
 			} else
 				max_loops--;
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index df919ff..a5283d2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -125,7 +125,7 @@ void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
-	rdtscll(m->tsc);
+	m->tsc = native_read_tsc();
 	/* We hope get_seconds stays lockless */
 	m->time = get_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -1784,7 +1784,7 @@ static void collect_tscs(void *data)
 {
 	unsigned long *cpu_tsc = (unsigned long *)data;
 
-	rdtscll(cpu_tsc[smp_processor_id()]);
+	cpu_tsc[smp_processor_id()] = native_read_tsc();
 }
 
 static int mce_apei_read_done;
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index f5d0730..334a2a9 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -110,7 +110,7 @@ static void init_espfix_random(void)
 	 */
 	if (!arch_get_random_long(&rand)) {
 		/* The constant is an arbitrary large prime */
-		rdtscll(rand);
+		rand = native_read_tsc();
 		rand *= 0xc345c6b72fd16123UL;
 	}
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 10757d0..cc390fe 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -735,7 +735,7 @@ static int hpet_clocksource_register(void)
 
 	/* Verify whether hpet counter works */
 	t1 = hpet_readl(HPET_COUNTER);
-	rdtscll(start);
+	start = native_read_tsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -745,7 +745,7 @@ static int hpet_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		rdtscll(now);
+		now = native_read_tsc();
 	} while ((now - start) < 200000UL);
 
 	if (t1 == hpet_readl(HPET_COUNTER)) {
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index 25b9937..bd8f4d4 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -15,7 +15,7 @@ u64 notrace trace_clock_x86_tsc(void)
 	u64 ret;
 
 	rdtsc_barrier();
-	rdtscll(ret);
+	ret = native_read_tsc();
 
 	return ret;
 }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index e7710cd..e66f5dc 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 	data = cyc2ns_write_begin(cpu);
 
-	rdtscll(tsc_now);
+	tsc_now = native_read_tsc();
 	ns_now = cycles_2_ns(tsc_now);
 
 	/*
@@ -290,7 +290,7 @@ u64 native_sched_clock(void)
 	}
 
 	/* read the Time Stamp Counter: */
-	rdtscll(tsc_now);
+	tsc_now = native_read_tsc();
 
 	/* return the value in ns */
 	return cycles_2_ns(tsc_now);
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index e856dd5..4fa1cca 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
 {
 	u64 host_tsc, tsc_offset;
 
-	rdtscll(host_tsc);
+	host_tsc = native_read_tsc();
 	tsc_offset = vmcs_read64(TSC_OFFSET);
 	return host_tsc + tsc_offset;
 }
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 39d6a3d..9a52ad0 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -100,7 +100,7 @@ void use_tsc_delay(void)
 int read_current_timer(unsigned long *timer_val)
 {
 	if (delay_fn == delay_tsc) {
-		rdtscll(*timer_val);
+		*timer_val = native_read_tsc();
 		return 0;
 	}
 	return -1;
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index 5820e85..ab13448 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
 
 	/* check result for the last window */
 	msr_now = pkg_state_counter();
-	rdtscll(tsc_now);
+	tsc_now = native_read_tsc();
 
 	/* calculate pkg cstate vs tsc ratio */
 	if (!msr_last || !tsc_last)
@@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy)
 	u64 val64;
 
 	msr_now = pkg_state_counter();
-	rdtscll(tsc_now);
+	tsc_now = native_read_tsc();
 	jiffies_now = jiffies;
 
 	/* calculate pkg cstate vs tsc ratio */
diff --git a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
index 5224ee5..f02b0c0 100644
--- a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
+++ b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
@@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void)
 
 	printk(KERN_DEBUG "start--> \n");
 	then = read_pmtmr();
-        rdtscll(then_tsc);
+	then_tsc = native_read_tsc();
 	for (i=0;i<20;i++) {
 		mdelay(100);
 		now = read_pmtmr();
-		rdtscll(now_tsc);
+		now_tsc = native_read_tsc();
 		diff = (now - then) & 0xFFFFFF;
 		diff_tsc = now_tsc - then_tsc;
 		printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);
-- 
cgit v0.10.2


From ec69de52c648b1d9416a810943e68dbe9fe519f4 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:43:59 +0200
Subject: x86/asm/tsc: Remove the rdtscp() and rdtscpll() macros

They have no users. Leave native_read_tscp() which seems
potentially useful despite also having no callers.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/6abfa3ef80534b5d73898a48c4d25e069303cbe5.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 7273b74..626f781 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -192,15 +192,6 @@ do {							\
 #define rdtscl(low)						\
 	((low) = (u32)native_read_tsc())
 
-#define rdtscp(low, high, aux)					\
-do {                                                            \
-	unsigned long long _val = native_read_tscp(&(aux));     \
-	(low) = (u32)_val;                                      \
-	(high) = (u32)(_val >> 32);                             \
-} while (0)
-
-#define rdtscpll(val, aux) (val) = native_read_tscp(&(aux))
-
 /*
  * 64-bit version of wrmsr_safe():
  */
-- 
cgit v0.10.2


From 9cfa1a0279e22063a727fd204a75cf3672860d83 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:00 +0200
Subject: x86/asm/tsc: Use the full 64-bit TSC in delay_tsc()

As a very minor optimization, delay_tsc() was only using the low
32 bits of the TSC. It's a delay function, so just use the whole
thing.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/bd1a277c71321b67c4794970cb5ace05efe21ab6.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 9a52ad0..35115f3 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -49,16 +49,16 @@ static void delay_loop(unsigned long loops)
 /* TSC based delay: */
 static void delay_tsc(unsigned long __loops)
 {
-	u32 bclock, now, loops = __loops;
+	u64 bclock, now, loops = __loops;
 	int cpu;
 
 	preempt_disable();
 	cpu = smp_processor_id();
 	rdtsc_barrier();
-	rdtscl(bclock);
+	bclock = native_read_tsc();
 	for (;;) {
 		rdtsc_barrier();
-		rdtscl(now);
+		now = native_read_tsc();
 		if ((now - bclock) >= loops)
 			break;
 
@@ -80,7 +80,7 @@ static void delay_tsc(unsigned long __loops)
 			loops -= (now - bclock);
 			cpu = smp_processor_id();
 			rdtsc_barrier();
-			rdtscl(bclock);
+			bclock = native_read_tsc();
 		}
 	}
 	preempt_enable();
-- 
cgit v0.10.2


From 3796366614598e48edf0561b86f18c230a7debc8 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:01 +0200
Subject: x86/asm/tsc, x86/cpu/amd: Use the full 64-bit TSC to detect the 2.6.2
 bug

This code is timing 100k indirect calls, so the added overhead
of counting the number of cycles elapsed as a 64-bit number
should be insignificant.  Drop the optimization of using a
32-bit count.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/d58f339a9c0dd8352b50d2f7a216f67ec2844f20.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index dd3a4ba..a69710d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -114,7 +114,7 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 		const int K6_BUG_LOOP = 1000000;
 		int n;
 		void (*f_vide)(void);
-		unsigned long d, d2;
+		u64 d, d2;
 
 		printk(KERN_INFO "AMD K6 stepping B detected - ");
 
@@ -125,10 +125,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 
 		n = K6_BUG_LOOP;
 		f_vide = vide;
-		rdtscl(d);
+		d = native_read_tsc();
 		while (n--)
 			f_vide();
-		rdtscl(d2);
+		d2 = native_read_tsc();
 		d = d2-d;
 
 		if (d > 20*K6_BUG_LOOP)
-- 
cgit v0.10.2


From e18d1f8df176527332761ac29ee3097f8584c478 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:02 +0200
Subject: x86/asm/tsc, drivers/net/hamradio/baycom_epp: Replace rdtscl() with
 native_read_tsc()

This is only used if BAYCOM_DEBUG is defined.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Thomas Sailer <t.sailer@alumni.ethz.ch
Acked-by: Walter Harms <wharms@bfs.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: linux-hams@vger.kernel.org
Link: http://lkml.kernel.org/r/1195ce0c7f34169ff3006341b77806184a46b9bf.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 83c7cce..44e5c3b 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -638,7 +638,7 @@ static int receive(struct net_device *dev, int cnt)
 #define GETTICK(x)                                                \
 ({                                                                \
 	if (cpu_has_tsc)                                          \
-		rdtscl(x);                                        \
+		x = (unsigned int)native_read_tsc();		  \
 })
 #else /* __i386__ */
 #define GETTICK(x)
-- 
cgit v0.10.2


From 3a2c16c8489d967de10b3b7f5cc0f7cab4337770 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:03 +0200
Subject: x86/asm/tsc, staging/lirc_serial: Remove TSC-based timing

It wasn't compiled in by default. I suspect that the driver was
and still is broken, though -- it's calling udelay with a
parameter that's derived from loops_per_jiffy.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Jarod Wilson <jarod@wilsonet.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: devel@driverdev.osuosl.org
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/c95df47c5405b494d19d20b2852a9378c9f661f3.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/drivers/staging/media/lirc/lirc_serial.c b/drivers/staging/media/lirc/lirc_serial.c
index dc79844..465796a 100644
--- a/drivers/staging/media/lirc/lirc_serial.c
+++ b/drivers/staging/media/lirc/lirc_serial.c
@@ -327,9 +327,6 @@ static void safe_udelay(unsigned long usecs)
  * time
  */
 
-/* So send_pulse can quickly convert microseconds to clocks */
-static unsigned long conv_us_to_clocks;
-
 static int init_timing_params(unsigned int new_duty_cycle,
 		unsigned int new_freq)
 {
@@ -344,7 +341,6 @@ static int init_timing_params(unsigned int new_duty_cycle,
 	/* How many clocks in a microsecond?, avoiding long long divide */
 	work = loops_per_sec;
 	work *= 4295;  /* 4295 = 2^32 / 1e6 */
-	conv_us_to_clocks = work >> 32;
 
 	/*
 	 * Carrier period in clocks, approach good up to 32GHz clock,
@@ -357,10 +353,9 @@ static int init_timing_params(unsigned int new_duty_cycle,
 	pulse_width = period * duty_cycle / 100;
 	space_width = period - pulse_width;
 	dprintk("in init_timing_params, freq=%d, duty_cycle=%d, "
-		"clk/jiffy=%ld, pulse=%ld, space=%ld, "
-		"conv_us_to_clocks=%ld\n",
+		"clk/jiffy=%ld, pulse=%ld, space=%ld\n",
 		freq, duty_cycle, __this_cpu_read(cpu_info.loops_per_jiffy),
-		pulse_width, space_width, conv_us_to_clocks);
+		pulse_width, space_width);
 	return 0;
 }
 #else /* ! USE_RDTSC */
@@ -431,63 +426,14 @@ static long send_pulse_irdeo(unsigned long length)
 	return ret;
 }
 
-#ifdef USE_RDTSC
-/* Version that uses Pentium rdtsc instruction to measure clocks */
-
-/*
- * This version does sub-microsecond timing using rdtsc instruction,
- * and does away with the fudged LIRC_SERIAL_TRANSMITTER_LATENCY
- * Implicitly i586 architecture...  - Steve
- */
-
-static long send_pulse_homebrew_softcarrier(unsigned long length)
-{
-	int flag;
-	unsigned long target, start, now;
-
-	/* Get going quick as we can */
-	rdtscl(start);
-	on();
-	/* Convert length from microseconds to clocks */
-	length *= conv_us_to_clocks;
-	/* And loop till time is up - flipping at right intervals */
-	now = start;
-	target = pulse_width;
-	flag = 1;
-	/*
-	 * FIXME: This looks like a hard busy wait, without even an occasional,
-	 * polite, cpu_relax() call.  There's got to be a better way?
-	 *
-	 * The i2c code has the result of a lot of bit-banging work, I wonder if
-	 * there's something there which could be helpful here.
-	 */
-	while ((now - start) < length) {
-		/* Delay till flip time */
-		do {
-			rdtscl(now);
-		} while ((now - start) < target);
-
-		/* flip */
-		if (flag) {
-			rdtscl(now);
-			off();
-			target += space_width;
-		} else {
-			rdtscl(now); on();
-			target += pulse_width;
-		}
-		flag = !flag;
-	}
-	rdtscl(now);
-	return ((now - start) - length) / conv_us_to_clocks;
-}
-#else /* ! USE_RDTSC */
 /* Version using udelay() */
 
 /*
  * here we use fixed point arithmetic, with 8
  * fractional bits.  that gets us within 0.1% or so of the right average
  * frequency, albeit with some jitter in pulse length - Steve
+ *
+ * This should use ndelay instead.
  */
 
 /* To match 8 fractional bits used for pulse/space length */
@@ -520,7 +466,6 @@ static long send_pulse_homebrew_softcarrier(unsigned long length)
 	}
 	return (actual-length) >> 8;
 }
-#endif /* USE_RDTSC */
 
 static long send_pulse_homebrew(unsigned long length)
 {
-- 
cgit v0.10.2


From 016bfc449a88c833e949414a41748b359843dbb1 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:04 +0200
Subject: x86/asm/tsc, input/joystick/analog: Switch from rdtscl() to
 native_read_tsc()

This timing code is hideous, and this doesn't help.  It gets rid
of one of the last users of rdtscl(), though.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: linux-input@vger.kernel.org
Link: http://lkml.kernel.org/r/90d19b3cea0e05ca6f333d1598daa38afb993260.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index 4284080..f871b4f 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -143,7 +143,7 @@ struct analog_port {
 
 #include <linux/i8253.h>
 
-#define GET_TIME(x)	do { if (cpu_has_tsc) rdtscl(x); else x = get_time_pit(); } while (0)
+#define GET_TIME(x)	do { if (cpu_has_tsc) x = (unsigned int)native_read_tsc(); else x = get_time_pit(); } while (0)
 #define DELTA(x,y)	(cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0)))
 #define TIME_NAME	(cpu_has_tsc?"TSC":"PIT")
 static unsigned int get_time_pit(void)
@@ -160,7 +160,7 @@ static unsigned int get_time_pit(void)
         return count;
 }
 #elif defined(__x86_64__)
-#define GET_TIME(x)	rdtscl(x)
+#define GET_TIME(x)	do { x = (unsigned int)native_read_tsc(); } while (0)
 #define DELTA(x,y)	((y)-(x))
 #define TIME_NAME	"TSC"
 #elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
-- 
cgit v0.10.2


From 732f374ba50b64150bf954c2d4e9f6fae583cccf Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:05 +0200
Subject: x86/asm/tsc, drivers/input/gameport: Replace rdtscl() with
 native_read_tsc()

It's unclear to me why this code exists in the first place.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Cc: linux-input@vger.kernel.org
Link: http://lkml.kernel.org/r/9e058e72f4cf1f13c6483c1360b39c3d188a2c2a.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index e853a21..abc0cb2 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -149,9 +149,9 @@ static int old_gameport_measure_speed(struct gameport *gameport)
 
 	for(i = 0; i < 50; i++) {
 		local_irq_save(flags);
-		rdtscl(t1);
+		t1 = native_read_tsc();
 		for (t = 0; t < 50; t++) gameport_read(gameport);
-		rdtscl(t2);
+		t2 = native_read_tsc();
 		local_irq_restore(flags);
 		udelay(i * 10);
 		if (t2 - t1 < tx) tx = t2 - t1;
-- 
cgit v0.10.2


From fe47ae6e1a5005b2e82f7eab57b5c3820453293a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:06 +0200
Subject: x86/asm/tsc: Remove rdtscl()

It has no more callers, and it was never a very sensible
interface to begin with. Users of the TSC should either read all
64 bits or explicitly throw out the high bits.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/250105f7cee519be9d7fc4464b5784caafc8f4fe.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 626f781..c89ed6c 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -189,9 +189,6 @@ do {							\
 
 #endif	/* !CONFIG_PARAVIRT */
 
-#define rdtscl(low)						\
-	((low) = (u32)native_read_tsc())
-
 /*
  * 64-bit version of wrmsr_safe():
  */
-- 
cgit v0.10.2


From 4ea1636b04dbd66536fa387bae2eea463efc705b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:07 +0200
Subject: x86/asm/tsc: Rename native_read_tsc() to rdtsc()

Now that there is no paravirt TSC, the "native" is
inappropriate. The function does RDTSC, so give it the obvious
name: rdtsc().

Suggested-by: Borislav Petkov <bp@suse.de>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/fd43e16281991f096c1e4d21574d9e1402c62d39.1434501121.git.luto@kernel.org
[ Ported it to v4.2-rc1. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/boot/compressed/aslr.c b/arch/x86/boot/compressed/aslr.c
index ea33236..6a9b96b 100644
--- a/arch/x86/boot/compressed/aslr.c
+++ b/arch/x86/boot/compressed/aslr.c
@@ -82,7 +82,7 @@ static unsigned long get_random_long(void)
 
 	if (has_cpuflag(X86_FEATURE_TSC)) {
 		debug_putstr(" RDTSC");
-		raw = native_read_tsc();
+		raw = rdtsc();
 
 		random ^= raw;
 		use_i8254 = false;
diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 972b488..0340d93 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -186,7 +186,7 @@ notrace static cycle_t vread_tsc(void)
 	 * but no one has ever seen it happen.
 	 */
 	rdtsc_barrier();
-	ret = (cycle_t)native_read_tsc();
+	ret = (cycle_t)rdtsc();
 
 	last = gtod->cycle_last;
 
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index c89ed6c..ff0c120 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -109,7 +109,16 @@ notrace static inline int native_write_msr_safe(unsigned int msr,
 extern int rdmsr_safe_regs(u32 regs[8]);
 extern int wrmsr_safe_regs(u32 regs[8]);
 
-static __always_inline unsigned long long native_read_tsc(void)
+/**
+ * rdtsc() - returns the current TSC without ordering constraints
+ *
+ * rdtsc() returns the result of RDTSC as a 64-bit integer.  The
+ * only ordering constraint it supplies is the ordering implied by
+ * "asm volatile": it will put the RDTSC in the place you expect.  The
+ * CPU can and will speculatively execute that RDTSC, though, so the
+ * results can be non-monotonic if compared on different CPUs.
+ */
+static __always_inline unsigned long long rdtsc(void)
 {
 	DECLARE_ARGS(val, low, high);
 
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 2bd69d6..5c490db 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 static __always_inline
 u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
 {
-	u64 delta = native_read_tsc() - src->tsc_timestamp;
+	u64 delta = rdtsc() - src->tsc_timestamp;
 	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
 				   src->tsc_shift);
 }
diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h
index bc5fa2a..58505f0 100644
--- a/arch/x86/include/asm/stackprotector.h
+++ b/arch/x86/include/asm/stackprotector.h
@@ -72,7 +72,7 @@ static __always_inline void boot_init_stack_canary(void)
 	 * on during the bootup the random pool has true entropy too.
 	 */
 	get_random_bytes(&canary, sizeof(canary));
-	tsc = native_read_tsc();
+	tsc = rdtsc();
 	canary += tsc + (tsc << 32UL);
 
 	current->stack_canary = canary;
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index b488390..3df7675 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -26,7 +26,7 @@ static inline cycles_t get_cycles(void)
 		return 0;
 #endif
 
-	return native_read_tsc();
+	return rdtsc();
 }
 
 extern void tsc_init(void);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 25efa53..222a570 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -263,7 +263,7 @@ static int apbt_clocksource_register(void)
 
 	/* Verify whether apbt counter works */
 	t1 = dw_apb_clocksource_read(clocksource_apbt);
-	start = native_read_tsc();
+	start = rdtsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -273,7 +273,7 @@ static int apbt_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		now = native_read_tsc();
+		now = rdtsc();
 	} while ((now - start) < 200000UL);
 
 	/* APBT is the only always on clocksource, it has to work! */
@@ -390,13 +390,13 @@ unsigned long apbt_quick_calibrate(void)
 	old = dw_apb_clocksource_read(clocksource_apbt);
 	old += loop;
 
-	t1 = native_read_tsc();
+	t1 = rdtsc();
 
 	do {
 		new = dw_apb_clocksource_read(clocksource_apbt);
 	} while (new < old);
 
-	t2 = native_read_tsc();
+	t2 = rdtsc();
 
 	shift = 5;
 	if (unlikely(loop >> shift == 0)) {
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 51af1ed..0d71cd9 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -457,7 +457,7 @@ static int lapic_next_deadline(unsigned long delta,
 {
 	u64 tsc;
 
-	tsc = native_read_tsc();
+	tsc = rdtsc();
 	wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR));
 	return 0;
 }
@@ -592,7 +592,7 @@ static void __init lapic_cal_handler(struct clock_event_device *dev)
 	unsigned long pm = acpi_pm_read_early();
 
 	if (cpu_has_tsc)
-		tsc = native_read_tsc();
+		tsc = rdtsc();
 
 	switch (lapic_cal_loops++) {
 	case 0:
@@ -1209,7 +1209,7 @@ void setup_local_APIC(void)
 	long long max_loops = cpu_khz ? cpu_khz : 1000000;
 
 	if (cpu_has_tsc)
-		tsc = native_read_tsc();
+		tsc = rdtsc();
 
 	if (disable_apic) {
 		disable_ioapic_support();
@@ -1293,7 +1293,7 @@ void setup_local_APIC(void)
 		}
 		if (queued) {
 			if (cpu_has_tsc && cpu_khz) {
-				ntsc = native_read_tsc();
+				ntsc = rdtsc();
 				max_loops = (cpu_khz << 10) - (ntsc - tsc);
 			} else
 				max_loops--;
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a69710d..51ad2af 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -125,10 +125,10 @@ static void init_amd_k6(struct cpuinfo_x86 *c)
 
 		n = K6_BUG_LOOP;
 		f_vide = vide;
-		d = native_read_tsc();
+		d = rdtsc();
 		while (n--)
 			f_vide();
-		d2 = native_read_tsc();
+		d2 = rdtsc();
 		d = d2-d;
 
 		if (d > 20*K6_BUG_LOOP)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index a5283d2..96ccecc 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -125,7 +125,7 @@ void mce_setup(struct mce *m)
 {
 	memset(m, 0, sizeof(struct mce));
 	m->cpu = m->extcpu = smp_processor_id();
-	m->tsc = native_read_tsc();
+	m->tsc = rdtsc();
 	/* We hope get_seconds stays lockless */
 	m->time = get_seconds();
 	m->cpuvendor = boot_cpu_data.x86_vendor;
@@ -1784,7 +1784,7 @@ static void collect_tscs(void *data)
 {
 	unsigned long *cpu_tsc = (unsigned long *)data;
 
-	cpu_tsc[smp_processor_id()] = native_read_tsc();
+	cpu_tsc[smp_processor_id()] = rdtsc();
 }
 
 static int mce_apei_read_done;
diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c
index 334a2a9..67315cd 100644
--- a/arch/x86/kernel/espfix_64.c
+++ b/arch/x86/kernel/espfix_64.c
@@ -110,7 +110,7 @@ static void init_espfix_random(void)
 	 */
 	if (!arch_get_random_long(&rand)) {
 		/* The constant is an arbitrary large prime */
-		rand = native_read_tsc();
+		rand = rdtsc();
 		rand *= 0xc345c6b72fd16123UL;
 	}
 
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index cc390fe..f75c590 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -735,7 +735,7 @@ static int hpet_clocksource_register(void)
 
 	/* Verify whether hpet counter works */
 	t1 = hpet_readl(HPET_COUNTER);
-	start = native_read_tsc();
+	start = rdtsc();
 
 	/*
 	 * We don't know the TSC frequency yet, but waiting for
@@ -745,7 +745,7 @@ static int hpet_clocksource_register(void)
 	 */
 	do {
 		rep_nop();
-		now = native_read_tsc();
+		now = rdtsc();
 	} while ((now - start) < 200000UL);
 
 	if (t1 == hpet_readl(HPET_COUNTER)) {
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index bd8f4d4..67efb8c 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -15,7 +15,7 @@ u64 notrace trace_clock_x86_tsc(void)
 	u64 ret;
 
 	rdtsc_barrier();
-	ret = native_read_tsc();
+	ret = rdtsc();
 
 	return ret;
 }
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index e66f5dc..21d6e04 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -248,7 +248,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu)
 
 	data = cyc2ns_write_begin(cpu);
 
-	tsc_now = native_read_tsc();
+	tsc_now = rdtsc();
 	ns_now = cycles_2_ns(tsc_now);
 
 	/*
@@ -290,7 +290,7 @@ u64 native_sched_clock(void)
 	}
 
 	/* read the Time Stamp Counter: */
-	tsc_now = native_read_tsc();
+	tsc_now = rdtsc();
 
 	/* return the value in ns */
 	return cycles_2_ns(tsc_now);
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 954e98a..2f0ade4 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1172,7 +1172,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu)
 
 	tsc_deadline = apic->lapic_timer.expired_tscdeadline;
 	apic->lapic_timer.expired_tscdeadline = 0;
-	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+	guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
 	trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline);
 
 	/* __delay is delay_tsc whenever the hardware has TSC, thus always.  */
@@ -1240,7 +1240,7 @@ static void start_apic_timer(struct kvm_lapic *apic)
 		local_irq_save(flags);
 
 		now = apic->lapic_timer.timer.base->get_time();
-		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, native_read_tsc());
+		guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc());
 		if (likely(tscdeadline > guest_tsc)) {
 			ns = (tscdeadline - guest_tsc) * 1000000ULL;
 			do_div(ns, this_tsc_khz);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 602b974..8dfbad7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1080,7 +1080,7 @@ static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
 	u64 tsc;
 
-	tsc = svm_scale_tsc(vcpu, native_read_tsc());
+	tsc = svm_scale_tsc(vcpu, rdtsc());
 
 	return target_tsc - tsc;
 }
@@ -3079,7 +3079,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 	switch (msr_info->index) {
 	case MSR_IA32_TSC: {
 		msr_info->data = svm->vmcb->control.tsc_offset +
-			svm_scale_tsc(vcpu, native_read_tsc());
+			svm_scale_tsc(vcpu, rdtsc());
 
 		break;
 	}
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 4fa1cca..10d69a6 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -2236,7 +2236,7 @@ static u64 guest_read_tsc(void)
 {
 	u64 host_tsc, tsc_offset;
 
-	host_tsc = native_read_tsc();
+	host_tsc = rdtsc();
 	tsc_offset = vmcs_read64(TSC_OFFSET);
 	return host_tsc + tsc_offset;
 }
@@ -2317,7 +2317,7 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho
 
 static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 {
-	return target_tsc - native_read_tsc();
+	return target_tsc - rdtsc();
 }
 
 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index f771058..dfa9713 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1455,7 +1455,7 @@ static cycle_t read_tsc(void)
 	 * but no one has ever seen it happen.
 	 */
 	rdtsc_barrier();
-	ret = (cycle_t)native_read_tsc();
+	ret = (cycle_t)rdtsc();
 
 	last = pvclock_gtod_data.clock.cycle_last;
 
@@ -1646,7 +1646,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		return 1;
 	}
 	if (!use_master_clock) {
-		host_tsc = native_read_tsc();
+		host_tsc = rdtsc();
 		kernel_ns = get_kernel_ns();
 	}
 
@@ -2810,7 +2810,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
 		s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 :
-				native_read_tsc() - vcpu->arch.last_host_tsc;
+				rdtsc() - vcpu->arch.last_host_tsc;
 		if (tsc_delta < 0)
 			mark_tsc_unstable("KVM discovered backwards TSC");
 		if (check_tsc_unstable()) {
@@ -2838,7 +2838,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 {
 	kvm_x86_ops->vcpu_put(vcpu);
 	kvm_put_guest_fpu(vcpu);
-	vcpu->arch.last_host_tsc = native_read_tsc();
+	vcpu->arch.last_host_tsc = rdtsc();
 }
 
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
@@ -6623,7 +6623,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		hw_breakpoint_restore();
 
 	vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu,
-							   native_read_tsc());
+							   rdtsc());
 
 	vcpu->mode = OUTSIDE_GUEST_MODE;
 	smp_wmb();
@@ -7437,7 +7437,7 @@ int kvm_arch_hardware_enable(void)
 	if (ret != 0)
 		return ret;
 
-	local_tsc = native_read_tsc();
+	local_tsc = rdtsc();
 	stable = !check_tsc_unstable();
 	list_for_each_entry(kvm, &vm_list, vm_list) {
 		kvm_for_each_vcpu(i, vcpu, kvm) {
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 35115f3..f24bc59 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -55,10 +55,10 @@ static void delay_tsc(unsigned long __loops)
 	preempt_disable();
 	cpu = smp_processor_id();
 	rdtsc_barrier();
-	bclock = native_read_tsc();
+	bclock = rdtsc();
 	for (;;) {
 		rdtsc_barrier();
-		now = native_read_tsc();
+		now = rdtsc();
 		if ((now - bclock) >= loops)
 			break;
 
@@ -80,7 +80,7 @@ static void delay_tsc(unsigned long __loops)
 			loops -= (now - bclock);
 			cpu = smp_processor_id();
 			rdtsc_barrier();
-			bclock = native_read_tsc();
+			bclock = rdtsc();
 		}
 	}
 	preempt_enable();
@@ -100,7 +100,7 @@ void use_tsc_delay(void)
 int read_current_timer(unsigned long *timer_val)
 {
 	if (delay_fn == delay_tsc) {
-		*timer_val = native_read_tsc();
+		*timer_val = rdtsc();
 		return 0;
 	}
 	return -1;
diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
index 15ada47..7c56d7e 100644
--- a/drivers/cpufreq/intel_pstate.c
+++ b/drivers/cpufreq/intel_pstate.c
@@ -765,7 +765,7 @@ static inline void intel_pstate_sample(struct cpudata *cpu)
 	local_irq_save(flags);
 	rdmsrl(MSR_IA32_APERF, aperf);
 	rdmsrl(MSR_IA32_MPERF, mperf);
-	tsc = native_read_tsc();
+	tsc = rdtsc();
 	local_irq_restore(flags);
 
 	cpu->last_sample_time = cpu->sample.time;
diff --git a/drivers/input/gameport/gameport.c b/drivers/input/gameport/gameport.c
index abc0cb2..4a2a9e3 100644
--- a/drivers/input/gameport/gameport.c
+++ b/drivers/input/gameport/gameport.c
@@ -149,9 +149,9 @@ static int old_gameport_measure_speed(struct gameport *gameport)
 
 	for(i = 0; i < 50; i++) {
 		local_irq_save(flags);
-		t1 = native_read_tsc();
+		t1 = rdtsc();
 		for (t = 0; t < 50; t++) gameport_read(gameport);
-		t2 = native_read_tsc();
+		t2 = rdtsc();
 		local_irq_restore(flags);
 		udelay(i * 10);
 		if (t2 - t1 < tx) tx = t2 - t1;
diff --git a/drivers/input/joystick/analog.c b/drivers/input/joystick/analog.c
index f871b4f..6f8b084 100644
--- a/drivers/input/joystick/analog.c
+++ b/drivers/input/joystick/analog.c
@@ -143,7 +143,7 @@ struct analog_port {
 
 #include <linux/i8253.h>
 
-#define GET_TIME(x)	do { if (cpu_has_tsc) x = (unsigned int)native_read_tsc(); else x = get_time_pit(); } while (0)
+#define GET_TIME(x)	do { if (cpu_has_tsc) x = (unsigned int)rdtsc(); else x = get_time_pit(); } while (0)
 #define DELTA(x,y)	(cpu_has_tsc ? ((y) - (x)) : ((x) - (y) + ((x) < (y) ? PIT_TICK_RATE / HZ : 0)))
 #define TIME_NAME	(cpu_has_tsc?"TSC":"PIT")
 static unsigned int get_time_pit(void)
@@ -160,7 +160,7 @@ static unsigned int get_time_pit(void)
         return count;
 }
 #elif defined(__x86_64__)
-#define GET_TIME(x)	do { x = (unsigned int)native_read_tsc(); } while (0)
+#define GET_TIME(x)	do { x = (unsigned int)rdtsc(); } while (0)
 #define DELTA(x,y)	((y)-(x))
 #define TIME_NAME	"TSC"
 #elif defined(__alpha__) || defined(CONFIG_MN10300) || defined(CONFIG_ARM) || defined(CONFIG_ARM64) || defined(CONFIG_TILE)
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 44e5c3b..72c9f1f 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -638,7 +638,7 @@ static int receive(struct net_device *dev, int cnt)
 #define GETTICK(x)                                                \
 ({                                                                \
 	if (cpu_has_tsc)                                          \
-		x = (unsigned int)native_read_tsc();		  \
+		x = (unsigned int)rdtsc();		  \
 })
 #else /* __i386__ */
 #define GETTICK(x)
diff --git a/drivers/thermal/intel_powerclamp.c b/drivers/thermal/intel_powerclamp.c
index ab13448..2ac0c70 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -340,7 +340,7 @@ static bool powerclamp_adjust_controls(unsigned int target_ratio,
 
 	/* check result for the last window */
 	msr_now = pkg_state_counter();
-	tsc_now = native_read_tsc();
+	tsc_now = rdtsc();
 
 	/* calculate pkg cstate vs tsc ratio */
 	if (!msr_last || !tsc_last)
@@ -482,7 +482,7 @@ static void poll_pkg_cstate(struct work_struct *dummy)
 	u64 val64;
 
 	msr_now = pkg_state_counter();
-	tsc_now = native_read_tsc();
+	tsc_now = rdtsc();
 	jiffies_now = jiffies;
 
 	/* calculate pkg cstate vs tsc ratio */
diff --git a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
index f02b0c0..6ff8383 100644
--- a/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
+++ b/tools/power/cpupower/debug/kernel/cpufreq-test_tsc.c
@@ -81,11 +81,11 @@ static int __init cpufreq_test_tsc(void)
 
 	printk(KERN_DEBUG "start--> \n");
 	then = read_pmtmr();
-	then_tsc = native_read_tsc();
+	then_tsc = rdtsc();
 	for (i=0;i<20;i++) {
 		mdelay(100);
 		now = read_pmtmr();
-		now_tsc = native_read_tsc();
+		now_tsc = rdtsc();
 		diff = (now - then) & 0xFFFFFF;
 		diff_tsc = now_tsc - then_tsc;
 		printk(KERN_DEBUG "t1: %08u t2: %08u diff_pmtmr: %08u diff_tsc: %016llu\n", then, now, diff, diff_tsc);
-- 
cgit v0.10.2


From 03b9730b769fc4d87e40f6104f4c5b2e43889f19 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:08 +0200
Subject: x86/asm/tsc: Add rdtsc_ordered() and use it in trivial call sites

rdtsc_barrier(); rdtsc() is an unnecessary mouthful and requires
more thought than should be necessary. Add an rdtsc_ordered()
helper and replace the trivial call sites with it.

This should not change generated code. The duplication of the
fence asm is temporary.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/dddbf98a2af53312e9aa73a5a2b1622fe5d6f52b.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 0340d93..ca94fa6 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -175,20 +175,8 @@ static notrace cycle_t vread_pvclock(int *mode)
 
 notrace static cycle_t vread_tsc(void)
 {
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)rdtsc();
-
-	last = gtod->cycle_last;
+	cycle_t ret = (cycle_t)rdtsc_ordered();
+	u64 last = gtod->cycle_last;
 
 	if (likely(ret >= last))
 		return ret;
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index ff0c120..02bdd6c 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -127,6 +127,32 @@ static __always_inline unsigned long long rdtsc(void)
 	return EAX_EDX_VAL(val, low, high);
 }
 
+/**
+ * rdtsc_ordered() - read the current TSC in program order
+ *
+ * rdtsc_ordered() returns the result of RDTSC as a 64-bit integer.
+ * It is ordered like a load to a global in-memory counter.  It should
+ * be impossible to observe non-monotonic rdtsc_unordered() behavior
+ * across multiple CPUs as long as the TSC is synced.
+ */
+static __always_inline unsigned long long rdtsc_ordered(void)
+{
+	/*
+	 * The RDTSC instruction is not ordered relative to memory
+	 * access.  The Intel SDM and the AMD APM are both vague on this
+	 * point, but empirically an RDTSC instruction can be
+	 * speculatively executed before prior loads.  An RDTSC
+	 * immediately after an appropriate barrier appears to be
+	 * ordered as a normal load, that is, it provides the same
+	 * ordering guarantees as reading from a global memory location
+	 * that some other imaginary CPU is updating continuously with a
+	 * time stamp.
+	 */
+	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+			  "lfence", X86_FEATURE_LFENCE_RDTSC);
+	return rdtsc();
+}
+
 static inline unsigned long long native_read_pmc(int counter)
 {
 	DECLARE_ARGS(val, low, high);
diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c
index 67efb8c..80bb24d 100644
--- a/arch/x86/kernel/trace_clock.c
+++ b/arch/x86/kernel/trace_clock.c
@@ -12,10 +12,5 @@
  */
 u64 notrace trace_clock_x86_tsc(void)
 {
-	u64 ret;
-
-	rdtsc_barrier();
-	ret = rdtsc();
-
-	return ret;
+	return rdtsc_ordered();
 }
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index dfa9713..8d73ec8 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -1444,20 +1444,8 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
 
 static cycle_t read_tsc(void)
 {
-	cycle_t ret;
-	u64 last;
-
-	/*
-	 * Empirically, a fence (of type that depends on the CPU)
-	 * before rdtsc is enough to ensure that rdtsc is ordered
-	 * with respect to loads.  The various CPU manuals are unclear
-	 * as to whether rdtsc can be reordered with later loads,
-	 * but no one has ever seen it happen.
-	 */
-	rdtsc_barrier();
-	ret = (cycle_t)rdtsc();
-
-	last = pvclock_gtod_data.clock.cycle_last;
+	cycle_t ret = (cycle_t)rdtsc_ordered();
+	u64 last = pvclock_gtod_data.clock.cycle_last;
 
 	if (likely(ret >= last))
 		return ret;
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index f24bc59..4453d52 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -54,11 +54,9 @@ static void delay_tsc(unsigned long __loops)
 
 	preempt_disable();
 	cpu = smp_processor_id();
-	rdtsc_barrier();
-	bclock = rdtsc();
+	bclock = rdtsc_ordered();
 	for (;;) {
-		rdtsc_barrier();
-		now = rdtsc();
+		now = rdtsc_ordered();
 		if ((now - bclock) >= loops)
 			break;
 
@@ -79,8 +77,7 @@ static void delay_tsc(unsigned long __loops)
 		if (unlikely(cpu != smp_processor_id())) {
 			loops -= (now - bclock);
 			cpu = smp_processor_id();
-			rdtsc_barrier();
-			bclock = rdtsc();
+			bclock = rdtsc_ordered();
 		}
 	}
 	preempt_enable();
-- 
cgit v0.10.2


From eee6946e44510b61c35cf754f5505537c7a8eb77 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:09 +0200
Subject: x86/asm/tsc/sync: Use rdtsc_ordered() in check_tsc_warp() and drop
 extra barriers

Using get_cycles was unnecessary: check_tsc_warp() is not called
on TSC-less systems. Replace rdtsc_barrier(); get_cycles() with
rdtsc_ordered().

While we're at it, make the somewhat more dangerous change of
removing barrier_before_rdtsc after RDTSC in the TSC warp check
code. This should be okay, though -- the vDSO TSC code doesn't
have that barrier, so, if removing the barrier from the warp
check would cause us to detect a warp that we otherwise wouldn't
detect, then we have a genuine bug.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/387c4c3a75f875bcde6cd68cee013273a744f364.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index dd8d079..78083bf 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -39,16 +39,15 @@ static cycles_t max_warp;
 static int nr_warps;
 
 /*
- * TSC-warp measurement loop running on both CPUs:
+ * TSC-warp measurement loop running on both CPUs.  This is not called
+ * if there is no TSC.
  */
 static void check_tsc_warp(unsigned int timeout)
 {
 	cycles_t start, now, prev, end;
 	int i;
 
-	rdtsc_barrier();
-	start = get_cycles();
-	rdtsc_barrier();
+	start = rdtsc_ordered();
 	/*
 	 * The measurement runs for 'timeout' msecs:
 	 */
@@ -63,9 +62,7 @@ static void check_tsc_warp(unsigned int timeout)
 		 */
 		arch_spin_lock(&sync_lock);
 		prev = last_tsc;
-		rdtsc_barrier();
-		now = get_cycles();
-		rdtsc_barrier();
+		now = rdtsc_ordered();
 		last_tsc = now;
 		arch_spin_unlock(&sync_lock);
 
@@ -126,7 +123,7 @@ void check_tsc_sync_source(int cpu)
 
 	/*
 	 * No need to check if we already know that the TSC is not
-	 * synchronized:
+	 * synchronized or if we have no TSC.
 	 */
 	if (unsynchronized_tsc())
 		return;
@@ -190,6 +187,7 @@ void check_tsc_sync_target(void)
 {
 	int cpus = 2;
 
+	/* Also aborts if there is no TSC. */
 	if (unsynchronized_tsc() || tsc_clocksource_reliable)
 		return;
 
-- 
cgit v0.10.2


From 27c634054a3155e1d9a02f0e362e4f4ff8d28ee7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:10 +0200
Subject: x86/asm/tsc: Use rdtsc_ordered() in read_tsc() instead of
 get_cycles()

There are two logical changes here.  First, this removes a check
for cpu_has_tsc.  That check is unnecessary, as we don't
register the TSC as a clocksource on systems that have no TSC.

Second, it adds a barrier, thus preventing observable
non-monotonicity.

I suspect that the missing barrier was never a problem in
practice because system calls themselves were heavy enough
barriers to prevent user code from observing time warps due to
speculation. (Without the corresponding barrier in the vDSO,
however, non-monotonicity is easy to detect.)

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/c6ff621a053127a65b70f175443578db7a0711be.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 21d6e04..451bade0 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -961,7 +961,7 @@ static struct clocksource clocksource_tsc;
  */
 static cycle_t read_tsc(struct clocksource *cs)
 {
-	return (cycle_t)get_cycles();
+	return (cycle_t)rdtsc_ordered();
 }
 
 /*
-- 
cgit v0.10.2


From 502dfeff239e8313bfbe906ca0a1a6827ac8481b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:11 +0200
Subject: x86/asm/tsc, x86/kvm: Drop open-coded barrier and use rdtsc_ordered()
 in kvmclock

__pvclock_read_cycles() used to have two barriers, one of which was unnecessary,
which got removed after an initial version of this patch was sent.

But the barrier is still open-coded unnecessarily - get rid of
that barrier and clean up the code by just using rdtsc_ordered().

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/678981cc4761fb38a793c217c9cac42503cf3719.1434501121.git.luto@kernel.org
[ Ported it to v4.2-rc1. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index 5c490db..7a6bed5 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -62,7 +62,7 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 static __always_inline
 u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
 {
-	u64 delta = rdtsc() - src->tsc_timestamp;
+	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
 	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
 				   src->tsc_shift);
 }
@@ -76,13 +76,7 @@ unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
 	u8 ret_flags;
 
 	version = src->version;
-	/* Note: emulated platforms which do not advertise SSE2 support
-	 * result in kvmclock not using the necessary RDTSC barriers.
-	 * Without barriers, it is possible that RDTSC instruction reads from
-	 * the time stamp counter outside rdtsc_barrier protected section
-	 * below, resulting in violation of monotonicity.
-	 */
-	rdtsc_barrier();
+
 	offset = pvclock_get_nsec_offset(src);
 	ret = src->system_time + offset;
 	ret_flags = src->flags;
-- 
cgit v0.10.2


From bb8dd96032fc63babfc8b378a37dd7681eeec326 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 25 Jun 2015 18:44:12 +0200
Subject: x86/asm/tsc: Remove rdtsc_barrier()

All callers have been converted to rdtsc_ordered().

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Richard Weinberger <richard@nod.at>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/9baa4ae9a1e7c7c282f9cb2f15bb6bf5c2004032.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index e51a8f8..818cb87 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -91,15 +91,4 @@ do {									\
 #define smp_mb__before_atomic()	barrier()
 #define smp_mb__after_atomic()	barrier()
 
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- */
-static __always_inline void rdtsc_barrier(void)
-{
-	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
-			  "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
 #endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index b9531d3..755481f 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -45,17 +45,4 @@
 #define read_barrier_depends()		do { } while (0)
 #define smp_read_barrier_depends()	do { } while (0)
 
-/*
- * Stop RDTSC speculation. This is needed when you need to use RDTSC
- * (or get_cycles or vread that possibly accesses the TSC) in a defined
- * code region.
- *
- * (Could use an alternative three way for this if there was one.)
- */
-static inline void rdtsc_barrier(void)
-{
-	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
-			  "lfence", X86_FEATURE_LFENCE_RDTSC);
-}
-
 #endif
-- 
cgit v0.10.2


From 5a33fcb8d991209bac0a266ab499e4b53d116cdd Mon Sep 17 00:00:00 2001
From: George Spelvin <linux@horizon.com>
Date: Thu, 25 Jun 2015 18:44:13 +0200
Subject: x86/asm/tsc: Save an instruction in DECLARE_ARGS users

Before, the code to do RDTSC looked like:

  rdtsc
  shl $0x20, %rdx
  mov %eax, %eax
  or  %rdx, %rax

The "mov %eax, %eax" is required to clear the high 32 bits of RAX.

By declaring low and high as 64-bit variables, the code is
simplified to:

  rdtsc
  shl $0x20,%rdx
  or  %rdx,%rax

Yes, it's a 2-byte instruction that's not on a critical path,
but there are principles to be upheld.

Every user of EAX_EDX_RET has been checked. I tried to check
users of EAX_EDX_ARGS, but there weren't any, so I deleted it to
be safe.

( There's no benefit to making "high" 64 bits, but it was the
  simplest way to proceed. )

Signed-off-by: George Spelvin <linux@horizon.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: jacob.jun.pan@linux.intel.com
Link: http://lkml.kernel.org/r/20150618075906.4615.qmail@ns.horizon.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 02bdd6c..131eec2 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -47,14 +47,13 @@ static inline unsigned long long native_read_tscp(unsigned int *aux)
  * it means rax *or* rdx.
  */
 #ifdef CONFIG_X86_64
-#define DECLARE_ARGS(val, low, high)	unsigned low, high
-#define EAX_EDX_VAL(val, low, high)	((low) | ((u64)(high) << 32))
-#define EAX_EDX_ARGS(val, low, high)	"a" (low), "d" (high)
+/* Using 64-bit values saves one instruction clearing the high half of low */
+#define DECLARE_ARGS(val, low, high)	unsigned long low, high
+#define EAX_EDX_VAL(val, low, high)	((low) | (high) << 32)
 #define EAX_EDX_RET(val, low, high)	"=a" (low), "=d" (high)
 #else
 #define DECLARE_ARGS(val, low, high)	unsigned long long val
 #define EAX_EDX_VAL(val, low, high)	(val)
-#define EAX_EDX_ARGS(val, low, high)	"A" (val)
 #define EAX_EDX_RET(val, low, high)	"=A" (val)
 #endif
 
-- 
cgit v0.10.2


From c0bfd26e136cafc2b23c16225b4d7b1e14de81c1 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:10 -0400
Subject: x86/compat: Move copy_siginfo_*_user32() to signal_compat.c

copy_siginfo_to_user32() and copy_siginfo_from_user32() are used
by both the 32-bit compat and x32 ABIs.  Move them to
signal_compat.c.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-2-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index ae3a29a..a0a19b7 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -34,99 +34,6 @@
 #include <asm/sys_ia32.h>
 #include <asm/smap.h>
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
-{
-	int err = 0;
-	bool ia32 = test_thread_flag(TIF_IA32);
-
-	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
-		return -EFAULT;
-
-	put_user_try {
-		/* If you change siginfo_t structure, please make sure that
-		   this code is fixed accordingly.
-		   It should never copy any pad contained in the structure
-		   to avoid security leaks, but must copy the generic
-		   3 ints plus the relevant union member.  */
-		put_user_ex(from->si_signo, &to->si_signo);
-		put_user_ex(from->si_errno, &to->si_errno);
-		put_user_ex((short)from->si_code, &to->si_code);
-
-		if (from->si_code < 0) {
-			put_user_ex(from->si_pid, &to->si_pid);
-			put_user_ex(from->si_uid, &to->si_uid);
-			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
-		} else {
-			/*
-			 * First 32bits of unions are always present:
-			 * si_pid === si_band === si_tid === si_addr(LS half)
-			 */
-			put_user_ex(from->_sifields._pad[0],
-					  &to->_sifields._pad[0]);
-			switch (from->si_code >> 16) {
-			case __SI_FAULT >> 16:
-				break;
-			case __SI_SYS >> 16:
-				put_user_ex(from->si_syscall, &to->si_syscall);
-				put_user_ex(from->si_arch, &to->si_arch);
-				break;
-			case __SI_CHLD >> 16:
-				if (ia32) {
-					put_user_ex(from->si_utime, &to->si_utime);
-					put_user_ex(from->si_stime, &to->si_stime);
-				} else {
-					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
-					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
-				}
-				put_user_ex(from->si_status, &to->si_status);
-				/* FALL THROUGH */
-			default:
-			case __SI_KILL >> 16:
-				put_user_ex(from->si_uid, &to->si_uid);
-				break;
-			case __SI_POLL >> 16:
-				put_user_ex(from->si_fd, &to->si_fd);
-				break;
-			case __SI_TIMER >> 16:
-				put_user_ex(from->si_overrun, &to->si_overrun);
-				put_user_ex(ptr_to_compat(from->si_ptr),
-					    &to->si_ptr);
-				break;
-				 /* This is not generated by the kernel as of now.  */
-			case __SI_RT >> 16:
-			case __SI_MESGQ >> 16:
-				put_user_ex(from->si_uid, &to->si_uid);
-				put_user_ex(from->si_int, &to->si_int);
-				break;
-			}
-		}
-	} put_user_catch(err);
-
-	return err;
-}
-
-int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
-{
-	int err = 0;
-	u32 ptr32;
-
-	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
-		return -EFAULT;
-
-	get_user_try {
-		get_user_ex(to->si_signo, &from->si_signo);
-		get_user_ex(to->si_errno, &from->si_errno);
-		get_user_ex(to->si_code, &from->si_code);
-
-		get_user_ex(to->si_pid, &from->si_pid);
-		get_user_ex(to->si_uid, &from->si_uid);
-		get_user_ex(ptr32, &from->si_ptr);
-		to->si_ptr = compat_ptr(ptr32);
-	} get_user_catch(err);
-
-	return err;
-}
-
 /*
  * Do a signal return; undo the signal stack.
  */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 0f15af4..dc19730 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -23,6 +23,7 @@ KASAN_SANITIZE_dumpstack_$(BITS).o := n
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o
+obj-$(CONFIG_COMPAT)	+= signal_compat.o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c
new file mode 100644
index 0000000..dc3c0b1
--- /dev/null
+++ b/arch/x86/kernel/signal_compat.c
@@ -0,0 +1,95 @@
+#include <linux/compat.h>
+#include <linux/uaccess.h>
+
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from)
+{
+	int err = 0;
+	bool ia32 = test_thread_flag(TIF_IA32);
+
+	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	put_user_try {
+		/* If you change siginfo_t structure, please make sure that
+		   this code is fixed accordingly.
+		   It should never copy any pad contained in the structure
+		   to avoid security leaks, but must copy the generic
+		   3 ints plus the relevant union member.  */
+		put_user_ex(from->si_signo, &to->si_signo);
+		put_user_ex(from->si_errno, &to->si_errno);
+		put_user_ex((short)from->si_code, &to->si_code);
+
+		if (from->si_code < 0) {
+			put_user_ex(from->si_pid, &to->si_pid);
+			put_user_ex(from->si_uid, &to->si_uid);
+			put_user_ex(ptr_to_compat(from->si_ptr), &to->si_ptr);
+		} else {
+			/*
+			 * First 32bits of unions are always present:
+			 * si_pid === si_band === si_tid === si_addr(LS half)
+			 */
+			put_user_ex(from->_sifields._pad[0],
+					  &to->_sifields._pad[0]);
+			switch (from->si_code >> 16) {
+			case __SI_FAULT >> 16:
+				break;
+			case __SI_SYS >> 16:
+				put_user_ex(from->si_syscall, &to->si_syscall);
+				put_user_ex(from->si_arch, &to->si_arch);
+				break;
+			case __SI_CHLD >> 16:
+				if (ia32) {
+					put_user_ex(from->si_utime, &to->si_utime);
+					put_user_ex(from->si_stime, &to->si_stime);
+				} else {
+					put_user_ex(from->si_utime, &to->_sifields._sigchld_x32._utime);
+					put_user_ex(from->si_stime, &to->_sifields._sigchld_x32._stime);
+				}
+				put_user_ex(from->si_status, &to->si_status);
+				/* FALL THROUGH */
+			default:
+			case __SI_KILL >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				break;
+			case __SI_POLL >> 16:
+				put_user_ex(from->si_fd, &to->si_fd);
+				break;
+			case __SI_TIMER >> 16:
+				put_user_ex(from->si_overrun, &to->si_overrun);
+				put_user_ex(ptr_to_compat(from->si_ptr),
+					    &to->si_ptr);
+				break;
+				 /* This is not generated by the kernel as of now.  */
+			case __SI_RT >> 16:
+			case __SI_MESGQ >> 16:
+				put_user_ex(from->si_uid, &to->si_uid);
+				put_user_ex(from->si_int, &to->si_int);
+				break;
+			}
+		}
+	} put_user_catch(err);
+
+	return err;
+}
+
+int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
+{
+	int err = 0;
+	u32 ptr32;
+
+	if (!access_ok(VERIFY_READ, from, sizeof(compat_siginfo_t)))
+		return -EFAULT;
+
+	get_user_try {
+		get_user_ex(to->si_signo, &from->si_signo);
+		get_user_ex(to->si_errno, &from->si_errno);
+		get_user_ex(to->si_code, &from->si_code);
+
+		get_user_ex(to->si_pid, &from->si_pid);
+		get_user_ex(to->si_uid, &from->si_uid);
+		get_user_ex(ptr32, &from->si_ptr);
+		to->si_ptr = compat_ptr(ptr32);
+	} get_user_catch(err);
+
+	return err;
+}
-- 
cgit v0.10.2


From b2e02b820d5b42479195b89d3d73f31bcedb264e Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:11 -0400
Subject: x86/compat: Make mmap_is_ia32() common compat

TIF_ADDR32 is set for both ia32 and x32 tasks, so change from
CONFIG_IA32_EMULATION to CONFIG_COMPAT.  Use config_enabled()
to make the function more readable.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-3-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index f161c18..180b6fe 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -344,14 +344,9 @@ extern int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
  */
 static inline int mmap_is_ia32(void)
 {
-#ifdef CONFIG_X86_32
-	return 1;
-#endif
-#ifdef CONFIG_IA32_EMULATION
-	if (test_thread_flag(TIF_ADDR32))
-		return 1;
-#endif
-	return 0;
+	return config_enabled(CONFIG_X86_32) ||
+	       (config_enabled(CONFIG_COMPAT) &&
+		test_thread_flag(TIF_ADDR32));
 }
 
 /* Do not change the values. See get_align_mask() */
-- 
cgit v0.10.2


From b829d1be20ab51a3b76ec003118c9260d1fa424e Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:12 -0400
Subject: x86/compat: Move ucontext_x32 to sigframe.h

ia32.h should only contain the code for 32-bit compatability.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-4-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/ia32.h b/arch/x86/include/asm/ia32.h
index d0e8e01..2801976 100644
--- a/arch/x86/include/asm/ia32.h
+++ b/arch/x86/include/asm/ia32.h
@@ -22,15 +22,6 @@ struct ucontext_ia32 {
 	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
 };
 
-struct ucontext_x32 {
-	unsigned int	  uc_flags;
-	unsigned int 	  uc_link;
-	compat_stack_t	  uc_stack;
-	unsigned int	  uc__pad0;     /* needed for alignment */
-	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */
-	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
-};
-
 /* This matches struct stat64 in glibc2.2, hence the absolutely
  * insane amounts of padding around dev_t's.
  */
diff --git a/arch/x86/include/asm/sigframe.h b/arch/x86/include/asm/sigframe.h
index 7c7c27c..1f3175b 100644
--- a/arch/x86/include/asm/sigframe.h
+++ b/arch/x86/include/asm/sigframe.h
@@ -4,6 +4,7 @@
 #include <asm/sigcontext.h>
 #include <asm/siginfo.h>
 #include <asm/ucontext.h>
+#include <linux/compat.h>
 
 #ifdef CONFIG_X86_32
 #define sigframe_ia32		sigframe
@@ -69,6 +70,15 @@ struct rt_sigframe {
 
 #ifdef CONFIG_X86_X32_ABI
 
+struct ucontext_x32 {
+	unsigned int	  uc_flags;
+	unsigned int 	  uc_link;
+	compat_stack_t	  uc_stack;
+	unsigned int	  uc__pad0;     /* needed for alignment */
+	struct sigcontext uc_mcontext;  /* the 64-bit sigcontext type */
+	compat_sigset_t	  uc_sigmask;	/* mask last for extensibility */
+};
+
 struct rt_sigframe_x32 {
 	u64 pretcode;
 	struct ucontext_x32 uc;
-- 
cgit v0.10.2


From 7da770785f9740af1cb24b8fd63075543bd00711 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:13 -0400
Subject: x86/compat: Rename 'start_thread_ia32' to 'compat_start_thread'

This function is shared between the 32-bit compat and x32 ABIs.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-5-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 180b6fe..2bf67c0 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -187,8 +187,8 @@ static inline void elf_common_init(struct thread_struct *t,
 #define	COMPAT_ELF_PLAT_INIT(regs, load_addr)		\
 	elf_common_init(&current->thread, regs, __USER_DS)
 
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
-#define compat_start_thread start_thread_ia32
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp);
+#define compat_start_thread compat_start_thread
 
 void set_personality_ia32(bool);
 #define COMPAT_SET_PERSONALITY(ex)			\
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 71d7849..0831ba3 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -248,8 +248,8 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
 			    __USER_CS, __USER_DS, 0);
 }
 
-#ifdef CONFIG_IA32_EMULATION
-void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
+#ifdef CONFIG_COMPAT
+void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
 {
 	start_thread_common(regs, new_ip, new_sp,
 			    test_thread_flag(TIF_X32)
-- 
cgit v0.10.2


From 601275c3e04c43b3b34237ab36c27fc1cfb8a189 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:14 -0400
Subject: x86/compat: Factor out ia32 compat code from compat_arch_ptrace()

Move the ia32-specific code in compat_arch_ptrace() into its
own function.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-6-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 9be72bc..7155957 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1123,6 +1123,73 @@ static int genregs32_set(struct task_struct *target,
 	return ret;
 }
 
+static long ia32_arch_ptrace(struct task_struct *child, compat_long_t request,
+			     compat_ulong_t caddr, compat_ulong_t cdata)
+{
+	unsigned long addr = caddr;
+	unsigned long data = cdata;
+	void __user *datap = compat_ptr(data);
+	int ret;
+	__u32 val;
+
+	switch (request) {
+	case PTRACE_PEEKUSR:
+		ret = getreg32(child, addr, &val);
+		if (ret == 0)
+			ret = put_user(val, (__u32 __user *)datap);
+		break;
+
+	case PTRACE_POKEUSR:
+		ret = putreg32(child, addr, data);
+		break;
+
+	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_GENERAL,
+					   0, sizeof(struct user_regs_struct32),
+					   datap);
+
+	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_GENERAL, 0,
+					     sizeof(struct user_regs_struct32),
+					     datap);
+
+	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_FP, 0,
+					   sizeof(struct user_i387_ia32_struct),
+					   datap);
+
+	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
+		return copy_regset_from_user(
+			child, &user_x86_32_view, REGSET_FP,
+			0, sizeof(struct user_i387_ia32_struct), datap);
+
+	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
+		return copy_regset_to_user(child, &user_x86_32_view,
+					   REGSET_XFP, 0,
+					   sizeof(struct user32_fxsr_struct),
+					   datap);
+
+	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
+		return copy_regset_from_user(child, &user_x86_32_view,
+					     REGSET_XFP, 0,
+					     sizeof(struct user32_fxsr_struct),
+					     datap);
+
+	case PTRACE_GET_THREAD_AREA:
+	case PTRACE_SET_THREAD_AREA:
+		return arch_ptrace(child, request, addr, data);
+
+	default:
+		return compat_ptrace_request(child, request, addr, data);
+	}
+
+	return ret;
+}
+#endif /* CONFIG_IA32_EMULATION */
+
 #ifdef CONFIG_X86_X32_ABI
 static long x32_arch_ptrace(struct task_struct *child,
 			    compat_long_t request, compat_ulong_t caddr,
@@ -1211,78 +1278,21 @@ static long x32_arch_ptrace(struct task_struct *child,
 }
 #endif
 
+#ifdef CONFIG_COMPAT
 long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
 			compat_ulong_t caddr, compat_ulong_t cdata)
 {
-	unsigned long addr = caddr;
-	unsigned long data = cdata;
-	void __user *datap = compat_ptr(data);
-	int ret;
-	__u32 val;
-
 #ifdef CONFIG_X86_X32_ABI
 	if (!is_ia32_task())
 		return x32_arch_ptrace(child, request, caddr, cdata);
 #endif
-
-	switch (request) {
-	case PTRACE_PEEKUSR:
-		ret = getreg32(child, addr, &val);
-		if (ret == 0)
-			ret = put_user(val, (__u32 __user *)datap);
-		break;
-
-	case PTRACE_POKEUSR:
-		ret = putreg32(child, addr, data);
-		break;
-
-	case PTRACE_GETREGS:	/* Get all gp regs from the child. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_GENERAL,
-					   0, sizeof(struct user_regs_struct32),
-					   datap);
-
-	case PTRACE_SETREGS:	/* Set all gp regs in the child. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_GENERAL, 0,
-					     sizeof(struct user_regs_struct32),
-					     datap);
-
-	case PTRACE_GETFPREGS:	/* Get the child FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_FP, 0,
-					   sizeof(struct user_i387_ia32_struct),
-					   datap);
-
-	case PTRACE_SETFPREGS:	/* Set the child FPU state. */
-		return copy_regset_from_user(
-			child, &user_x86_32_view, REGSET_FP,
-			0, sizeof(struct user_i387_ia32_struct), datap);
-
-	case PTRACE_GETFPXREGS:	/* Get the child extended FPU state. */
-		return copy_regset_to_user(child, &user_x86_32_view,
-					   REGSET_XFP, 0,
-					   sizeof(struct user32_fxsr_struct),
-					   datap);
-
-	case PTRACE_SETFPXREGS:	/* Set the child extended FPU state. */
-		return copy_regset_from_user(child, &user_x86_32_view,
-					     REGSET_XFP, 0,
-					     sizeof(struct user32_fxsr_struct),
-					     datap);
-
-	case PTRACE_GET_THREAD_AREA:
-	case PTRACE_SET_THREAD_AREA:
-		return arch_ptrace(child, request, addr, data);
-
-	default:
-		return compat_ptrace_request(child, request, addr, data);
-	}
-
-	return ret;
+#ifdef CONFIG_IA32_EMULATION
+	return ia32_arch_ptrace(child, request, caddr, cdata);
+#else
+	return 0;
+#endif
 }
-
-#endif	/* CONFIG_IA32_EMULATION */
+#endif	/* CONFIG_COMPAT */
 
 #ifdef CONFIG_X86_64
 
-- 
cgit v0.10.2


From ab8b82ee6dad7c9c257f450d14719a0e3f327244 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:15 -0400
Subject: x86/compat: Don't build the 32-bit VDSO if not needed

Build the 32-bit vdso only for native 32-bit or 32-bit compat is
enabled.  x32 should not force it to build.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-7-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index e970320..96c0617 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -8,7 +8,7 @@ KASAN_SANITIZE := n
 VDSO64-$(CONFIG_X86_64)		:= y
 VDSOX32-$(CONFIG_X86_X32_ABI)	:= y
 VDSO32-$(CONFIG_X86_32)		:= y
-VDSO32-$(CONFIG_COMPAT)		:= y
+VDSO32-$(CONFIG_IA32_EMULATION)	:= y
 
 # files to link into the vdso
 vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
@@ -20,7 +20,7 @@ obj-y				+= vma.o
 vdso_img-$(VDSO64-y)		+= 64
 vdso_img-$(VDSOX32-y)		+= x32
 vdso_img-$(VDSO32-y)		+= 32-int80
-vdso_img-$(CONFIG_COMPAT)	+= 32-syscall
+vdso_img-$(CONFIG_IA32_EMULATION)	+= 32-syscall
 vdso_img-$(VDSO32-y)		+= 32-sysenter
 
 obj-$(VDSO32-y)			+= vdso32-setup.o
@@ -126,7 +126,7 @@ $(obj)/vdsox32.so.dbg: $(src)/vdsox32.lds $(vobjx32s) FORCE
 # Build multiple 32-bit vDSO images to choose from at boot time.
 #
 vdso32.so-$(VDSO32-y)		+= int80
-vdso32.so-$(CONFIG_COMPAT)	+= syscall
+vdso32.so-$(CONFIG_IA32_EMULATION)	+= syscall
 vdso32.so-$(VDSO32-y)		+= sysenter
 
 vdso32-images			= $(vdso32.so-y:%=vdso32-%.so)
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 1c9f750..4345431 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -177,7 +177,7 @@ up_fail:
 	return ret;
 }
 
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 static int load_vdso32(void)
 {
 	int ret;
@@ -219,8 +219,11 @@ int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
 		return map_vdso(&vdso_image_x32, true);
 	}
 #endif
-
+#ifdef CONFIG_IA32_EMULATION
 	return load_vdso32();
+#else
+	return 0;
+#endif
 }
 #endif
 #else
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 2bf67c0..141c561 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -78,7 +78,7 @@ typedef struct user_fxsr_struct elf_fpxregset_t;
 #ifdef CONFIG_X86_64
 extern unsigned int vdso64_enabled;
 #endif
-#if defined(CONFIG_X86_32) || defined(CONFIG_COMPAT)
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
 extern unsigned int vdso32_enabled;
 #endif
 
-- 
cgit v0.10.2


From c338867d0e4224771c68d0a7727289b86c23eccd Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:16 -0400
Subject: x86/compat: Check for both 32-bit compat and x32 in get_gate_vma()

Change this to CONFIG_COMPAT so both 32-bit compat and x32 will
do the check.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-8-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/vsyscall/vsyscall_64.c b/arch/x86/entry/vsyscall/vsyscall_64.c
index 2dcc6ff..26a46f4 100644
--- a/arch/x86/entry/vsyscall/vsyscall_64.c
+++ b/arch/x86/entry/vsyscall/vsyscall_64.c
@@ -290,7 +290,7 @@ static struct vm_area_struct gate_vma = {
 
 struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
 {
-#ifdef CONFIG_IA32_EMULATION
+#ifdef CONFIG_COMPAT
 	if (!mm || mm->context.ia32_compat)
 		return NULL;
 #endif
-- 
cgit v0.10.2


From 10ed34935e7e828ce4ce566647a2d6b8240e4dee Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:17 -0400
Subject: x86/compat, x86/perf: Don't build perf_callchain_user32() on x32

perf_callchain_user32() is not needed for x32.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-9-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 3658de4..641413d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2196,7 +2196,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	return get_desc_base(desc + idx);
 }
 
-#ifdef CONFIG_COMPAT
+#ifdef CONFIG_IA32_EMULATION
 
 #include <asm/compat.h>
 
-- 
cgit v0.10.2


From 5e2aad2460bd38d0777052486893b32902efcdcd Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:18 -0400
Subject: x86/compat: Remove unneeded #include

Including sys_ia32.h is not needed in signal.c.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-10-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 206996c..6c22aad 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -35,7 +35,6 @@
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
 #include <asm/ia32_unistd.h>
-#include <asm/sys_ia32.h>
 #endif /* CONFIG_X86_64 */
 
 #include <asm/syscall.h>
-- 
cgit v0.10.2


From 3bead553ab657d482c3fd6559a1fd7f024414a63 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:19 -0400
Subject: x86/compat: Define ARCH_WANT_OLD_COMPAT_IPC only for 32-bit compat

x32 does not need CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-11-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 55bced1..6e910ba 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2517,6 +2517,7 @@ config IA32_EMULATION
 	select BINFMT_ELF
 	select COMPAT_BINFMT_ELF
 	select HAVE_UID16
+	select ARCH_WANT_OLD_COMPAT_IPC
 	---help---
 	  Include code to run legacy 32-bit programs under a
 	  64-bit kernel. You should likely turn this on, unless you're
@@ -2544,7 +2545,6 @@ config X86_X32
 config COMPAT
 	def_bool y
 	depends on IA32_EMULATION || X86_X32
-	select ARCH_WANT_OLD_COMPAT_IPC
 
 if COMPAT
 config COMPAT_FOR_U64_ALIGNMENT
-- 
cgit v0.10.2


From 0c3619ea6756833e5c636c886cb55ca5b77f5d73 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:20 -0400
Subject: x86/compat: Clean up HAVE_UID16 config

Merge the 32-bit compat config setting for HAVE_UID16 with the
32-bit native one.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-12-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 6e910ba..d823a33 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -132,7 +132,7 @@ config X86
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_SYSCALL_TRACEPOINTS
-	select HAVE_UID16			if X86_32
+	select HAVE_UID16			if X86_32 || IA32_EMULATION
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_USER_RETURN_NOTIFIER
 	select IRQ_FORCED_THREADING
@@ -2516,7 +2516,6 @@ config IA32_EMULATION
 	depends on X86_64
 	select BINFMT_ELF
 	select COMPAT_BINFMT_ELF
-	select HAVE_UID16
 	select ARCH_WANT_OLD_COMPAT_IPC
 	---help---
 	  Include code to run legacy 32-bit programs under a
-- 
cgit v0.10.2


From 9b54050bfe438d9e1108211d28cb0b995b1f347c Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Mon, 22 Jun 2015 07:55:21 -0400
Subject: x86/compat: Separate ia32 and x32 compat ABIs

The x32 ABI is now independent of the ia32 compat ABI.  Common
code is now conditional on CONFIG_COMPAT, but unshared code like
syscall entry, signal handling, and the VDSO are under separate
config options.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1434974121-32575-13-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index d823a33..aa94fd0 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2530,7 +2530,7 @@ config IA32_AOUT
 
 config X86_X32
 	bool "x32 ABI for 64-bit mode"
-	depends on X86_64 && IA32_EMULATION
+	depends on X86_64
 	---help---
 	  Include code to run binaries for the x32 native 32-bit ABI
 	  for 64-bit processors.  An x32 process gets access to the
-- 
cgit v0.10.2


From 5e5c684a2c78b98dcba3d6fce56773a375f63980 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:18 -0700
Subject: x86/entry, selftests/x86: Add a test for 32-bit fast syscall arg
 faults

This test passes on 4.0 and fails on some newer kernels.
Fortunately, the failure is likely not a big deal.

This test will make sure that we don't break it further (e.g. OOPSing)
as we clean up the entry code and that we eventually fix the
regression.

There's arguably no need to preserve the old ABI here --
anything that makes it into a fast (vDSO) syscall with a bad
stack is about to crash no matter what we do.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/9cfcc51005168cb1b06b31991931214d770fc59a.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index caa60d5..e8df47e 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -5,7 +5,7 @@ include ../lib.mk
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
 TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs
-TARGETS_C_32BIT_ONLY := entry_from_vm86
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
diff --git a/tools/testing/selftests/x86/syscall_arg_fault.c b/tools/testing/selftests/x86/syscall_arg_fault.c
new file mode 100644
index 0000000..7db4fc9
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_arg_fault.c
@@ -0,0 +1,130 @@
+/*
+ * syscall_arg_fault.c - tests faults 32-bit fast syscall stack args
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ */
+
+#define _GNU_SOURCE
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/signal.h>
+#include <sys/ucontext.h>
+#include <err.h>
+#include <setjmp.h>
+#include <errno.h>
+
+/* Our sigaltstack scratch space. */
+static unsigned char altstack_data[SIGSTKSZ];
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static volatile sig_atomic_t sig_traps;
+static sigjmp_buf jmpbuf;
+
+static volatile sig_atomic_t n_errs;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+	if (ctx->uc_mcontext.gregs[REG_EAX] != -EFAULT) {
+		printf("[FAIL]\tAX had the wrong value: 0x%x\n",
+		       ctx->uc_mcontext.gregs[REG_EAX]);
+		n_errs++;
+	} else {
+		printf("[OK]\tSeems okay\n");
+	}
+
+	siglongjmp(jmpbuf, 1);
+}
+
+static void sigill(int sig, siginfo_t *info, void *ctx_void)
+{
+	printf("[SKIP]\tIllegal instruction\n");
+	siglongjmp(jmpbuf, 1);
+}
+
+int main()
+{
+	stack_t stack = {
+		.ss_sp = altstack_data,
+		.ss_size = SIGSTKSZ,
+	};
+	if (sigaltstack(&stack, NULL) != 0)
+		err(1, "sigaltstack");
+
+	sethandler(SIGSEGV, sigsegv, SA_ONSTACK);
+	sethandler(SIGILL, sigill, SA_ONSTACK);
+
+	/*
+	 * Exercise another nasty special case.  The 32-bit SYSCALL
+	 * and SYSENTER instructions (even in compat mode) each
+	 * clobber one register.  A Linux system call has a syscall
+	 * number and six arguments, and the user stack pointer
+	 * needs to live in some register on return.  That means
+	 * that we need eight registers, but SYSCALL and SYSENTER
+	 * only preserve seven registers.  As a result, one argument
+	 * ends up on the stack.  The stack is user memory, which
+	 * means that the kernel can fail to read it.
+	 *
+	 * The 32-bit fast system calls don't have a defined ABI:
+	 * we're supposed to invoke them through the vDSO.  So we'll
+	 * fudge it: we set all regs to invalid pointer values and
+	 * invoke the entry instruction.  The return will fail no
+	 * matter what, and we completely lose our program state,
+	 * but we can fix it up with a signal handler.
+	 */
+
+	printf("[RUN]\tSYSENTER with invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"sysenter"
+			: : : "memory", "flags");
+	}
+
+	printf("[RUN]\tSYSCALL with invalid state\n");
+	if (sigsetjmp(jmpbuf, 1) == 0) {
+		asm volatile (
+			"movl $-1, %%eax\n\t"
+			"movl $-1, %%ebx\n\t"
+			"movl $-1, %%ecx\n\t"
+			"movl $-1, %%edx\n\t"
+			"movl $-1, %%esi\n\t"
+			"movl $-1, %%edi\n\t"
+			"movl $-1, %%ebp\n\t"
+			"movl $-1, %%esp\n\t"
+			"syscall\n\t"
+			"pushl $0"	/* make sure we segfault cleanly */
+			: : : "memory", "flags");
+	}
+
+	return 0;
+}
-- 
cgit v0.10.2


From 5e99cb7c35ca0580da8e892f91c655d35ecf8798 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:19 -0700
Subject: x86/entry/64/compat: Fix bad fast syscall arg failure path

If user code does SYSCALL32 or SYSENTER without a valid stack,
then our attempt to determine the syscall args will result in a
failed uaccess fault.  Previously, we would try to recover by
jumping to the syscall exit code, but we'd run the syscall exit
work even though we never made it to the syscall entry work.

Clean it up by treating the failure path as a non-syscall entry
and exit pair.

This fixes strace's output when running the syscall_arg_fault
test. Without this fix, strace would get out of sync and would
fail to associate syscall entries with syscall exits.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/903010762c07a3d67df914fea2da84b52b0f8f1d.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bb2c43..141a5d4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -613,7 +613,7 @@ ret_from_intr:
 	testb	$3, CS(%rsp)
 	jz	retint_kernel
 	/* Interrupt came from user space */
-retint_user:
+GLOBAL(retint_user)
 	GET_THREAD_INFO(%rcx)
 
 	/* %rcx: thread info. Interrupts are off. */
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index b868cfc..e5ebdd9 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -428,8 +428,39 @@ cstar_tracesys:
 END(entry_SYSCALL_compat)
 
 ia32_badarg:
-	ASM_CLAC
-	movq	$-EFAULT, RAX(%rsp)
+	/*
+	 * So far, we've entered kernel mode, set AC, turned on IRQs, and
+	 * saved C regs except r8-r11.  We haven't done any of the other
+	 * standard entry work, though.  We want to bail, but we shouldn't
+	 * treat this as a syscall entry since we don't even know what the
+	 * args are.  Instead, treat this as a non-syscall entry, finish
+	 * the entry work, and immediately exit after setting AX = -EFAULT.
+	 *
+	 * We're really just being polite here.  Killing the task outright
+	 * would be a reasonable action, too.  Given that the only valid
+	 * way to have gotten here is through the vDSO, and we already know
+	 * that the stack pointer is bad, the task isn't going to survive
+	 * for long no matter what we do.
+	 */
+
+	ASM_CLAC			/* undo STAC */
+	movq	$-EFAULT, RAX(%rsp)	/* return -EFAULT if possible */
+
+	/* Fill in the rest of pt_regs */
+	xorl	%eax, %eax
+	movq	%rax, R11(%rsp)
+	movq	%rax, R10(%rsp)
+	movq	%rax, R9(%rsp)
+	movq	%rax, R8(%rsp)
+	SAVE_EXTRA_REGS
+
+	/* Turn IRQs back off. */
+	DISABLE_INTERRUPTS(CLBR_NONE)
+	TRACE_IRQS_OFF
+
+	/* And exit again. */
+	jmp retint_user
+
 ia32_ret_from_sys_call:
 	xorl	%eax, %eax		/* Do not leak kernel information */
 	movq	%rax, R11(%rsp)
-- 
cgit v0.10.2


From ccaee5f851470dec6894a6835b6fadffc2bb7514 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 3 Jul 2015 12:44:20 -0700
Subject: um: Fix do_signal() prototype

Once x86 exports its do_signal(), the prototypes will clash.

Fix the clash and also improve the code a bit: remove the
unnecessary kern_do_signal() indirection. This allows
interrupt_end() to share the 'regs' parameter calculation.

Also remove the unused return code to match x86.

Minimally build and boot tested.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Richard Weinberger <richard.weinberger@gmail.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/67c57eac09a589bac3c6c5ff22f9623ec55a184a.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/um/include/shared/kern_util.h b/arch/um/include/shared/kern_util.h
index 83a91f9..35ab97e 100644
--- a/arch/um/include/shared/kern_util.h
+++ b/arch/um/include/shared/kern_util.h
@@ -22,7 +22,8 @@ extern int kmalloc_ok;
 extern unsigned long alloc_stack(int order, int atomic);
 extern void free_stack(unsigned long stack, int order);
 
-extern int do_signal(void);
+struct pt_regs;
+extern void do_signal(struct pt_regs *regs);
 extern void interrupt_end(void);
 extern void relay_signal(int sig, struct siginfo *si, struct uml_pt_regs *regs);
 
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 68b9119..a6d9226 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -90,12 +90,14 @@ void *__switch_to(struct task_struct *from, struct task_struct *to)
 
 void interrupt_end(void)
 {
+	struct pt_regs *regs = &current->thread.regs;
+
 	if (need_resched())
 		schedule();
 	if (test_thread_flag(TIF_SIGPENDING))
-		do_signal();
+		do_signal(regs);
 	if (test_and_clear_thread_flag(TIF_NOTIFY_RESUME))
-		tracehook_notify_resume(&current->thread.regs);
+		tracehook_notify_resume(regs);
 }
 
 void exit_thread(void)
diff --git a/arch/um/kernel/signal.c b/arch/um/kernel/signal.c
index 4f60e4a..57acbd6 100644
--- a/arch/um/kernel/signal.c
+++ b/arch/um/kernel/signal.c
@@ -64,7 +64,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	signal_setup_done(err, ksig, singlestep);
 }
 
-static int kern_do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 	int handled_sig = 0;
@@ -110,10 +110,4 @@ static int kern_do_signal(struct pt_regs *regs)
 	 */
 	if (!handled_sig)
 		restore_saved_sigmask();
-	return handled_sig;
-}
-
-int do_signal(void)
-{
-	return kern_do_signal(&current->thread.regs);
 }
diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c
index f1b3eb1..2077248 100644
--- a/arch/um/kernel/tlb.c
+++ b/arch/um/kernel/tlb.c
@@ -291,7 +291,7 @@ void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
 		/* We are under mmap_sem, release it such that current can terminate */
 		up_write(&current->mm->mmap_sem);
 		force_sig(SIGKILL, current);
-		do_signal();
+		do_signal(&current->thread.regs);
 	}
 }
 
diff --git a/arch/um/kernel/trap.c b/arch/um/kernel/trap.c
index 557232f..d8a9fce 100644
--- a/arch/um/kernel/trap.c
+++ b/arch/um/kernel/trap.c
@@ -173,7 +173,7 @@ static void bad_segv(struct faultinfo fi, unsigned long ip)
 void fatal_sigsegv(void)
 {
 	force_sigsegv(SIGSEGV, current);
-	do_signal();
+	do_signal(&current->thread.regs);
 	/*
 	 * This is to tell gcc that we're not returning - do_signal
 	 * can, in general, return, but in this case, it's not, since
-- 
cgit v0.10.2


From f9281648ecd5081803bb2da84b9ccb0cf48436cd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:21 -0700
Subject: context_tracking: Add ct_state() and CT_WARN_ON()

This will let us sprinkle sanity checks around the kernel
without making too much of a mess.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/5da41fb2ceb29eac671f427c67040401ba2a1fa0.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h
index b96bd29..008fc67 100644
--- a/include/linux/context_tracking.h
+++ b/include/linux/context_tracking.h
@@ -49,13 +49,28 @@ static inline void exception_exit(enum ctx_state prev_ctx)
 	}
 }
 
+
+/**
+ * ct_state() - return the current context tracking state if known
+ *
+ * Returns the current cpu's context tracking state if context tracking
+ * is enabled.  If context tracking is disabled, returns
+ * CONTEXT_DISABLED.  This should be used primarily for debugging.
+ */
+static inline enum ctx_state ct_state(void)
+{
+	return context_tracking_is_enabled() ?
+		this_cpu_read(context_tracking.state) : CONTEXT_DISABLED;
+}
 #else
 static inline void user_enter(void) { }
 static inline void user_exit(void) { }
 static inline enum ctx_state exception_enter(void) { return 0; }
 static inline void exception_exit(enum ctx_state prev_ctx) { }
+static inline enum ctx_state ct_state(void) { return CONTEXT_DISABLED; }
 #endif /* !CONFIG_CONTEXT_TRACKING */
 
+#define CT_WARN_ON(cond) WARN_ON(context_tracking_is_enabled() && (cond))
 
 #ifdef CONFIG_CONTEXT_TRACKING_FORCE
 extern void context_tracking_init(void);
diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h
index 678ecdf..ee956c5 100644
--- a/include/linux/context_tracking_state.h
+++ b/include/linux/context_tracking_state.h
@@ -14,6 +14,7 @@ struct context_tracking {
 	bool active;
 	int recursion;
 	enum ctx_state {
+		CONTEXT_DISABLED = -1,	/* returned by ct_state() if unknown */
 		CONTEXT_KERNEL = 0,
 		CONTEXT_USER,
 		CONTEXT_GUEST,
-- 
cgit v0.10.2


From e727c7d7a11e109849582e9165d54b254eb181d7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:22 -0700
Subject: notifiers, RCU: Assert that RCU is watching in notify_die()

Low-level arch entries often call notify_die(), and it's easy for
arch code to fail to exit an RCU quiescent state first.  Assert
that we're not quiescent in notify_die().

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/1f5fe6c23d5b432a23267102f2d72b787d80fdd8.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/kernel/notifier.c b/kernel/notifier.c
index ae9fc7c..980e433 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -544,6 +544,8 @@ int notrace notify_die(enum die_val val, const char *str,
 		.signr	= sig,
 
 	};
+	rcu_lockdep_assert(rcu_is_watching(),
+			   "notify_die called but RCU thinks we're quiescent");
 	return atomic_notifier_call_chain(&die_chain, val, &args);
 }
 NOKPROBE_SYMBOL(notify_die);
-- 
cgit v0.10.2


From 1f484aa6904697f390027c12fba130fa94b20831 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:23 -0700
Subject: x86/entry: Move C entry and exit code to arch/x86/entry/common.c

The entry and exit C helpers were confusingly scattered between
ptrace.c and signal.c, even though they aren't specific to
ptrace or signal handling.  Move them together in a new file.

This change just moves code around.  It doesn't change anything.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/324d686821266544d8572423cc281f961da445f4.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index 7a14497..bd55ded 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -2,6 +2,7 @@
 # Makefile for the x86 low level entry code
 #
 obj-y				:= entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
+obj-y				+= common.o
 
 obj-y				+= vdso/
 obj-y				+= vsyscall/
diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
new file mode 100644
index 0000000..917d0c3
--- /dev/null
+++ b/arch/x86/entry/common.c
@@ -0,0 +1,253 @@
+/*
+ * common.c - C code for kernel entry and exit
+ * Copyright (c) 2015 Andrew Lutomirski
+ * GPL v2
+ *
+ * Based on asm and ptrace code by many authors.  The code here originated
+ * in ptrace.c and signal.c.
+ */
+
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/smp.h>
+#include <linux/errno.h>
+#include <linux/ptrace.h>
+#include <linux/tracehook.h>
+#include <linux/audit.h>
+#include <linux/seccomp.h>
+#include <linux/signal.h>
+#include <linux/export.h>
+#include <linux/context_tracking.h>
+#include <linux/user-return-notifier.h>
+#include <linux/uprobes.h>
+
+#include <asm/desc.h>
+#include <asm/traps.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/syscalls.h>
+
+static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
+{
+#ifdef CONFIG_X86_64
+	if (arch == AUDIT_ARCH_X86_64) {
+		audit_syscall_entry(regs->orig_ax, regs->di,
+				    regs->si, regs->dx, regs->r10);
+	} else
+#endif
+	{
+		audit_syscall_entry(regs->orig_ax, regs->bx,
+				    regs->cx, regs->dx, regs->si);
+	}
+}
+
+/*
+ * We can return 0 to resume the syscall or anything else to go to phase
+ * 2.  If we resume the syscall, we need to put something appropriate in
+ * regs->orig_ax.
+ *
+ * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
+ * are fully functional.
+ *
+ * For phase 2's benefit, our return value is:
+ * 0:			resume the syscall
+ * 1:			go to phase 2; no seccomp phase 2 needed
+ * anything else:	go to phase 2; pass return value to seccomp
+ */
+unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
+{
+	unsigned long ret = 0;
+	u32 work;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	/*
+	 * If TIF_NOHZ is set, we are required to call user_exit() before
+	 * doing anything that could touch RCU.
+	 */
+	if (work & _TIF_NOHZ) {
+		user_exit();
+		work &= ~_TIF_NOHZ;
+	}
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Do seccomp first -- it should minimize exposure of other
+	 * code, and keeping seccomp fast is probably more valuable
+	 * than the rest of this.
+	 */
+	if (work & _TIF_SECCOMP) {
+		struct seccomp_data sd;
+
+		sd.arch = arch;
+		sd.nr = regs->orig_ax;
+		sd.instruction_pointer = regs->ip;
+#ifdef CONFIG_X86_64
+		if (arch == AUDIT_ARCH_X86_64) {
+			sd.args[0] = regs->di;
+			sd.args[1] = regs->si;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->r10;
+			sd.args[4] = regs->r8;
+			sd.args[5] = regs->r9;
+		} else
+#endif
+		{
+			sd.args[0] = regs->bx;
+			sd.args[1] = regs->cx;
+			sd.args[2] = regs->dx;
+			sd.args[3] = regs->si;
+			sd.args[4] = regs->di;
+			sd.args[5] = regs->bp;
+		}
+
+		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
+		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
+
+		ret = seccomp_phase1(&sd);
+		if (ret == SECCOMP_PHASE1_SKIP) {
+			regs->orig_ax = -1;
+			ret = 0;
+		} else if (ret != SECCOMP_PHASE1_OK) {
+			return ret;  /* Go directly to phase 2 */
+		}
+
+		work &= ~_TIF_SECCOMP;
+	}
+#endif
+
+	/* Do our best to finish without phase 2. */
+	if (work == 0)
+		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
+
+#ifdef CONFIG_AUDITSYSCALL
+	if (work == _TIF_SYSCALL_AUDIT) {
+		/*
+		 * If there is no more work to be done except auditing,
+		 * then audit in phase 1.  Phase 2 always audits, so, if
+		 * we audit here, then we can't go on to phase 2.
+		 */
+		do_audit_syscall_entry(regs, arch);
+		return 0;
+	}
+#endif
+
+	return 1;  /* Something is enabled that we can't handle in phase 1 */
+}
+
+/* Returns the syscall nr to run (which should match regs->orig_ax). */
+long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
+				unsigned long phase1_result)
+{
+	long ret = 0;
+	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
+		_TIF_WORK_SYSCALL_ENTRY;
+
+	BUG_ON(regs != task_pt_regs(current));
+
+	/*
+	 * If we stepped into a sysenter/syscall insn, it trapped in
+	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
+	 * If user-mode had set TF itself, then it's still clear from
+	 * do_debug() and we need to set it again to restore the user
+	 * state.  If we entered on the slow path, TF was already set.
+	 */
+	if (work & _TIF_SINGLESTEP)
+		regs->flags |= X86_EFLAGS_TF;
+
+#ifdef CONFIG_SECCOMP
+	/*
+	 * Call seccomp_phase2 before running the other hooks so that
+	 * they can see any changes made by a seccomp tracer.
+	 */
+	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
+		/* seccomp failures shouldn't expose any additional code. */
+		return -1;
+	}
+#endif
+
+	if (unlikely(work & _TIF_SYSCALL_EMU))
+		ret = -1L;
+
+	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
+	    tracehook_report_syscall_entry(regs))
+		ret = -1L;
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_enter(regs, regs->orig_ax);
+
+	do_audit_syscall_entry(regs, arch);
+
+	return ret ?: regs->orig_ax;
+}
+
+long syscall_trace_enter(struct pt_regs *regs)
+{
+	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
+	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
+
+	if (phase1_result == 0)
+		return regs->orig_ax;
+	else
+		return syscall_trace_enter_phase2(regs, arch, phase1_result);
+}
+
+void syscall_trace_leave(struct pt_regs *regs)
+{
+	bool step;
+
+	/*
+	 * We may come here right after calling schedule_user()
+	 * or do_notify_resume(), in which case we can be in RCU
+	 * user mode.
+	 */
+	user_exit();
+
+	audit_syscall_exit(regs);
+
+	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
+		trace_sys_exit(regs, regs->ax);
+
+	/*
+	 * If TIF_SYSCALL_EMU is set, we only get here because of
+	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+	 * We already reported this syscall instruction in
+	 * syscall_trace_enter().
+	 */
+	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
+			!test_thread_flag(TIF_SYSCALL_EMU);
+	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
+		tracehook_report_syscall_exit(regs, step);
+
+	user_enter();
+}
+
+/*
+ * notification of userspace execution resumption
+ * - triggered by the TIF_WORK_MASK flags
+ */
+__visible void
+do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
+{
+	user_exit();
+
+	if (thread_info_flags & _TIF_UPROBE)
+		uprobe_notify_resume(regs);
+
+	/* deal with pending signal delivery */
+	if (thread_info_flags & _TIF_SIGPENDING)
+		do_signal(regs);
+
+	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+		clear_thread_flag(TIF_NOTIFY_RESUME);
+		tracehook_notify_resume(regs);
+	}
+	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
+		fire_user_return_notifiers();
+
+	user_enter();
+}
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index 31eab86..b42408b 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -30,6 +30,7 @@ typedef sigset_t compat_sigset_t;
 #endif /* __ASSEMBLY__ */
 #include <uapi/asm/signal.h>
 #ifndef __ASSEMBLY__
+extern void do_signal(struct pt_regs *regs);
 extern void do_notify_resume(struct pt_regs *, void *, __u32);
 
 #define __ARCH_HAS_SA_RESTORER
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7155957..558f50e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -37,12 +37,10 @@
 #include <asm/proto.h>
 #include <asm/hw_breakpoint.h>
 #include <asm/traps.h>
+#include <asm/syscall.h>
 
 #include "tls.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/syscalls.h>
-
 enum x86_regset {
 	REGSET_GENERAL,
 	REGSET_FP,
@@ -1444,201 +1442,3 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
 	/* Send us the fake SIGTRAP */
 	force_sig_info(SIGTRAP, &info, tsk);
 }
-
-static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
-{
-#ifdef CONFIG_X86_64
-	if (arch == AUDIT_ARCH_X86_64) {
-		audit_syscall_entry(regs->orig_ax, regs->di,
-				    regs->si, regs->dx, regs->r10);
-	} else
-#endif
-	{
-		audit_syscall_entry(regs->orig_ax, regs->bx,
-				    regs->cx, regs->dx, regs->si);
-	}
-}
-
-/*
- * We can return 0 to resume the syscall or anything else to go to phase
- * 2.  If we resume the syscall, we need to put something appropriate in
- * regs->orig_ax.
- *
- * NB: We don't have full pt_regs here, but regs->orig_ax and regs->ax
- * are fully functional.
- *
- * For phase 2's benefit, our return value is:
- * 0:			resume the syscall
- * 1:			go to phase 2; no seccomp phase 2 needed
- * anything else:	go to phase 2; pass return value to seccomp
- */
-unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
-{
-	unsigned long ret = 0;
-	u32 work;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	/*
-	 * If TIF_NOHZ is set, we are required to call user_exit() before
-	 * doing anything that could touch RCU.
-	 */
-	if (work & _TIF_NOHZ) {
-		user_exit();
-		work &= ~_TIF_NOHZ;
-	}
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Do seccomp first -- it should minimize exposure of other
-	 * code, and keeping seccomp fast is probably more valuable
-	 * than the rest of this.
-	 */
-	if (work & _TIF_SECCOMP) {
-		struct seccomp_data sd;
-
-		sd.arch = arch;
-		sd.nr = regs->orig_ax;
-		sd.instruction_pointer = regs->ip;
-#ifdef CONFIG_X86_64
-		if (arch == AUDIT_ARCH_X86_64) {
-			sd.args[0] = regs->di;
-			sd.args[1] = regs->si;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->r10;
-			sd.args[4] = regs->r8;
-			sd.args[5] = regs->r9;
-		} else
-#endif
-		{
-			sd.args[0] = regs->bx;
-			sd.args[1] = regs->cx;
-			sd.args[2] = regs->dx;
-			sd.args[3] = regs->si;
-			sd.args[4] = regs->di;
-			sd.args[5] = regs->bp;
-		}
-
-		BUILD_BUG_ON(SECCOMP_PHASE1_OK != 0);
-		BUILD_BUG_ON(SECCOMP_PHASE1_SKIP != 1);
-
-		ret = seccomp_phase1(&sd);
-		if (ret == SECCOMP_PHASE1_SKIP) {
-			regs->orig_ax = -1;
-			ret = 0;
-		} else if (ret != SECCOMP_PHASE1_OK) {
-			return ret;  /* Go directly to phase 2 */
-		}
-
-		work &= ~_TIF_SECCOMP;
-	}
-#endif
-
-	/* Do our best to finish without phase 2. */
-	if (work == 0)
-		return ret;  /* seccomp and/or nohz only (ret == 0 here) */
-
-#ifdef CONFIG_AUDITSYSCALL
-	if (work == _TIF_SYSCALL_AUDIT) {
-		/*
-		 * If there is no more work to be done except auditing,
-		 * then audit in phase 1.  Phase 2 always audits, so, if
-		 * we audit here, then we can't go on to phase 2.
-		 */
-		do_audit_syscall_entry(regs, arch);
-		return 0;
-	}
-#endif
-
-	return 1;  /* Something is enabled that we can't handle in phase 1 */
-}
-
-/* Returns the syscall nr to run (which should match regs->orig_ax). */
-long syscall_trace_enter_phase2(struct pt_regs *regs, u32 arch,
-				unsigned long phase1_result)
-{
-	long ret = 0;
-	u32 work = ACCESS_ONCE(current_thread_info()->flags) &
-		_TIF_WORK_SYSCALL_ENTRY;
-
-	BUG_ON(regs != task_pt_regs(current));
-
-	/*
-	 * If we stepped into a sysenter/syscall insn, it trapped in
-	 * kernel mode; do_debug() cleared TF and set TIF_SINGLESTEP.
-	 * If user-mode had set TF itself, then it's still clear from
-	 * do_debug() and we need to set it again to restore the user
-	 * state.  If we entered on the slow path, TF was already set.
-	 */
-	if (work & _TIF_SINGLESTEP)
-		regs->flags |= X86_EFLAGS_TF;
-
-#ifdef CONFIG_SECCOMP
-	/*
-	 * Call seccomp_phase2 before running the other hooks so that
-	 * they can see any changes made by a seccomp tracer.
-	 */
-	if (phase1_result > 1 && seccomp_phase2(phase1_result)) {
-		/* seccomp failures shouldn't expose any additional code. */
-		return -1;
-	}
-#endif
-
-	if (unlikely(work & _TIF_SYSCALL_EMU))
-		ret = -1L;
-
-	if ((ret || test_thread_flag(TIF_SYSCALL_TRACE)) &&
-	    tracehook_report_syscall_entry(regs))
-		ret = -1L;
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_enter(regs, regs->orig_ax);
-
-	do_audit_syscall_entry(regs, arch);
-
-	return ret ?: regs->orig_ax;
-}
-
-long syscall_trace_enter(struct pt_regs *regs)
-{
-	u32 arch = is_ia32_task() ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
-	unsigned long phase1_result = syscall_trace_enter_phase1(regs, arch);
-
-	if (phase1_result == 0)
-		return regs->orig_ax;
-	else
-		return syscall_trace_enter_phase2(regs, arch, phase1_result);
-}
-
-void syscall_trace_leave(struct pt_regs *regs)
-{
-	bool step;
-
-	/*
-	 * We may come here right after calling schedule_user()
-	 * or do_notify_resume(), in which case we can be in RCU
-	 * user mode.
-	 */
-	user_exit();
-
-	audit_syscall_exit(regs);
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_exit(regs, regs->ax);
-
-	/*
-	 * If TIF_SYSCALL_EMU is set, we only get here because of
-	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
-	 * We already reported this syscall instruction in
-	 * syscall_trace_enter().
-	 */
-	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
-			!test_thread_flag(TIF_SYSCALL_EMU);
-	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
-
-	user_enter();
-}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6c22aad..7e88cc7 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -700,7 +700,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
  * want to handle. Thus you cannot kill init even with a SIGKILL even by
  * mistake.
  */
-static void do_signal(struct pt_regs *regs)
+void do_signal(struct pt_regs *regs)
 {
 	struct ksignal ksig;
 
@@ -735,32 +735,6 @@ static void do_signal(struct pt_regs *regs)
 	restore_saved_sigmask();
 }
 
-/*
- * notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-	user_exit();
-
-	if (thread_info_flags & _TIF_UPROBE)
-		uprobe_notify_resume(regs);
-
-	/* deal with pending signal delivery */
-	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(regs);
-
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
-		clear_thread_flag(TIF_NOTIFY_RESUME);
-		tracehook_notify_resume(regs);
-	}
-	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
-		fire_user_return_notifiers();
-
-	user_enter();
-}
-
 void signal_fault(struct pt_regs *regs, void __user *frame, char *where)
 {
 	struct task_struct *me = current;
-- 
cgit v0.10.2


From 02fdcd5eac9d653d1addbd69b0c58d73650e1c00 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:24 -0700
Subject: x86/traps, context_tracking: Assert that we're in CONTEXT_KERNEL in
 exception entries

Other than the super-atomic exception entries, all exception
entries are supposed to switch our context tracking state to
CONTEXT_KERNEL. Assert that they do.  These assertions appear
trivial at this point, as exception_enter() is the function
responsible for switching context, but I'm planning on reworking
x86's exception context tracking, and these assertions will help
make sure that all of this code keeps working.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/20fa1ee2d943233a184aaf96ff75394d3b34dfba.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index f579192..2a783c4 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -292,6 +292,8 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 	enum ctx_state prev_state = exception_enter();
 	siginfo_t info;
 
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
 		conditional_sti(regs);
@@ -376,6 +378,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	siginfo_t *info;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 		goto exit;
@@ -457,6 +460,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	conditional_sti(regs);
 
 	if (v8086_mode(regs)) {
@@ -514,6 +518,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 		return;
 
 	prev_state = ist_enter(regs);
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
@@ -750,6 +755,7 @@ dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_MF);
 	exception_exit(prev_state);
 }
@@ -760,6 +766,7 @@ do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_XF);
 	exception_exit(prev_state);
 }
@@ -776,6 +783,7 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	BUG_ON(use_eager_fpu());
 
 #ifdef CONFIG_MATH_EMULATION
@@ -805,6 +813,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 	enum ctx_state prev_state;
 
 	prev_state = exception_enter();
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	local_irq_enable();
 
 	info.si_signo = SIGILL;
-- 
cgit v0.10.2


From feed36cde0a10adb957445a37e48f957f30b2273 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:25 -0700
Subject: x86/entry: Add enter_from_user_mode() and use it in syscalls

Changing the x86 context tracking hooks is dangerous because
there are no good checks that we track our context correctly.
Add a helper to check that we're actually in CONTEXT_USER when
we enter from user mode and wire it up for syscall entries.

Subsequent patches will wire this up for all non-NMI entries as
well.  NMIs are their own special beast and cannot currently
switch overall context tracking state.  Instead, they have their
own special RCU hooks.

This is a tiny speedup if !CONFIG_CONTEXT_TRACKING (removes a
branch) and a tiny slowdown if CONFIG_CONTEXT_TRACING (adds a
layer of indirection).  Eventually, we should fix up the core
context tracking code to supply a function that does what we
want (and can be much simpler than user_exit), which will enable
us to get rid of the extra call.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/853b42420066ec3fb856779cdc223a6dcb5d355b.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 917d0c3..9a327ee 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -28,6 +28,15 @@
 #define CREATE_TRACE_POINTS
 #include <trace/events/syscalls.h>
 
+#ifdef CONFIG_CONTEXT_TRACKING
+/* Called on entry from user mode with IRQs off. */
+__visible void enter_from_user_mode(void)
+{
+	CT_WARN_ON(ct_state() != CONTEXT_USER);
+	user_exit();
+}
+#endif
+
 static void do_audit_syscall_entry(struct pt_regs *regs, u32 arch)
 {
 #ifdef CONFIG_X86_64
@@ -65,14 +74,16 @@ unsigned long syscall_trace_enter_phase1(struct pt_regs *regs, u32 arch)
 	work = ACCESS_ONCE(current_thread_info()->flags) &
 		_TIF_WORK_SYSCALL_ENTRY;
 
+#ifdef CONFIG_CONTEXT_TRACKING
 	/*
 	 * If TIF_NOHZ is set, we are required to call user_exit() before
 	 * doing anything that could touch RCU.
 	 */
 	if (work & _TIF_NOHZ) {
-		user_exit();
+		enter_from_user_mode();
 		work &= ~_TIF_NOHZ;
 	}
+#endif
 
 #ifdef CONFIG_SECCOMP
 	/*
-- 
cgit v0.10.2


From c5c46f59e4e7c1ab244b8d38f2b61d317df90bba Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:26 -0700
Subject: x86/entry: Add new, comprehensible entry and exit handlers written in
 C

The current x86 entry and exit code, written in a mixture of assembly and
C code, is incomprehensible due to being open-coded in a lot of places
without coherent documentation.

It appears to work primary by luck and duct tape: i.e. obvious runtime
failures were fixed on-demand, without re-thinking the design.

Due to those reasons our confidence level in that code is low, and it is
very difficult to incrementally improve.

Add new code written in C, in preparation for simply deleting the old
entry code.

prepare_exit_to_usermode() is a new function that will handle all
slow path exits to user mode.  It is called with IRQs disabled
and it leaves us in a state in which it is safe to immediately
return to user mode.  IRQs must not be re-enabled at any point
after prepare_exit_to_usermode() returns and user mode is actually
entered. (We can, of course, fail to enter user mode and treat
that failure as a fresh entry to kernel mode.)

All callers of do_notify_resume() will be migrated to call
prepare_exit_to_usermode() instead; prepare_exit_to_usermode() needs
to do everything that do_notify_resume() does today, but it also
takes care of scheduling and context tracking.  Unlike
do_notify_resume(), it does not need to be called in a loop.

syscall_return_slowpath() is exactly what it sounds like: it will
be called on any syscall exit slow path. It will replace
syscall_trace_leave() and it calls prepare_exit_to_usermode() on the
way out.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/c57c8b87661a4152801d7d3786eac2d1a2f209dd.1435952415.git.luto@kernel.org
[ Improved the changelog a bit. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 9a327ee..febc530 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -207,6 +207,7 @@ long syscall_trace_enter(struct pt_regs *regs)
 		return syscall_trace_enter_phase2(regs, arch, phase1_result);
 }
 
+/* Deprecated. */
 void syscall_trace_leave(struct pt_regs *regs)
 {
 	bool step;
@@ -237,8 +238,117 @@ void syscall_trace_leave(struct pt_regs *regs)
 	user_enter();
 }
 
+static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
+{
+	unsigned long top_of_stack =
+		(unsigned long)(regs + 1) + TOP_OF_KERNEL_STACK_PADDING;
+	return (struct thread_info *)(top_of_stack - THREAD_SIZE);
+}
+
+/* Called with IRQs disabled. */
+__visible void prepare_exit_to_usermode(struct pt_regs *regs)
+{
+	if (WARN_ON(!irqs_disabled()))
+		local_irq_disable();
+
+	/*
+	 * In order to return to user mode, we need to have IRQs off with
+	 * none of _TIF_SIGPENDING, _TIF_NOTIFY_RESUME, _TIF_USER_RETURN_NOTIFY,
+	 * _TIF_UPROBE, or _TIF_NEED_RESCHED set.  Several of these flags
+	 * can be set at any time on preemptable kernels if we have IRQs on,
+	 * so we need to loop.  Disabling preemption wouldn't help: doing the
+	 * work to clear some of the flags can sleep.
+	 */
+	while (true) {
+		u32 cached_flags =
+			READ_ONCE(pt_regs_to_thread_info(regs)->flags);
+
+		if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
+				      _TIF_UPROBE | _TIF_NEED_RESCHED)))
+			break;
+
+		/* We have work to do. */
+		local_irq_enable();
+
+		if (cached_flags & _TIF_NEED_RESCHED)
+			schedule();
+
+		if (cached_flags & _TIF_UPROBE)
+			uprobe_notify_resume(regs);
+
+		/* deal with pending signal delivery */
+		if (cached_flags & _TIF_SIGPENDING)
+			do_signal(regs);
+
+		if (cached_flags & _TIF_NOTIFY_RESUME) {
+			clear_thread_flag(TIF_NOTIFY_RESUME);
+			tracehook_notify_resume(regs);
+		}
+
+		if (cached_flags & _TIF_USER_RETURN_NOTIFY)
+			fire_user_return_notifiers();
+
+		/* Disable IRQs and retry */
+		local_irq_disable();
+	}
+
+	user_enter();
+}
+
+/*
+ * Called with IRQs on and fully valid regs.  Returns with IRQs off in a
+ * state such that we can immediately switch to user mode.
+ */
+__visible void syscall_return_slowpath(struct pt_regs *regs)
+{
+	struct thread_info *ti = pt_regs_to_thread_info(regs);
+	u32 cached_flags = READ_ONCE(ti->flags);
+	bool step;
+
+	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+
+	if (WARN(irqs_disabled(), "syscall %ld left IRQs disabled",
+		 regs->orig_ax))
+		local_irq_enable();
+
+	/*
+	 * First do one-time work.  If these work items are enabled, we
+	 * want to run them exactly once per syscall exit with IRQs on.
+	 */
+	if (cached_flags & (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT |
+			    _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT)) {
+		audit_syscall_exit(regs);
+
+		if (cached_flags & _TIF_SYSCALL_TRACEPOINT)
+			trace_sys_exit(regs, regs->ax);
+
+		/*
+		 * If TIF_SYSCALL_EMU is set, we only get here because of
+		 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
+		 * We already reported this syscall instruction in
+		 * syscall_trace_enter().
+		 */
+		step = unlikely(
+			(cached_flags & (_TIF_SINGLESTEP | _TIF_SYSCALL_EMU))
+			== _TIF_SINGLESTEP);
+		if (step || cached_flags & _TIF_SYSCALL_TRACE)
+			tracehook_report_syscall_exit(regs, step);
+	}
+
+#ifdef CONFIG_COMPAT
+	/*
+	 * Compat syscalls set TS_COMPAT.  Make sure we clear it before
+	 * returning to user mode.
+	 */
+	ti->status &= ~TS_COMPAT;
+#endif
+
+	local_irq_disable();
+	prepare_exit_to_usermode(regs);
+}
+
 /*
- * notification of userspace execution resumption
+ * Deprecated notification of userspace execution resumption
  * - triggered by the TIF_WORK_MASK flags
  */
 __visible void
-- 
cgit v0.10.2


From cb6f64ed5a04036eef07e70b57dd5dd78f2fbcef Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:27 -0700
Subject: x86/entry/64: Really create an error-entry-from-usermode code path

In 539f51136500 ("x86/asm/entry/64: Disentangle error_entry/exit
gsbase/ebx/usermode code"), I arranged the code slightly wrong
-- IRET faults would skip the code path that was intended to
execute on all error entries from user mode.  Fix it up.

While we're at it, make all the labels in error_entry local.

This does not fix a bug, but we'll need it, and it slightly
shrinks the code.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/91e17891e49fa3d61357eadc451529ad48143ee1.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 141a5d4..ccfcba9 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1143,12 +1143,17 @@ ENTRY(error_entry)
 	SAVE_EXTRA_REGS 8
 	xorl	%ebx, %ebx
 	testb	$3, CS+8(%rsp)
-	jz	error_kernelspace
+	jz	.Lerror_kernelspace
 
-	/* We entered from user mode */
+.Lerror_entry_from_usermode_swapgs:
+	/*
+	 * We entered from user mode or we're pretending to have entered
+	 * from user mode due to an IRET fault.
+	 */
 	SWAPGS
 
-error_entry_done:
+.Lerror_entry_from_usermode_after_swapgs:
+.Lerror_entry_done:
 	TRACE_IRQS_OFF
 	ret
 
@@ -1158,31 +1163,30 @@ error_entry_done:
 	 * truncated RIP for IRET exceptions returning to compat mode. Check
 	 * for these here too.
 	 */
-error_kernelspace:
+.Lerror_kernelspace:
 	incl	%ebx
 	leaq	native_irq_return_iret(%rip), %rcx
 	cmpq	%rcx, RIP+8(%rsp)
-	je	error_bad_iret
+	je	.Lerror_bad_iret
 	movl	%ecx, %eax			/* zero extend */
 	cmpq	%rax, RIP+8(%rsp)
-	je	bstep_iret
+	je	.Lbstep_iret
 	cmpq	$gs_change, RIP+8(%rsp)
-	jne	error_entry_done
+	jne	.Lerror_entry_done
 
 	/*
 	 * hack: gs_change can fail with user gsbase.  If this happens, fix up
 	 * gsbase and proceed.  We'll fix up the exception and land in
 	 * gs_change's error handler with kernel gsbase.
 	 */
-	SWAPGS
-	jmp	error_entry_done
+	jmp	.Lerror_entry_from_usermode_swapgs
 
-bstep_iret:
+.Lbstep_iret:
 	/* Fix truncated RIP */
 	movq	%rcx, RIP+8(%rsp)
 	/* fall through */
 
-error_bad_iret:
+.Lerror_bad_iret:
 	/*
 	 * We came from an IRET to user mode, so we have user gsbase.
 	 * Switch to kernel gsbase:
@@ -1198,7 +1202,7 @@ error_bad_iret:
 	call	fixup_bad_iret
 	mov	%rax, %rsp
 	decl	%ebx
-	jmp	error_entry_done
+	jmp	.Lerror_entry_from_usermode_after_swapgs
 END(error_entry)
 
 
-- 
cgit v0.10.2


From 29ea1b258b98a862e59d72556714b75051ae93fb Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:28 -0700
Subject: x86/entry/64: Migrate 64-bit and compat syscalls to the new exit
 handlers and remove old assembly code

These need to be migrated together, as the compat case used to
jump into the middle of the 64-bit exit code.

Remove the old assembly code.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/d4d1d70de08ac3640badf50048a9e8f18fe2497f.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index ccfcba9..4ca5b78 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -229,6 +229,11 @@ entry_SYSCALL_64_fastpath:
 	 */
 	USERGS_SYSRET64
 
+GLOBAL(int_ret_from_sys_call_irqs_off)
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+	jmp int_ret_from_sys_call
+
 	/* Do syscall entry tracing */
 tracesys:
 	movq	%rsp, %rdi
@@ -272,69 +277,11 @@ tracesys_phase2:
  * Has correct iret frame.
  */
 GLOBAL(int_ret_from_sys_call)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
-	TRACE_IRQS_OFF
-	movl	$_TIF_ALLWORK_MASK, %edi
-	/* edi:	mask to check */
-GLOBAL(int_with_check)
-	LOCKDEP_SYS_EXIT_IRQ
-	GET_THREAD_INFO(%rcx)
-	movl	TI_flags(%rcx), %edx
-	andl	%edi, %edx
-	jnz	int_careful
-	andl	$~TS_COMPAT, TI_status(%rcx)
-	jmp	syscall_return
-
-	/*
-	 * Either reschedule or signal or syscall exit tracking needed.
-	 * First do a reschedule test.
-	 * edx:	work, edi: workmask
-	 */
-int_careful:
-	bt	$TIF_NEED_RESCHED, %edx
-	jnc	int_very_careful
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq	%rdi
-	SCHEDULE_USER
-	popq	%rdi
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	int_with_check
-
-	/* handle signals and tracing -- both require a full pt_regs */
-int_very_careful:
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
 	SAVE_EXTRA_REGS
-	/* Check for syscall exit trace */
-	testl	$_TIF_WORK_SYSCALL_EXIT, %edx
-	jz	int_signal
-	pushq	%rdi
-	leaq	8(%rsp), %rdi			/* &ptregs -> arg1 */
-	call	syscall_trace_leave
-	popq	%rdi
-	andl	$~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU), %edi
-	jmp	int_restore_rest
-
-int_signal:
-	testl	$_TIF_DO_NOTIFY_MASK, %edx
-	jz	1f
-	movq	%rsp, %rdi			/* &ptregs -> arg1 */
-	xorl	%esi, %esi			/* oldset -> arg2 */
-	call	do_notify_resume
-1:	movl	$_TIF_WORK_MASK, %edi
-int_restore_rest:
+	movq	%rsp, %rdi
+	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	RESTORE_EXTRA_REGS
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	int_with_check
-
-syscall_return:
-	/* The IRETQ could re-enable interrupts: */
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_IRETQ
+	TRACE_IRQS_IRETQ		/* we're about to change IF */
 
 	/*
 	 * Try to use SYSRET instead of IRET if we're returning to
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index e5ebdd9..d9bbd31 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -210,10 +210,10 @@ sysexit_from_sys_call:
 	.endm
 
 	.macro auditsys_exit exit
-	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
-	jnz	ia32_ret_from_sys_call
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
+	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+	jnz	ia32_ret_from_sys_call
 	movl	%eax, %esi		/* second arg, syscall return value */
 	cmpl	$-MAX_ERRNO, %eax	/* is it an error ? */
 	jbe	1f
@@ -232,7 +232,7 @@ sysexit_from_sys_call:
 	movq	%rax, R10(%rsp)
 	movq	%rax, R9(%rsp)
 	movq	%rax, R8(%rsp)
-	jmp	int_with_check
+	jmp	int_ret_from_sys_call_irqs_off
 	.endm
 
 sysenter_auditsys:
-- 
cgit v0.10.2


From ff467594f2a4be01a0fa5e9ffc223fa930d232dd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:29 -0700
Subject: x86/asm/entry/64: Save all regs on interrupt entry

To prepare for the big rewrite of the error and interrupt exit
paths, we will need pt_regs completely filled in.

It's already completely filled in when error_exit runs, so rearrange
interrupt handling to match it.  This will slow down interrupt
handling very slightly (eight instructions), but the
simplification it enables will be more than worth it.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/d8a766a7f558b30e6e01352854628a2d9943460c.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 519207f..3c71dd9 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -135,9 +135,6 @@ For 32-bit we have the following conventions - kernel is built with
 	movq %rbp, 4*8+\offset(%rsp)
 	movq %rbx, 5*8+\offset(%rsp)
 	.endm
-	.macro SAVE_EXTRA_REGS_RBP offset=0
-	movq %rbp, 4*8+\offset(%rsp)
-	.endm
 
 	.macro RESTORE_EXTRA_REGS offset=0
 	movq 0*8+\offset(%rsp), %r15
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 4ca5b78..65029f4 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -502,21 +502,13 @@ END(irq_entries_start)
 /* 0(%rsp): ~(interrupt number) */
 	.macro interrupt func
 	cld
-	/*
-	 * Since nothing in interrupt handling code touches r12...r15 members
-	 * of "struct pt_regs", and since interrupts can nest, we can save
-	 * four stack slots and simultaneously provide
-	 * an unwind-friendly stack layout by saving "truncated" pt_regs
-	 * exactly up to rbp slot, without these members.
-	 */
-	ALLOC_PT_GPREGS_ON_STACK -RBP
-	SAVE_C_REGS -RBP
-	/* this goes to 0(%rsp) for unwinder, not for saving the value: */
-	SAVE_EXTRA_REGS_RBP -RBP
+	ALLOC_PT_GPREGS_ON_STACK
+	SAVE_C_REGS
+	SAVE_EXTRA_REGS
 
-	leaq	-RBP(%rsp), %rdi		/* arg1 for \func (pointer to pt_regs) */
+	movq	%rsp,%rdi	/* arg1 for \func (pointer to pt_regs) */
 
-	testb	$3, CS-RBP(%rsp)
+	testb	$3, CS(%rsp)
 	jz	1f
 	SWAPGS
 1:
@@ -553,9 +545,7 @@ ret_from_intr:
 	decl	PER_CPU_VAR(irq_count)
 
 	/* Restore saved previous stack */
-	popq	%rsi
-	/* return code expects complete pt_regs - adjust rsp accordingly: */
-	leaq	-RBP(%rsi), %rsp
+	popq	%rsp
 
 	testb	$3, CS(%rsp)
 	jz	retint_kernel
@@ -580,7 +570,7 @@ retint_swapgs:					/* return to user-space */
 	TRACE_IRQS_IRETQ
 
 	SWAPGS
-	jmp	restore_c_regs_and_iret
+	jmp	restore_regs_and_iret
 
 /* Returning to kernel space */
 retint_kernel:
@@ -604,6 +594,8 @@ retint_kernel:
  * At this label, code paths which return to kernel and to user,
  * which come from interrupts/exception and from syscalls, merge.
  */
+restore_regs_and_iret:
+	RESTORE_EXTRA_REGS
 restore_c_regs_and_iret:
 	RESTORE_C_REGS
 	REMOVE_PT_GPREGS_FROM_STACK 8
@@ -674,12 +666,10 @@ retint_signal:
 	jz	retint_swapgs
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	SAVE_EXTRA_REGS
 	movq	$-1, ORIG_RAX(%rsp)
 	xorl	%esi, %esi			/* oldset */
 	movq	%rsp, %rdi			/* &pt_regs */
 	call	do_notify_resume
-	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	GET_THREAD_INFO(%rcx)
@@ -1160,7 +1150,6 @@ END(error_entry)
  */
 ENTRY(error_exit)
 	movl	%ebx, %eax
-	RESTORE_EXTRA_REGS
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 	testl	%eax, %eax
-- 
cgit v0.10.2


From a586f98e9767fb0dfdb989002866b4024f00ce08 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:30 -0700
Subject: x86/asm/entry/64: Simplify IRQ stack pt_regs handling

There's no need for both RSI and RDI to point to the original stack.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/3a0481f809dd340c7d3f54ce3fd6d66ef2a578cd.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 65029f4..83eb63d 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -506,8 +506,6 @@ END(irq_entries_start)
 	SAVE_C_REGS
 	SAVE_EXTRA_REGS
 
-	movq	%rsp,%rdi	/* arg1 for \func (pointer to pt_regs) */
-
 	testb	$3, CS(%rsp)
 	jz	1f
 	SWAPGS
@@ -519,14 +517,14 @@ END(irq_entries_start)
 	 * a little cheaper to use a separate counter in the PDA (short of
 	 * moving irq_enter into assembly, which would be too much work)
 	 */
-	movq	%rsp, %rsi
+	movq	%rsp, %rdi
 	incl	PER_CPU_VAR(irq_count)
 	cmovzq	PER_CPU_VAR(irq_stack_ptr), %rsp
-	pushq	%rsi
+	pushq	%rdi
 	/* We entered an interrupt context - irqs are off: */
 	TRACE_IRQS_OFF
 
-	call	\func
+	call	\func	/* rdi points to pt_regs */
 	.endm
 
 	/*
-- 
cgit v0.10.2


From 02bc7768fe447ae305e924b931fa629073a4a1b9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:31 -0700
Subject: x86/asm/entry/64: Migrate error and IRQ exit work to C and remove old
 assembly code

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/60e90901eee611e59e958bfdbbe39969b4f88fe5.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 83eb63d..168ee26 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -508,7 +508,16 @@ END(irq_entries_start)
 
 	testb	$3, CS(%rsp)
 	jz	1f
+
+	/*
+	 * IRQ from user mode.  Switch to kernel gsbase and inform context
+	 * tracking that we're in kernel mode.
+	 */
 	SWAPGS
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
 1:
 	/*
 	 * Save previous stack pointer, optionally switch to interrupt stack.
@@ -547,26 +556,13 @@ ret_from_intr:
 
 	testb	$3, CS(%rsp)
 	jz	retint_kernel
-	/* Interrupt came from user space */
-GLOBAL(retint_user)
-	GET_THREAD_INFO(%rcx)
 
-	/* %rcx: thread info. Interrupts are off. */
-retint_with_reschedule:
-	movl	$_TIF_WORK_MASK, %edi
-retint_check:
+	/* Interrupt came from user space */
 	LOCKDEP_SYS_EXIT_IRQ
-	movl	TI_flags(%rcx), %edx
-	andl	%edi, %edx
-	jnz	retint_careful
-
-retint_swapgs:					/* return to user-space */
-	/*
-	 * The iretq could re-enable interrupts:
-	 */
-	DISABLE_INTERRUPTS(CLBR_ANY)
+GLOBAL(retint_user)
+	mov	%rsp,%rdi
+	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
-
 	SWAPGS
 	jmp	restore_regs_and_iret
 
@@ -644,35 +640,6 @@ native_irq_return_ldt:
 	popq	%rax
 	jmp	native_irq_return_iret
 #endif
-
-	/* edi: workmask, edx: work */
-retint_careful:
-	bt	$TIF_NEED_RESCHED, %edx
-	jnc	retint_signal
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	pushq	%rdi
-	SCHEDULE_USER
-	popq	%rdi
-	GET_THREAD_INFO(%rcx)
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	jmp	retint_check
-
-retint_signal:
-	testl	$_TIF_DO_NOTIFY_MASK, %edx
-	jz	retint_swapgs
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	movq	$-1, ORIG_RAX(%rsp)
-	xorl	%esi, %esi			/* oldset */
-	movq	%rsp, %rdi			/* &pt_regs */
-	call	do_notify_resume
-	DISABLE_INTERRUPTS(CLBR_NONE)
-	TRACE_IRQS_OFF
-	GET_THREAD_INFO(%rcx)
-	jmp	retint_with_reschedule
-
 END(common_interrupt)
 
 /*
@@ -1088,7 +1055,12 @@ ENTRY(error_entry)
 	SWAPGS
 
 .Lerror_entry_from_usermode_after_swapgs:
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
 .Lerror_entry_done:
+
 	TRACE_IRQS_OFF
 	ret
 
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index d9bbd31..25aca51 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -458,6 +458,11 @@ ia32_badarg:
 	DISABLE_INTERRUPTS(CLBR_NONE)
 	TRACE_IRQS_OFF
 
+	/* Now finish entering normal kernel mode. */
+#ifdef CONFIG_CONTEXT_TRACKING
+	call enter_from_user_mode
+#endif
+
 	/* And exit again. */
 	jmp retint_user
 
-- 
cgit v0.10.2


From 8c84014f3bbb112d07e73f30a10ac8a3a72f8649 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:32 -0700
Subject: x86/entry: Remove exception_enter() from most trap handlers

On 64-bit kernels, we don't need it any more: we handle context
tracking directly on entry from user mode and exit to user mode.

On 32-bit kernels, we don't support context tracking at all, so
these callbacks had no effect.

Note: this doesn't change do_page_fault().  Before we do that,
we need to make sure that there is no code that can page fault
from kernel mode with CONTEXT_USER.  The 32-bit fast system call
stack argument code is the only offender I'm aware of right now.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/ae22f4dfebd799c916574089964592be218151f9.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index c5380be..c3496619 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -112,8 +112,8 @@ asmlinkage void smp_threshold_interrupt(void);
 asmlinkage void smp_deferred_error_interrupt(void);
 #endif
 
-extern enum ctx_state ist_enter(struct pt_regs *regs);
-extern void ist_exit(struct pt_regs *regs, enum ctx_state prev_state);
+extern void ist_enter(struct pt_regs *regs);
+extern void ist_exit(struct pt_regs *regs);
 extern void ist_begin_non_atomic(struct pt_regs *regs);
 extern void ist_end_non_atomic(void);
 
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 96ccecc..99940d1 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -1029,7 +1029,6 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 {
 	struct mca_config *cfg = &mca_cfg;
 	struct mce m, *final;
-	enum ctx_state prev_state;
 	int i;
 	int worst = 0;
 	int severity;
@@ -1055,7 +1054,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
 	int flags = MF_ACTION_REQUIRED;
 	int lmce = 0;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	this_cpu_inc(mce_exception_count);
 
@@ -1227,7 +1226,7 @@ out:
 	local_irq_disable();
 	ist_end_non_atomic();
 done:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 EXPORT_SYMBOL_GPL(do_machine_check);
 
diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c
index 737b0ad..12402e1 100644
--- a/arch/x86/kernel/cpu/mcheck/p5.c
+++ b/arch/x86/kernel/cpu/mcheck/p5.c
@@ -19,10 +19,9 @@ int mce_p5_enabled __read_mostly;
 /* Machine check handler for Pentium class Intel CPUs: */
 static void pentium_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	u32 loaddr, hi, lotype;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	rdmsr(MSR_IA32_P5_MC_ADDR, loaddr, hi);
 	rdmsr(MSR_IA32_P5_MC_TYPE, lotype, hi);
@@ -39,7 +38,7 @@ static void pentium_machine_check(struct pt_regs *regs, long error_code)
 
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 
 /* Set up machine check reporting for processors with Intel style MCE: */
diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c
index 44f1382..01dd870 100644
--- a/arch/x86/kernel/cpu/mcheck/winchip.c
+++ b/arch/x86/kernel/cpu/mcheck/winchip.c
@@ -15,12 +15,12 @@
 /* Machine check handler for WinChip C6: */
 static void winchip_machine_check(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	printk(KERN_EMERG "CPU0: Machine Check Exception.\n");
 	add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE);
 
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 
 /* Set up machine check reporting on the Winchip C6 series */
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 2a783c4..8e65d8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -108,13 +108,10 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 	preempt_count_dec();
 }
 
-enum ctx_state ist_enter(struct pt_regs *regs)
+void ist_enter(struct pt_regs *regs)
 {
-	enum ctx_state prev_state;
-
 	if (user_mode(regs)) {
-		/* Other than that, we're just an exception. */
-		prev_state = exception_enter();
+		CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -123,32 +120,25 @@ enum ctx_state ist_enter(struct pt_regs *regs)
 		 * but we need to notify RCU.
 		 */
 		rcu_nmi_enter();
-		prev_state = CONTEXT_KERNEL;  /* the value is irrelevant. */
 	}
 
 	/*
-	 * We are atomic because we're on the IST stack (or we're on x86_32,
-	 * in which case we still shouldn't schedule).
-	 *
-	 * This must be after exception_enter(), because exception_enter()
-	 * won't do anything if in_interrupt() returns true.
+	 * We are atomic because we're on the IST stack; or we're on
+	 * x86_32, in which case we still shouldn't schedule; or we're
+	 * on x86_64 and entered from user mode, in which case we're
+	 * still atomic unless ist_begin_non_atomic is called.
 	 */
 	preempt_count_add(HARDIRQ_OFFSET);
 
 	/* This code is a bit fragile.  Test it. */
 	rcu_lockdep_assert(rcu_is_watching(), "ist_enter didn't work");
-
-	return prev_state;
 }
 
-void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
+void ist_exit(struct pt_regs *regs)
 {
-	/* Must be before exception_exit. */
 	preempt_count_sub(HARDIRQ_OFFSET);
 
-	if (user_mode(regs))
-		return exception_exit(prev_state);
-	else
+	if (!user_mode(regs))
 		rcu_nmi_exit();
 }
 
@@ -162,7 +152,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
  * a double fault, it can be safe to schedule.  ist_begin_non_atomic()
  * begins a non-atomic section within an ist_enter()/ist_exit() region.
  * Callers are responsible for enabling interrupts themselves inside
- * the non-atomic section, and callers must call is_end_non_atomic()
+ * the non-atomic section, and callers must call ist_end_non_atomic()
  * before ist_exit().
  */
 void ist_begin_non_atomic(struct pt_regs *regs)
@@ -289,7 +279,6 @@ NOKPROBE_SYMBOL(do_trap);
 static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 			  unsigned long trapnr, int signr)
 {
-	enum ctx_state prev_state = exception_enter();
 	siginfo_t info;
 
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
@@ -300,8 +289,6 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 		do_trap(trapnr, signr, str, regs, error_code,
 			fill_trap_info(regs, signr, trapnr, &info));
 	}
-
-	exception_exit(prev_state);
 }
 
 #define DO_ERROR(trapnr, signr, str, name)				\
@@ -353,7 +340,7 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 	}
 #endif
 
-	ist_enter(regs);  /* Discard prev_state because we won't return. */
+	ist_enter(regs);
 	notify_die(DIE_TRAP, str, regs, error_code, X86_TRAP_DF, SIGSEGV);
 
 	tsk->thread.error_code = error_code;
@@ -373,15 +360,13 @@ dotraplinkage void do_double_fault(struct pt_regs *regs, long error_code)
 
 dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
 	const struct bndcsr *bndcsr;
 	siginfo_t *info;
 
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
-		goto exit;
+		return;
 	conditional_sti(regs);
 
 	if (!user_mode(regs))
@@ -438,9 +423,8 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 		die("bounds", regs, error_code);
 	}
 
-exit:
-	exception_exit(prev_state);
 	return;
+
 exit_trap:
 	/*
 	 * This path out is for all the cases where we could not
@@ -450,36 +434,33 @@ exit_trap:
 	 * time..
 	 */
 	do_trap(X86_TRAP_BR, SIGSEGV, "bounds", regs, error_code, NULL);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
 do_general_protection(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk;
-	enum ctx_state prev_state;
 
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	conditional_sti(regs);
 
 	if (v8086_mode(regs)) {
 		local_irq_enable();
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
-		goto exit;
+		return;
 	}
 
 	tsk = current;
 	if (!user_mode(regs)) {
 		if (fixup_exception(regs))
-			goto exit;
+			return;
 
 		tsk->thread.error_code = error_code;
 		tsk->thread.trap_nr = X86_TRAP_GP;
 		if (notify_die(DIE_GPF, "general protection fault", regs, error_code,
 			       X86_TRAP_GP, SIGSEGV) != NOTIFY_STOP)
 			die("general protection fault", regs, error_code);
-		goto exit;
+		return;
 	}
 
 	tsk->thread.error_code = error_code;
@@ -495,16 +476,12 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	}
 
 	force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
-exit:
-	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_general_protection);
 
 /* May run on IST stack. */
 dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 	/*
 	 * ftrace must be first, everything else may cause a recursive crash.
@@ -517,7 +494,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	if (poke_int3_handler(regs))
 		return;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
@@ -544,7 +521,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 	preempt_conditional_cli(regs);
 	debug_stack_usage_dec();
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_int3);
 
@@ -620,12 +597,11 @@ NOKPROBE_SYMBOL(fixup_bad_iret);
 dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk = current;
-	enum ctx_state prev_state;
 	int user_icebp = 0;
 	unsigned long dr6;
 	int si_code;
 
-	prev_state = ist_enter(regs);
+	ist_enter(regs);
 
 	get_debugreg(dr6, 6);
 
@@ -700,7 +676,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
 	debug_stack_usage_dec();
 
 exit:
-	ist_exit(regs, prev_state);
+	ist_exit(regs);
 }
 NOKPROBE_SYMBOL(do_debug);
 
@@ -752,23 +728,15 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_MF);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	math_error(regs, error_code, X86_TRAP_XF);
-	exception_exit(prev_state);
 }
 
 dotraplinkage void
@@ -780,9 +748,6 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-	enum ctx_state prev_state;
-
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	BUG_ON(use_eager_fpu());
 
@@ -794,7 +759,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 
 		info.regs = regs;
 		math_emulate(&info);
-		exception_exit(prev_state);
 		return;
 	}
 #endif
@@ -802,7 +766,6 @@ do_device_not_available(struct pt_regs *regs, long error_code)
 #ifdef CONFIG_X86_32
 	conditional_sti(regs);
 #endif
-	exception_exit(prev_state);
 }
 NOKPROBE_SYMBOL(do_device_not_available);
 
@@ -810,9 +773,7 @@ NOKPROBE_SYMBOL(do_device_not_available);
 dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
 	siginfo_t info;
-	enum ctx_state prev_state;
 
-	prev_state = exception_enter();
 	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
 	local_irq_enable();
 
@@ -825,7 +786,6 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 		do_trap(X86_TRAP_IRET, SIGILL, "iret exception", regs, error_code,
 			&info);
 	}
-	exception_exit(prev_state);
 }
 #endif
 
-- 
cgit v0.10.2


From 06a7b36c7bd932e60997bedbae32b3d8e6722281 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:33 -0700
Subject: x86/entry: Remove SCHEDULE_USER and asm/context-tracking.h

SCHEDULE_USER is no longer used, and asm/context-tracking.h
contained nothing else.  Remove the header entirely.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/854e9b45f69af20e26c47099eb236321563ebcee.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 168ee26..041a37a 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -33,7 +33,6 @@
 #include <asm/paravirt.h>
 #include <asm/percpu.h>
 #include <asm/asm.h>
-#include <asm/context_tracking.h>
 #include <asm/smap.h>
 #include <asm/pgtable_types.h>
 #include <linux/err.h>
diff --git a/arch/x86/include/asm/context_tracking.h b/arch/x86/include/asm/context_tracking.h
deleted file mode 100644
index 1fe4970..0000000
--- a/arch/x86/include/asm/context_tracking.h
+++ /dev/null
@@ -1,10 +0,0 @@
-#ifndef _ASM_X86_CONTEXT_TRACKING_H
-#define _ASM_X86_CONTEXT_TRACKING_H
-
-#ifdef CONFIG_CONTEXT_TRACKING
-# define SCHEDULE_USER call schedule_user
-#else
-# define SCHEDULE_USER call schedule
-#endif
-
-#endif
-- 
cgit v0.10.2


From 0333a209cbf600e980fc55c24878a56f25f48b65 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 3 Jul 2015 12:44:34 -0700
Subject: x86/irq, context_tracking: Document how IRQ context tracking works
 and add an RCU assertion

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: paulmck@linux.vnet.ibm.com
Link: http://lkml.kernel.org/r/e8bdc4ed0193fb2fd130f3d6b7b8023e2ec1ab62.1435952415.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 88b36648..6233de0 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -216,8 +216,23 @@ __visible unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
 	unsigned vector = ~regs->orig_ax;
 	unsigned irq;
 
+	/*
+	 * NB: Unlike exception entries, IRQ entries do not reliably
+	 * handle context tracking in the low-level entry code.  This is
+	 * because syscall entries execute briefly with IRQs on before
+	 * updating context tracking state, so we can take an IRQ from
+	 * kernel mode with CONTEXT_USER.  The low-level entry code only
+	 * updates the context if we came from user mode, so we won't
+	 * switch to CONTEXT_KERNEL.  We'll fix that once the syscall
+	 * code is cleaned up enough that we can cleanly defer enabling
+	 * IRQs.
+	 */
+
 	entering_irq();
 
+	/* entering_irq() tells RCU that we're not quiescent.  Check it. */
+	rcu_lockdep_assert(rcu_is_watching(), "IRQ failed to wake up RCU");
+
 	irq = __this_cpu_read(vector_irq[vector]);
 
 	if (!handle_irq(irq, regs)) {
-- 
cgit v0.10.2


From 8f7f06b87acd2e017d6c536f59e10045dd8d0578 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 7 Jul 2015 10:55:28 -0700
Subject: x86/entry/64: Fix IRQ state confusion and related warning on compat
 syscalls with CONFIG_AUDITSYSCALL=n

int_ret_from_sys_call now expects IRQs to be enabled.  I got
this right in the real sysexit_audit and sysretl_audit asm
paths, but I missed it in the #defined-away versions when
CONFIG_AUDITSYSCALL=n.  This is a straightforward fix for
CONFIG_AUDITSYSCALL=n

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 29ea1b258b98 ("x86/entry/64: Migrate 64-bit and compat syscalls to the new exit handlers and remove old assembly code")
Link: http://lkml.kernel.org/r/25cf0a01e01c6008118dd8f8d9f043020416700c.1436291493.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index 25aca51..d757153 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -22,8 +22,8 @@
 #define __AUDIT_ARCH_LE		0x40000000
 
 #ifndef CONFIG_AUDITSYSCALL
-# define sysexit_audit		ia32_ret_from_sys_call
-# define sysretl_audit		ia32_ret_from_sys_call
+# define sysexit_audit		ia32_ret_from_sys_call_irqs_off
+# define sysretl_audit		ia32_ret_from_sys_call_irqs_off
 #endif
 
 	.section .entry.text, "ax"
@@ -466,6 +466,10 @@ ia32_badarg:
 	/* And exit again. */
 	jmp retint_user
 
+ia32_ret_from_sys_call_irqs_off:
+	TRACE_IRQS_ON
+	ENABLE_INTERRUPTS(CLBR_NONE)
+
 ia32_ret_from_sys_call:
 	xorl	%eax, %eax		/* Do not leak kernel information */
 	movq	%rax, R11(%rsp)
-- 
cgit v0.10.2


From d132803e6c611d50c19baedc8ae520203a2baca7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 15 Jul 2015 14:25:16 -0700
Subject: x86/entry: Fix _TIF_USER_RETURN_NOTIFY check in
 prepare_exit_to_usermode

Linus noticed that the early return check was missing
_TIF_USER_RETURN_NOTIFY.  If the only work flag was
_TIF_USER_RETURN_NOTIFY, we'd skip user return notifiers.  Fix
it. (This is the only missing bit.)

This fixes double faults on a KVM host.  It's the same issue as
last time, except that this time it's very easy to trigger.
Apparently no one uses -next as a KVM host.

( I'm still not quite sure what it is that KVM does that blows up
  so badly if we miss a user return notifier.  My best guess is that KVM
  lets KERNEL_GS_BASE (i.e. the user's gs base) be negative and fixes
  it up in a user return notifier.  If we actually end up in user mode
  with a negative gs base, we blow up pretty badly. )

Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: c5c46f59e4e7 ("x86/entry: Add new, comprehensible entry and exit handlers written in C")
Link: http://lkml.kernel.org/r/3f801104d24ee7a6bb1446408d9950777aa63277.1436995419.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index febc530..a3e9c7f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -264,7 +264,8 @@ __visible void prepare_exit_to_usermode(struct pt_regs *regs)
 			READ_ONCE(pt_regs_to_thread_info(regs)->flags);
 
 		if (!(cached_flags & (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |
-				      _TIF_UPROBE | _TIF_NEED_RESCHED)))
+				      _TIF_UPROBE | _TIF_NEED_RESCHED |
+				      _TIF_USER_RETURN_NOTIFY)))
 			break;
 
 		/* We have work to do. */
-- 
cgit v0.10.2


From bf9f2ee28d475ada0005c59382852cb70f1419ac Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 20 Jul 2015 11:52:23 -0700
Subject: x86/nmi: Remove the 'b2b' parameter from nmi_handle()

It has never had any effect. Remove it for comprehensibility.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c91fa38507760d9e54a4b8737fa6409bde896b33.1437418322.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c
index c3e985d..f76d650 100644
--- a/arch/x86/kernel/nmi.c
+++ b/arch/x86/kernel/nmi.c
@@ -110,7 +110,7 @@ static void nmi_max_handler(struct irq_work *w)
 		a->handler, whole_msecs, decimal_msecs);
 }
 
-static int nmi_handle(unsigned int type, struct pt_regs *regs, bool b2b)
+static int nmi_handle(unsigned int type, struct pt_regs *regs)
 {
 	struct nmi_desc *desc = nmi_to_desc(type);
 	struct nmiaction *a;
@@ -213,7 +213,7 @@ static void
 pci_serr_error(unsigned char reason, struct pt_regs *regs)
 {
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_SERR, regs, false))
+	if (nmi_handle(NMI_SERR, regs))
 		return;
 
 	pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
@@ -247,7 +247,7 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
 	unsigned long i;
 
 	/* check to see if anyone registered against these types of errors */
-	if (nmi_handle(NMI_IO_CHECK, regs, false))
+	if (nmi_handle(NMI_IO_CHECK, regs))
 		return;
 
 	pr_emerg(
@@ -284,7 +284,7 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
 	 * as only the first one is ever run (unless it can actually determine
 	 * if it caused the NMI)
 	 */
-	handled = nmi_handle(NMI_UNKNOWN, regs, false);
+	handled = nmi_handle(NMI_UNKNOWN, regs);
 	if (handled) {
 		__this_cpu_add(nmi_stats.unknown, handled);
 		return;
@@ -332,7 +332,7 @@ static void default_do_nmi(struct pt_regs *regs)
 
 	__this_cpu_write(last_nmi_rip, regs->ip);
 
-	handled = nmi_handle(NMI_LOCAL, regs, b2b);
+	handled = nmi_handle(NMI_LOCAL, regs);
 	__this_cpu_add(nmi_stats.normal, handled);
 	if (handled) {
 		/*
-- 
cgit v0.10.2


From 0233606ce5cf12c1a0e27cb197066ea5bc2bb488 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 19 Jul 2015 21:09:04 -0400
Subject: x86/entry/vm86: Clean up saved_fs/gs

There is no need to save FS and non-lazy GS outside the 32-bit
regs.  Lazy GS still needs to be saved because it wasn't saved
on syscall entry.  Save it in the gs slot of regs32, which is
present but unused.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437354550-25858-2-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 43e6519..f4e4e3f 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -410,8 +410,6 @@ struct thread_struct {
 	unsigned long		v86flags;
 	unsigned long		v86mask;
 	unsigned long		saved_sp0;
-	unsigned int		saved_fs;
-	unsigned int		saved_gs;
 #endif
 	/* IO permissions: */
 	unsigned long		*io_bitmap_ptr;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index fc9db6e..761a2f9 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -159,8 +159,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 
 	ret = KVM86->regs32;
 
-	ret->fs = current->thread.saved_fs;
-	set_user_gs(ret, current->thread.saved_gs);
+	lazy_load_gs(ret->gs);
 
 	return ret;
 }
@@ -315,8 +314,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
  */
 	info->regs32->ax = VM86_SIGNAL;
 	tsk->thread.saved_sp0 = tsk->thread.sp0;
-	tsk->thread.saved_fs = info->regs32->fs;
-	tsk->thread.saved_gs = get_user_gs(info->regs32);
+	lazy_save_gs(info->regs32->gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
 	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
-- 
cgit v0.10.2


From df1ae9a5dc66d9fd57109240042372b1065d984a Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 19 Jul 2015 21:09:05 -0400
Subject: x86/entry/vm86: Preserve 'orig_ax'

There is no legitimate reason for usermode to modify the 'orig_ax'
field on entry to vm86 mode, so copy it from the 32-bit regs.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437354550-25858-3-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 761a2f9..9a2dc80 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -294,6 +294,8 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 	info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
 	info->regs.pt.flags |= X86_VM_MASK;
 
+	info->regs.pt.orig_ax = info->regs32->orig_ax;
+
 	switch (info->cpu_type) {
 	case CPU_286:
 		tsk->thread.v86mask = 0;
-- 
cgit v0.10.2


From ed0b2edb61ba4e557de759093d965654186f28b2 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Sun, 19 Jul 2015 21:09:06 -0400
Subject: x86/entry/vm86: Move userspace accesses to do_sys_vm86()

Move the userspace accesses down into the common function in
preparation for the next set of patches.  Also change to copying
the fields explicitly instead of assuming a fixed order in
pt_regs and the kernel data structures.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1437354550-25858-4-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index f4e4e3f..35ad554 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -405,7 +405,7 @@ struct thread_struct {
 	unsigned long		error_code;
 #ifdef CONFIG_X86_32
 	/* Virtual 86 mode info */
-	struct vm86_struct __user *vm86_info;
+	struct vm86plus_struct __user *vm86_info;
 	unsigned long		screen_bitmap;
 	unsigned long		v86flags;
 	unsigned long		v86mask;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9a2dc80..e6c2b47 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -90,46 +90,13 @@
 #define SAFE_MASK	(0xDD5)
 #define RETURN_MASK	(0xDFF)
 
-/* convert kernel_vm86_regs to vm86_regs */
-static int copy_vm86_regs_to_user(struct vm86_regs __user *user,
-				  const struct kernel_vm86_regs *regs)
-{
-	int ret = 0;
-
-	/*
-	 * kernel_vm86_regs is missing gs, so copy everything up to
-	 * (but not including) orig_eax, and then rest including orig_eax.
-	 */
-	ret += copy_to_user(user, regs, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	ret += copy_to_user(&user->orig_eax, &regs->pt.orig_ax,
-			    sizeof(struct kernel_vm86_regs) -
-			    offsetof(struct kernel_vm86_regs, pt.orig_ax));
-
-	return ret;
-}
-
-/* convert vm86_regs to kernel_vm86_regs */
-static int copy_vm86_regs_from_user(struct kernel_vm86_regs *regs,
-				    const struct vm86_regs __user *user,
-				    unsigned extra)
-{
-	int ret = 0;
-
-	/* copy ax-fs inclusive */
-	ret += copy_from_user(regs, user, offsetof(struct kernel_vm86_regs, pt.orig_ax));
-	/* copy orig_ax-__gsh+extra */
-	ret += copy_from_user(&regs->pt.orig_ax, &user->orig_eax,
-			      sizeof(struct kernel_vm86_regs) -
-			      offsetof(struct kernel_vm86_regs, pt.orig_ax) +
-			      extra);
-	return ret;
-}
-
 struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 {
 	struct tss_struct *tss;
 	struct pt_regs *ret;
-	unsigned long tmp;
+	struct task_struct *tsk = current;
+	struct vm86plus_struct __user *user;
+	long err = 0;
 
 	/*
 	 * This gets called from entry.S with interrupts disabled, but
@@ -138,23 +105,50 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	 */
 	local_irq_enable();
 
-	if (!current->thread.vm86_info) {
+	if (!tsk->thread.vm86_info) {
 		pr_alert("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | current->thread.v86mask);
-	tmp = copy_vm86_regs_to_user(&current->thread.vm86_info->regs, regs);
-	tmp += put_user(current->thread.screen_bitmap, &current->thread.vm86_info->screen_bitmap);
-	if (tmp) {
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | tsk->thread.v86mask);
+	user = tsk->thread.vm86_info;
+
+	if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ?
+		       sizeof(struct vm86plus_struct) :
+		       sizeof(struct vm86_struct))) {
+		pr_alert("could not access userspace vm86_info\n");
+		do_exit(SIGSEGV);
+	}
+
+	put_user_try {
+		put_user_ex(regs->pt.bx, &user->regs.ebx);
+		put_user_ex(regs->pt.cx, &user->regs.ecx);
+		put_user_ex(regs->pt.dx, &user->regs.edx);
+		put_user_ex(regs->pt.si, &user->regs.esi);
+		put_user_ex(regs->pt.di, &user->regs.edi);
+		put_user_ex(regs->pt.bp, &user->regs.ebp);
+		put_user_ex(regs->pt.ax, &user->regs.eax);
+		put_user_ex(regs->pt.ip, &user->regs.eip);
+		put_user_ex(regs->pt.cs, &user->regs.cs);
+		put_user_ex(regs->pt.flags, &user->regs.eflags);
+		put_user_ex(regs->pt.sp, &user->regs.esp);
+		put_user_ex(regs->pt.ss, &user->regs.ss);
+		put_user_ex(regs->es, &user->regs.es);
+		put_user_ex(regs->ds, &user->regs.ds);
+		put_user_ex(regs->fs, &user->regs.fs);
+		put_user_ex(regs->gs, &user->regs.gs);
+
+		put_user_ex(tsk->thread.screen_bitmap, &user->screen_bitmap);
+	} put_user_catch(err);
+	if (err) {
 		pr_alert("could not access userspace vm86_info\n");
 		do_exit(SIGSEGV);
 	}
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	current->thread.sp0 = current->thread.saved_sp0;
-	current->thread.sysenter_cs = __KERNEL_CS;
-	load_sp0(tss, &current->thread);
-	current->thread.saved_sp0 = 0;
+	tsk->thread.sp0 = tsk->thread.saved_sp0;
+	tsk->thread.sysenter_cs = __KERNEL_CS;
+	load_sp0(tss, &tsk->thread);
+	tsk->thread.saved_sp0 = 0;
 	put_cpu();
 
 	ret = KVM86->regs32;
@@ -199,7 +193,8 @@ out:
 
 
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
+static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
+			struct kernel_vm86_struct *info);
 
 SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
 {
@@ -208,21 +203,8 @@ SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
 					 * This remains on the stack until we
 					 * return to 32 bit user space.
 					 */
-	struct task_struct *tsk = current;
-	int tmp;
 
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, vm86plus) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	memset(&info.vm86plus, 0, (int)&info.regs32 - (int)&info.vm86plus);
-	info.regs32 = current_pt_regs();
-	tsk->thread.vm86_info = v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) v86, false, &info);
 }
 
 
@@ -233,11 +215,7 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 					 * This remains on the stack until we
 					 * return to 32 bit user space.
 					 */
-	struct task_struct *tsk;
-	int tmp;
-	struct vm86plus_struct __user *v86;
 
-	tsk = current;
 	switch (cmd) {
 	case VM86_REQUEST_IRQ:
 	case VM86_FREE_IRQ:
@@ -255,34 +233,69 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 	}
 
 	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
-	if (tsk->thread.saved_sp0)
-		return -EPERM;
-	v86 = (struct vm86plus_struct __user *)arg;
-	tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
-				       offsetof(struct kernel_vm86_struct, regs32) -
-				       sizeof(info.regs));
-	if (tmp)
-		return -EFAULT;
-	info.regs32 = current_pt_regs();
-	info.vm86plus.is_vm86pus = 1;
-	tsk->thread.vm86_info = (struct vm86_struct __user *)v86;
-	do_sys_vm86(&info, tsk);
-	return 0;	/* we never return here */
+	return do_sys_vm86((struct vm86plus_struct __user *) arg, true, &info);
 }
 
 
-static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk)
+static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
+			struct kernel_vm86_struct *info)
 {
 	struct tss_struct *tss;
-/*
- * make sure the vm86() system call doesn't try to do anything silly
- */
-	info->regs.pt.ds = 0;
-	info->regs.pt.es = 0;
-	info->regs.pt.fs = 0;
-#ifndef CONFIG_X86_32_LAZY_GS
-	info->regs.pt.gs = 0;
-#endif
+	struct task_struct *tsk = current;
+	unsigned long err = 0;
+
+	if (tsk->thread.saved_sp0)
+		return -EPERM;
+
+	if (!access_ok(VERIFY_READ, v86, plus ?
+		       sizeof(struct vm86_struct) :
+		       sizeof(struct vm86plus_struct)))
+		return -EFAULT;
+
+	memset(info, 0, sizeof(*info));
+	get_user_try {
+		unsigned short seg;
+		get_user_ex(info->regs.pt.bx, &v86->regs.ebx);
+		get_user_ex(info->regs.pt.cx, &v86->regs.ecx);
+		get_user_ex(info->regs.pt.dx, &v86->regs.edx);
+		get_user_ex(info->regs.pt.si, &v86->regs.esi);
+		get_user_ex(info->regs.pt.di, &v86->regs.edi);
+		get_user_ex(info->regs.pt.bp, &v86->regs.ebp);
+		get_user_ex(info->regs.pt.ax, &v86->regs.eax);
+		get_user_ex(info->regs.pt.ip, &v86->regs.eip);
+		get_user_ex(seg, &v86->regs.cs);
+		info->regs.pt.cs = seg;
+		get_user_ex(info->regs.pt.flags, &v86->regs.eflags);
+		get_user_ex(info->regs.pt.sp, &v86->regs.esp);
+		get_user_ex(seg, &v86->regs.ss);
+		info->regs.pt.ss = seg;
+		get_user_ex(info->regs.es, &v86->regs.es);
+		get_user_ex(info->regs.ds, &v86->regs.ds);
+		get_user_ex(info->regs.fs, &v86->regs.fs);
+		get_user_ex(info->regs.gs, &v86->regs.gs);
+
+		get_user_ex(info->flags, &v86->flags);
+		get_user_ex(info->screen_bitmap, &v86->screen_bitmap);
+		get_user_ex(info->cpu_type, &v86->cpu_type);
+	} get_user_catch(err);
+	if (err)
+		return err;
+
+	if (copy_from_user(&info->int_revectored, &v86->int_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (copy_from_user(&info->int21_revectored, &v86->int21_revectored,
+			   sizeof(struct revectored_struct)))
+		return -EFAULT;
+	if (plus) {
+		if (copy_from_user(&info->vm86plus, &v86->vm86plus,
+				   sizeof(struct vm86plus_info_struct)))
+			return -EFAULT;
+		info->vm86plus.is_vm86pus = 1;
+	}
+
+	info->regs32 = current_pt_regs();
+	tsk->thread.vm86_info = v86;
 
 /*
  * The flags register is also special: we cannot trust that the user
@@ -344,7 +357,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
 		"jmp resume_userspace"
 		: /* no outputs */
 		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
-	/* we never return here */
+	unreachable();	/* we never return here */
 }
 
 static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-- 
cgit v0.10.2


From 9dea5dc921b5f4045a18c63eb92e84dc274d17eb Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 14 Jul 2015 15:24:24 -0700
Subject: x86/entry/syscalls: Wire up 32-bit direct socket calls

On x86_64, there's no socketcall syscall; instead all of the
socket calls are real syscalls.  For 32-bit programs, we're
stuck offering the socketcall syscall, but it would be nice to
expose the direct calls as well.  This will enable seccomp to
filter socket calls (for new userspace only, but that's fine for
some applications) and it will provide a tiny performance boost.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexander Larsson <alexl@redhat.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Cosimo Cecchi <cosimo@endlessm.com>
Cc: Dan Nicholson <nicholson@endlessm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Cc: libc-alpha <libc-alpha@sourceware.org>
Link: http://lkml.kernel.org/r/cb5138299d37d5800e2d135b01a7667fa6115854.1436912629.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl
index ef8187f..25e3cf1 100644
--- a/arch/x86/entry/syscalls/syscall_32.tbl
+++ b/arch/x86/entry/syscalls/syscall_32.tbl
@@ -365,3 +365,18 @@
 356	i386	memfd_create		sys_memfd_create
 357	i386	bpf			sys_bpf
 358	i386	execveat		sys_execveat			stub32_execveat
+359	i386	socket			sys_socket
+360	i386	socketpair		sys_socketpair
+361	i386	bind			sys_bind
+362	i386	connect			sys_connect
+363	i386	listen			sys_listen
+364	i386	accept4			sys_accept4
+365	i386	getsockopt		sys_getsockopt			compat_sys_getsockopt
+366	i386	setsockopt		sys_setsockopt			compat_sys_setsockopt
+367	i386	getsockname		sys_getsockname
+368	i386	getpeername		sys_getpeername
+369	i386	sendto			sys_sendto
+370	i386	sendmsg			sys_sendmsg			compat_sys_sendmsg
+371	i386	recvfrom		sys_recvfrom			compat_sys_recvfrom
+372	i386	recvmsg			sys_recvmsg			compat_sys_recvmsg
+373	i386	shutdown		sys_shutdown
-- 
cgit v0.10.2


From 3490565b633c705d2fb1f6ede51228952664663d Mon Sep 17 00:00:00 2001
From: Denys Vlasenko <dvlasenk@redhat.com>
Date: Mon, 13 Jul 2015 20:31:03 +0200
Subject: locking/spinlocks: Force inlining of spinlock ops

With both gcc 4.7.2 and 4.9.2, sometimes GCC mysteriously
doesn't inline very small functions we expect to be inlined.
See:

    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=66122

In particular, with this config:

   http://busybox.net/~vda/kernel_config

there are more than a thousand copies of tiny spinlock-related
functions:

  $ nm --size-sort vmlinux | grep -iF ' t ' | uniq -c | grep -v '^ *1 ' | sort -rn | grep ' spin'
    473 000000000000000b t spin_unlock_irqrestore
    292 000000000000000b t spin_unlock
    215 000000000000000b t spin_lock
    134 000000000000000b t spin_unlock_irq
    130 000000000000000b t spin_unlock_bh
    120 000000000000000b t spin_lock_irq
    106 000000000000000b t spin_lock_bh

Disassembly:

 ffffffff81004720 <spin_lock>:
 ffffffff81004720:       55                      push   %rbp
 ffffffff81004721:       48 89 e5                mov    %rsp,%rbp
 ffffffff81004724:       e8 f8 4e e2 02          callq <_raw_spin_lock>
 ffffffff81004729:       5d                      pop    %rbp
 ffffffff8100472a:       c3                      retq

This patch fixes this via s/inline/__always_inline/ in
spinlock.h. This decreases vmlinux by about 40k:

      text     data      bss       dec     hex filename
  82375570 22255544 20627456 125258570 7774b4a vmlinux.before
  82335059 22255416 20627456 125217931 776ac8b vmlinux

Signed-off-by: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Bart Van Assche <bvanassche@acm.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thomas Graf <tgraf@suug.ch>
Link: http://lkml.kernel.org/r/1436812263-15243-1-git-send-email-dvlasenk@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index 0063b24..ffcd053 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -296,7 +296,7 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock)
  * Map the spin_lock functions to the raw variants for PREEMPT_RT=n
  */
 
-static inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
+static __always_inline raw_spinlock_t *spinlock_check(spinlock_t *lock)
 {
 	return &lock->rlock;
 }
@@ -307,17 +307,17 @@ do {							\
 	raw_spin_lock_init(&(_lock)->rlock);		\
 } while (0)
 
-static inline void spin_lock(spinlock_t *lock)
+static __always_inline void spin_lock(spinlock_t *lock)
 {
 	raw_spin_lock(&lock->rlock);
 }
 
-static inline void spin_lock_bh(spinlock_t *lock)
+static __always_inline void spin_lock_bh(spinlock_t *lock)
 {
 	raw_spin_lock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock(spinlock_t *lock)
+static __always_inline int spin_trylock(spinlock_t *lock)
 {
 	return raw_spin_trylock(&lock->rlock);
 }
@@ -337,7 +337,7 @@ do {									\
 	raw_spin_lock_nest_lock(spinlock_check(lock), nest_lock);	\
 } while (0)
 
-static inline void spin_lock_irq(spinlock_t *lock)
+static __always_inline void spin_lock_irq(spinlock_t *lock)
 {
 	raw_spin_lock_irq(&lock->rlock);
 }
@@ -352,32 +352,32 @@ do {									\
 	raw_spin_lock_irqsave_nested(spinlock_check(lock), flags, subclass); \
 } while (0)
 
-static inline void spin_unlock(spinlock_t *lock)
+static __always_inline void spin_unlock(spinlock_t *lock)
 {
 	raw_spin_unlock(&lock->rlock);
 }
 
-static inline void spin_unlock_bh(spinlock_t *lock)
+static __always_inline void spin_unlock_bh(spinlock_t *lock)
 {
 	raw_spin_unlock_bh(&lock->rlock);
 }
 
-static inline void spin_unlock_irq(spinlock_t *lock)
+static __always_inline void spin_unlock_irq(spinlock_t *lock)
 {
 	raw_spin_unlock_irq(&lock->rlock);
 }
 
-static inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+static __always_inline void spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
 {
 	raw_spin_unlock_irqrestore(&lock->rlock, flags);
 }
 
-static inline int spin_trylock_bh(spinlock_t *lock)
+static __always_inline int spin_trylock_bh(spinlock_t *lock)
 {
 	return raw_spin_trylock_bh(&lock->rlock);
 }
 
-static inline int spin_trylock_irq(spinlock_t *lock)
+static __always_inline int spin_trylock_irq(spinlock_t *lock)
 {
 	return raw_spin_trylock_irq(&lock->rlock);
 }
@@ -387,22 +387,22 @@ static inline int spin_trylock_irq(spinlock_t *lock)
 	raw_spin_trylock_irqsave(spinlock_check(lock), flags); \
 })
 
-static inline void spin_unlock_wait(spinlock_t *lock)
+static __always_inline void spin_unlock_wait(spinlock_t *lock)
 {
 	raw_spin_unlock_wait(&lock->rlock);
 }
 
-static inline int spin_is_locked(spinlock_t *lock)
+static __always_inline int spin_is_locked(spinlock_t *lock)
 {
 	return raw_spin_is_locked(&lock->rlock);
 }
 
-static inline int spin_is_contended(spinlock_t *lock)
+static __always_inline int spin_is_contended(spinlock_t *lock)
 {
 	return raw_spin_is_contended(&lock->rlock);
 }
 
-static inline int spin_can_lock(spinlock_t *lock)
+static __always_inline int spin_can_lock(spinlock_t *lock)
 {
 	return raw_spin_can_lock(&lock->rlock);
 }
-- 
cgit v0.10.2


From 5aef51c340cb50ed9a3997dc5d782324372078bd Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 10 Jul 2015 08:34:23 -0700
Subject: x86/kconfig/32: Rename CONFIG_VM86 and default it to 'n'

VM86 is entirely broken if ptrace, syscall auditing, or
NOHZ_FULL is in use.  The code is a big undocumented mess, it's
a real PITA to test, and it looks like a big chunk of vm86_32.c
is dead code.  It also plays awful games with the entry asm.

No one should be using it anyway. Use DOSBOX or KVM instead.

Let's accelerate its slow death.  Remove it from EXPERT and
default it to n.  Distros should not enable it.  In the unlikely
event that some user needs it, they can easily re-enable it.

While we're at it, rename it to CONFIG_X86_LEGACY_VM86 so that 'make
oldconfig' users will be prompted again.  I left CONFIG_VM86 as
an alias to avoid a treewide replacement of the names.  We can
clean that up once the current asm and vm86 code churn settles
down.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Austin S Hemmelgarn <ahferroin7@gmail.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d29c6cc442d32d4df58849d2f8c89fb39ff88d61.1436542295.git.luto@kernel.org
[ Refined it some more. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index aa94fd0..2cb2211 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -996,15 +996,36 @@ config X86_THERMAL_VECTOR
 	def_bool y
 	depends on X86_MCE_INTEL
 
-config VM86
-	bool "Enable VM86 support" if EXPERT
-	default y
+config X86_LEGACY_VM86
+	bool "Legacy VM86 support (obsolete)"
+	default n
 	depends on X86_32
 	---help---
-	  This option is required by programs like DOSEMU to run
-	  16-bit real mode legacy code on x86 processors. It also may
-	  be needed by software like XFree86 to initialize some video
-	  cards via BIOS. Disabling this option saves about 6K.
+	  This option allows user programs to put the CPU into V8086
+	  mode, which is an 80286-era approximation of 16-bit real mode.
+
+	  Some very old versions of X and/or vbetool require this option
+	  for user mode setting.  Similarly, DOSEMU will use it if
+	  available to accelerate real mode DOS programs.  However, any
+	  recent version of DOSEMU, X, or vbetool should be fully
+	  functional even without kernel VM86 support, as they will all
+	  fall back to (pretty well performing) software emulation.
+
+	  Anything that works on a 64-bit kernel is unlikely to need
+	  this option, as 64-bit kernels don't, and can't, support V8086
+	  mode.  This option is also unrelated to 16-bit protected mode
+	  and is not needed to run most 16-bit programs under Wine.
+
+	  Enabling this option adds considerable attack surface to the
+	  kernel and slows down system calls and exception handling.
+
+	  Unless you use very old userspace or need the last drop of
+	  performance in your real mode DOS games and can't use KVM,
+	  say N here.
+
+config VM86
+       bool
+       default X86_LEGACY_VM86
 
 config X86_16BIT
 	bool "Enable support for 16-bit segments" if EXPERT
-- 
cgit v0.10.2


From f2a50f8b7da45ff2de93a71393e715a2ab9f3b68 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 9 Jul 2015 19:17:29 -0700
Subject: x86/selftests, x86/vm86: Improve entry_from_vm86 selftest

The entry_from_vm86 selftest was very weak.  Improve it: test
more types of kernel entries from vm86 mode and test them more
carefully.

While we're at it, try to improve behavior on non-SEP CPUs.  The
old code was buggy because I misunderstood the intended
semantics of #UD in vm86, so I didn't handle a possible signal.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Denys Vlasenko <vda.linux@googlemail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d8ef1d7368ac70d8342481563ed50f9a7d2eea6f.1436492057.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c
index 5c38a18..f004b2a 100644
--- a/tools/testing/selftests/x86/entry_from_vm86.c
+++ b/tools/testing/selftests/x86/entry_from_vm86.c
@@ -28,6 +28,55 @@
 static unsigned long load_addr = 0x10000;
 static int nerrs = 0;
 
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static void clearhandler(int sig)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_handler = SIG_DFL;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+}
+
+static sig_atomic_t got_signal;
+
+static void sighandler(int sig, siginfo_t *info, void *ctx_void)
+{
+	ucontext_t *ctx = (ucontext_t*)ctx_void;
+
+	if (ctx->uc_mcontext.gregs[REG_EFL] & X86_EFLAGS_VM ||
+	    (ctx->uc_mcontext.gregs[REG_CS] & 3) != 3) {
+		printf("[FAIL]\tSignal frame should not reflect vm86 mode\n");
+		nerrs++;
+	}
+
+	const char *signame;
+	if (sig == SIGSEGV)
+		signame = "SIGSEGV";
+	else if (sig == SIGILL)
+		signame = "SIGILL";
+	else
+		signame = "unexpected signal";
+
+	printf("[INFO]\t%s: FLAGS = 0x%lx, CS = 0x%hx\n", signame,
+	       (unsigned long)ctx->uc_mcontext.gregs[REG_EFL],
+	       (unsigned short)ctx->uc_mcontext.gregs[REG_CS]);
+
+	got_signal = 1;
+}
+
 asm (
 	".pushsection .rodata\n\t"
 	".type vmcode_bound, @object\n\t"
@@ -38,6 +87,14 @@ asm (
 	"int3\n\t"
 	"vmcode_sysenter:\n\t"
 	"sysenter\n\t"
+	"vmcode_syscall:\n\t"
+	"syscall\n\t"
+	"vmcode_sti:\n\t"
+	"sti\n\t"
+	"vmcode_int3:\n\t"
+	"int3\n\t"
+	"vmcode_int80:\n\t"
+	"int $0x80\n\t"
 	".size vmcode, . - vmcode\n\t"
 	"end_vmcode:\n\t"
 	".code32\n\t"
@@ -45,9 +102,11 @@ asm (
 	);
 
 extern unsigned char vmcode[], end_vmcode[];
-extern unsigned char vmcode_bound[], vmcode_sysenter[];
+extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[],
+	vmcode_sti[], vmcode_int3[], vmcode_int80[];
 
 static void do_test(struct vm86plus_struct *v86, unsigned long eip,
+		    unsigned int rettype, unsigned int retarg,
 		    const char *text)
 {
 	long ret;
@@ -73,13 +132,28 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
 		else
 			sprintf(trapname, "%d", trapno);
 
-		printf("[OK]\tExited vm86 mode due to #%s\n", trapname);
+		printf("[INFO]\tExited vm86 mode due to #%s\n", trapname);
 	} else if (VM86_TYPE(ret) == VM86_UNKNOWN) {
-		printf("[OK]\tExited vm86 mode due to unhandled GP fault\n");
+		printf("[INFO]\tExited vm86 mode due to unhandled GP fault\n");
+	} else if (VM86_TYPE(ret) == VM86_TRAP) {
+		printf("[INFO]\tExited vm86 mode due to a trap (arg=%ld)\n",
+		       VM86_ARG(ret));
+	} else if (VM86_TYPE(ret) == VM86_SIGNAL) {
+		printf("[INFO]\tExited vm86 mode due to a signal\n");
+	} else if (VM86_TYPE(ret) == VM86_STI) {
+		printf("[INFO]\tExited vm86 mode due to STI\n");
 	} else {
-		printf("[OK]\tExited vm86 mode due to type %ld, arg %ld\n",
+		printf("[INFO]\tExited vm86 mode due to type %ld, arg %ld\n",
 		       VM86_TYPE(ret), VM86_ARG(ret));
 	}
+
+	if (rettype == -1 ||
+	    (VM86_TYPE(ret) == rettype && VM86_ARG(ret) == retarg)) {
+		printf("[OK]\tReturned correctly\n");
+	} else {
+		printf("[FAIL]\tIncorrect return reason\n");
+		nerrs++;
+	}
 }
 
 int main(void)
@@ -105,10 +179,52 @@ int main(void)
 	assert((v86.regs.cs & 3) == 0);	/* Looks like RPL = 0 */
 
 	/* #BR -- should deliver SIG??? */
-	do_test(&v86, vmcode_bound - vmcode, "#BR");
-
-	/* SYSENTER -- should cause #GP or #UD depending on CPU */
-	do_test(&v86, vmcode_sysenter - vmcode, "SYSENTER");
+	do_test(&v86, vmcode_bound - vmcode, VM86_INTx, 5, "#BR");
+
+	/*
+	 * SYSENTER -- should cause #GP or #UD depending on CPU.
+	 * Expected return type -1 means that we shouldn't validate
+	 * the vm86 return value.  This will avoid problems on non-SEP
+	 * CPUs.
+	 */
+	sethandler(SIGILL, sighandler, 0);
+	do_test(&v86, vmcode_sysenter - vmcode, -1, 0, "SYSENTER");
+	clearhandler(SIGILL);
+
+	/*
+	 * SYSCALL would be a disaster in VM86 mode.  Fortunately,
+	 * there is no kernel that both enables SYSCALL and sets
+	 * EFER.SCE, so it's #UD on all systems.  But vm86 is
+	 * buggy (or has a "feature"), so the SIGILL will actually
+	 * be delivered.
+	 */
+	sethandler(SIGILL, sighandler, 0);
+	do_test(&v86, vmcode_syscall - vmcode, VM86_SIGNAL, 0, "SYSCALL");
+	clearhandler(SIGILL);
+
+	/* STI with VIP set */
+	v86.regs.eflags |= X86_EFLAGS_VIP;
+	v86.regs.eflags &= ~X86_EFLAGS_IF;
+	do_test(&v86, vmcode_sti - vmcode, VM86_STI, 0, "STI with VIP set");
+
+	/* INT3 -- should cause #BP */
+	do_test(&v86, vmcode_int3 - vmcode, VM86_TRAP, 3, "INT3");
+
+	/* INT80 -- should exit with "INTx 0x80" */
+	v86.regs.eax = (unsigned int)-1;
+	do_test(&v86, vmcode_int80 - vmcode, VM86_INTx, 0x80, "int80");
+
+	/* Execute a null pointer */
+	v86.regs.cs = 0;
+	v86.regs.ss = 0;
+	sethandler(SIGSEGV, sighandler, 0);
+	got_signal = 0;
+	do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer");
+	if (!got_signal) {
+		printf("[FAIL]\tDid not receive SIGSEGV\n");
+		nerrs++;
+	}
+	clearhandler(SIGSEGV);
 
 	return (nerrs == 0 ? 0 : 1);
 }
-- 
cgit v0.10.2


From b2c51106c7581866c37ffc77c5d739f3d4b7cbc9 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 21 Jul 2015 09:27:18 -0700
Subject: x86/build: Fix detection of GCC -mpreferred-stack-boundary support

As per:

  https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383

GCC only allows -mpreferred-stack-boundary=3 on x86_64 if -mno-sse is set.
That means that cc-option will not detect -mpreferred-stack-boundary=3
support, because we test for it before setting -mno-sse.

Fix it by reordering the Makefile bits.

Compile-tested only.  This should help avoid code generation
issues such as the one that was worked around in:

  b96fecbfa8c8 ("x86/fpu: Fix boot crash in the early FPU code")

I'm a bit concerned that we could still have problems on older
GCC versions given that our asm code does not respect GCC's idea
of the ABI-required stack alignment.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/f5297c192969adfa0d28b84cf8a22d59573db26d.1436126872.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 118e6de..054ff96 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -39,6 +39,16 @@ ifdef CONFIG_X86_NEED_RELOCS
         LDFLAGS_vmlinux := --emit-relocs
 endif
 
+#
+# Prevent GCC from generating any FP code by mistake.
+#
+# This must happen before we try the -mpreferred-stack-boundary, see:
+#
+#    https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383
+#
+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
+KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
+
 ifeq ($(CONFIG_X86_32),y)
         BITS := 32
         UTS_MACHINE := i386
@@ -167,9 +177,6 @@ KBUILD_CFLAGS += -pipe
 KBUILD_CFLAGS += -Wno-sign-compare
 #
 KBUILD_CFLAGS += -fno-asynchronous-unwind-tables
-# prevent gcc from generating any FP code by mistake
-KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow
-KBUILD_CFLAGS += $(call cc-option,-mno-avx,)
 
 KBUILD_CFLAGS += $(mflags-y)
 KBUILD_AFLAGS += $(mflags-y)
-- 
cgit v0.10.2


From 014dc90b66c8d0b5f5a9400440727c134ee5e5a3 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 30 Jul 2015 14:31:33 -0700
Subject: selftests/x86, x86/ldt: Add a selftest for modify_ldt()

This tests general modify_ldt() behavior (only writes, so far) as
well as synchronous updates via IPI.  It fails on old kernels.

I called this ldt_gdt because I'll add set_thread_area() tests to
it at some point.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: security@kernel.org <security@kernel.org>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/dcfda65dad07ff5a3ea97a9172b5963bf8031b2e.1438291540.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index e8df47e..b70da4a 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,7 +4,7 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs
+TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs ldt_gdt
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
diff --git a/tools/testing/selftests/x86/ldt_gdt.c b/tools/testing/selftests/x86/ldt_gdt.c
new file mode 100644
index 0000000..31a3035
--- /dev/null
+++ b/tools/testing/selftests/x86/ldt_gdt.c
@@ -0,0 +1,576 @@
+/*
+ * ldt_gdt.c - Test cases for LDT and GDT access
+ * Copyright (c) 2015 Andrew Lutomirski
+ */
+
+#define _GNU_SOURCE
+#include <err.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <signal.h>
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <asm/ldt.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <stdbool.h>
+#include <pthread.h>
+#include <sched.h>
+#include <linux/futex.h>
+
+#define AR_ACCESSED		(1<<8)
+
+#define AR_TYPE_RODATA		(0 * (1<<9))
+#define AR_TYPE_RWDATA		(1 * (1<<9))
+#define AR_TYPE_RODATA_EXPDOWN	(2 * (1<<9))
+#define AR_TYPE_RWDATA_EXPDOWN	(3 * (1<<9))
+#define AR_TYPE_XOCODE		(4 * (1<<9))
+#define AR_TYPE_XRCODE		(5 * (1<<9))
+#define AR_TYPE_XOCODE_CONF	(6 * (1<<9))
+#define AR_TYPE_XRCODE_CONF	(7 * (1<<9))
+
+#define AR_DPL3			(3 * (1<<13))
+
+#define AR_S			(1 << 12)
+#define AR_P			(1 << 15)
+#define AR_AVL			(1 << 20)
+#define AR_L			(1 << 21)
+#define AR_DB			(1 << 22)
+#define AR_G			(1 << 23)
+
+static int nerrs;
+
+static void check_invalid_segment(uint16_t index, int ldt)
+{
+	uint32_t has_limit = 0, has_ar = 0, limit, ar;
+	uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+	asm ("lsl %[selector], %[limit]\n\t"
+	     "jnz 1f\n\t"
+	     "movl $1, %[has_limit]\n\t"
+	     "1:"
+	     : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+	     : [selector] "r" (selector));
+	asm ("larl %[selector], %[ar]\n\t"
+	     "jnz 1f\n\t"
+	     "movl $1, %[has_ar]\n\t"
+	     "1:"
+	     : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+	     : [selector] "r" (selector));
+
+	if (has_limit || has_ar) {
+		printf("[FAIL]\t%s entry %hu is valid but should be invalid\n",
+		       (ldt ? "LDT" : "GDT"), index);
+		nerrs++;
+	} else {
+		printf("[OK]\t%s entry %hu is invalid\n",
+		       (ldt ? "LDT" : "GDT"), index);
+	}
+}
+
+static void check_valid_segment(uint16_t index, int ldt,
+				uint32_t expected_ar, uint32_t expected_limit,
+				bool verbose)
+{
+	uint32_t has_limit = 0, has_ar = 0, limit, ar;
+	uint32_t selector = (index << 3) | (ldt << 2) | 3;
+
+	asm ("lsl %[selector], %[limit]\n\t"
+	     "jnz 1f\n\t"
+	     "movl $1, %[has_limit]\n\t"
+	     "1:"
+	     : [limit] "=r" (limit), [has_limit] "+rm" (has_limit)
+	     : [selector] "r" (selector));
+	asm ("larl %[selector], %[ar]\n\t"
+	     "jnz 1f\n\t"
+	     "movl $1, %[has_ar]\n\t"
+	     "1:"
+	     : [ar] "=r" (ar), [has_ar] "+rm" (has_ar)
+	     : [selector] "r" (selector));
+
+	if (!has_limit || !has_ar) {
+		printf("[FAIL]\t%s entry %hu is invalid but should be valid\n",
+		       (ldt ? "LDT" : "GDT"), index);
+		nerrs++;
+		return;
+	}
+
+	if (ar != expected_ar) {
+		printf("[FAIL]\t%s entry %hu has AR 0x%08X but expected 0x%08X\n",
+		       (ldt ? "LDT" : "GDT"), index, ar, expected_ar);
+		nerrs++;
+	} else if (limit != expected_limit) {
+		printf("[FAIL]\t%s entry %hu has limit 0x%08X but expected 0x%08X\n",
+		       (ldt ? "LDT" : "GDT"), index, limit, expected_limit);
+		nerrs++;
+	} else if (verbose) {
+		printf("[OK]\t%s entry %hu has AR 0x%08X and limit 0x%08X\n",
+		       (ldt ? "LDT" : "GDT"), index, ar, limit);
+	}
+}
+
+static bool install_valid_mode(const struct user_desc *desc, uint32_t ar,
+			       bool oldmode)
+{
+	int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+			  desc, sizeof(*desc));
+	if (ret < -1)
+		errno = -ret;
+	if (ret == 0) {
+		uint32_t limit = desc->limit;
+		if (desc->limit_in_pages)
+			limit = (limit << 12) + 4095;
+		check_valid_segment(desc->entry_number, 1, ar, limit, true);
+		return true;
+	} else if (errno == ENOSYS) {
+		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+		return false;
+	} else {
+		if (desc->seg_32bit) {
+			printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+			       errno);
+			nerrs++;
+			return false;
+		} else {
+			printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+			return false;
+		}
+	}
+}
+
+static bool install_valid(const struct user_desc *desc, uint32_t ar)
+{
+	return install_valid_mode(desc, ar, false);
+}
+
+static void install_invalid(const struct user_desc *desc, bool oldmode)
+{
+	int ret = syscall(SYS_modify_ldt, oldmode ? 1 : 0x11,
+			  desc, sizeof(*desc));
+	if (ret < -1)
+		errno = -ret;
+	if (ret == 0) {
+		check_invalid_segment(desc->entry_number, 1);
+	} else if (errno == ENOSYS) {
+		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+	} else {
+		if (desc->seg_32bit) {
+			printf("[FAIL]\tUnexpected modify_ldt failure %d\n",
+			       errno);
+			nerrs++;
+		} else {
+			printf("[OK]\tmodify_ldt rejected 16 bit segment\n");
+		}
+	}
+}
+
+static int safe_modify_ldt(int func, struct user_desc *ptr,
+			   unsigned long bytecount)
+{
+	int ret = syscall(SYS_modify_ldt, 0x11, ptr, bytecount);
+	if (ret < -1)
+		errno = -ret;
+	return ret;
+}
+
+static void fail_install(struct user_desc *desc)
+{
+	if (safe_modify_ldt(0x11, desc, sizeof(*desc)) == 0) {
+		printf("[FAIL]\tmodify_ldt accepted a bad descriptor\n");
+		nerrs++;
+	} else if (errno == ENOSYS) {
+		printf("[OK]\tmodify_ldt returned -ENOSYS\n");
+	} else {
+		printf("[OK]\tmodify_ldt failure %d\n", errno);
+	}
+}
+
+static void do_simple_tests(void)
+{
+	struct user_desc desc = {
+		.entry_number    = 0,
+		.base_addr       = 0,
+		.limit           = 10,
+		.seg_32bit       = 1,
+		.contents        = 2, /* Code, not conforming */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+	desc.limit_in_pages = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_P | AR_DB | AR_G);
+
+	check_invalid_segment(1, 1);
+
+	desc.entry_number = 2;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_P | AR_DB | AR_G);
+
+	check_invalid_segment(1, 1);
+
+	desc.base_addr = 0xf0000000;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_P | AR_DB | AR_G);
+
+	desc.useable = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_P | AR_DB | AR_G | AR_AVL);
+
+	desc.seg_not_present = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_DB | AR_G | AR_AVL);
+
+	desc.seg_32bit = 0;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_G | AR_AVL);
+
+	desc.seg_32bit = 1;
+	desc.contents = 0;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA |
+		      AR_S | AR_DB | AR_G | AR_AVL);
+
+	desc.read_exec_only = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA |
+		      AR_S | AR_DB | AR_G | AR_AVL);
+
+	desc.contents = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA_EXPDOWN |
+		      AR_S | AR_DB | AR_G | AR_AVL);
+
+	desc.read_exec_only = 0;
+	desc.limit_in_pages = 0;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA_EXPDOWN |
+		      AR_S | AR_DB | AR_AVL);
+
+	desc.contents = 3;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE_CONF |
+		      AR_S | AR_DB | AR_AVL);
+
+	desc.read_exec_only = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE_CONF |
+		      AR_S | AR_DB | AR_AVL);
+
+	desc.read_exec_only = 0;
+	desc.contents = 2;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE |
+		      AR_S | AR_DB | AR_AVL);
+
+	desc.read_exec_only = 1;
+
+#ifdef __x86_64__
+	desc.lm = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+		      AR_S | AR_DB | AR_AVL);
+	desc.lm = 0;
+#endif
+
+	bool entry1_okay = install_valid(&desc, AR_DPL3 | AR_TYPE_XOCODE |
+					 AR_S | AR_DB | AR_AVL);
+
+	if (entry1_okay) {
+		printf("[RUN]\tTest fork\n");
+		pid_t child = fork();
+		if (child == 0) {
+			nerrs = 0;
+			check_valid_segment(desc.entry_number, 1,
+					    AR_DPL3 | AR_TYPE_XOCODE |
+					    AR_S | AR_DB | AR_AVL, desc.limit,
+					    true);
+			check_invalid_segment(1, 1);
+			exit(nerrs ? 1 : 0);
+		} else {
+			int status;
+			if (waitpid(child, &status, 0) != child ||
+			    !WIFEXITED(status)) {
+				printf("[FAIL]\tChild died\n");
+				nerrs++;
+			} else if (WEXITSTATUS(status) != 0) {
+				printf("[FAIL]\tChild failed\n");
+				nerrs++;
+			} else {
+				printf("[OK]\tChild succeeded\n");
+			}
+		}
+
+		printf("[RUN]\tTest size\n");
+		int i;
+		for (i = 0; i < 8192; i++) {
+			desc.entry_number = i;
+			desc.limit = i;
+			if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+				printf("[FAIL]\tFailed to install entry %d\n", i);
+				nerrs++;
+				break;
+			}
+		}
+		for (int j = 0; j < i; j++) {
+			check_valid_segment(j, 1, AR_DPL3 | AR_TYPE_XOCODE |
+					    AR_S | AR_DB | AR_AVL, j, false);
+		}
+		printf("[DONE]\tSize test\n");
+	} else {
+		printf("[SKIP]\tSkipping fork and size tests because we have no LDT\n");
+	}
+
+	/* Test entry_number too high. */
+	desc.entry_number = 8192;
+	fail_install(&desc);
+
+	/* Test deletion and actions mistakeable for deletion. */
+	memset(&desc, 0, sizeof(desc));
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P);
+
+	desc.seg_not_present = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+	desc.seg_not_present = 0;
+	desc.read_exec_only = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S | AR_P);
+
+	desc.read_exec_only = 0;
+	desc.seg_not_present = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S);
+
+	desc.read_exec_only = 1;
+	desc.limit = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+	desc.limit = 0;
+	desc.base_addr = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RODATA | AR_S);
+
+	desc.base_addr = 0;
+	install_invalid(&desc, false);
+
+	desc.seg_not_present = 0;
+	desc.read_exec_only = 0;
+	desc.seg_32bit = 1;
+	install_valid(&desc, AR_DPL3 | AR_TYPE_RWDATA | AR_S | AR_P | AR_DB);
+	install_invalid(&desc, true);
+}
+
+/*
+ * 0: thread is idle
+ * 1: thread armed
+ * 2: thread should clear LDT entry 0
+ * 3: thread should exit
+ */
+static volatile unsigned int ftx;
+
+static void *threadproc(void *ctx)
+{
+	cpu_set_t cpuset;
+	CPU_ZERO(&cpuset);
+	CPU_SET(1, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0)
+		err(1, "sched_setaffinity to CPU 1");	/* should never fail */
+
+	while (1) {
+		syscall(SYS_futex, &ftx, FUTEX_WAIT, 0, NULL, NULL, 0);
+		while (ftx != 2) {
+			if (ftx >= 3)
+				return NULL;
+		}
+
+		/* clear LDT entry 0 */
+		const struct user_desc desc = {};
+		if (syscall(SYS_modify_ldt, 1, &desc, sizeof(desc)) != 0)
+			err(1, "modify_ldt");
+
+		/* If ftx == 2, set it to zero.  If ftx == 100, quit. */
+		unsigned int x = -2;
+		asm volatile ("lock xaddl %[x], %[ftx]" :
+			      [x] "+r" (x), [ftx] "+m" (ftx));
+		if (x != 2)
+			return NULL;
+	}
+}
+
+static void sethandler(int sig, void (*handler)(int, siginfo_t *, void *),
+		       int flags)
+{
+	struct sigaction sa;
+	memset(&sa, 0, sizeof(sa));
+	sa.sa_sigaction = handler;
+	sa.sa_flags = SA_SIGINFO | flags;
+	sigemptyset(&sa.sa_mask);
+	if (sigaction(sig, &sa, 0))
+		err(1, "sigaction");
+
+}
+
+static jmp_buf jmpbuf;
+
+static void sigsegv(int sig, siginfo_t *info, void *ctx_void)
+{
+	siglongjmp(jmpbuf, 1);
+}
+
+static void do_multicpu_tests(void)
+{
+	cpu_set_t cpuset;
+	pthread_t thread;
+	int failures = 0, iters = 5, i;
+	unsigned short orig_ss;
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(1, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+		printf("[SKIP]\tCannot set affinity to CPU 1\n");
+		return;
+	}
+
+	CPU_ZERO(&cpuset);
+	CPU_SET(0, &cpuset);
+	if (sched_setaffinity(0, sizeof(cpuset), &cpuset) != 0) {
+		printf("[SKIP]\tCannot set affinity to CPU 0\n");
+		return;
+	}
+
+	sethandler(SIGSEGV, sigsegv, 0);
+#ifdef __i386__
+	/* True 32-bit kernels send SIGILL instead of SIGSEGV on IRET faults. */
+	sethandler(SIGILL, sigsegv, 0);
+#endif
+
+	printf("[RUN]\tCross-CPU LDT invalidation\n");
+
+	if (pthread_create(&thread, 0, threadproc, 0) != 0)
+		err(1, "pthread_create");
+
+	asm volatile ("mov %%ss, %0" : "=rm" (orig_ss));
+
+	for (i = 0; i < 5; i++) {
+		if (sigsetjmp(jmpbuf, 1) != 0)
+			continue;
+
+		/* Make sure the thread is ready after the last test. */
+		while (ftx != 0)
+			;
+
+		struct user_desc desc = {
+			.entry_number    = 0,
+			.base_addr       = 0,
+			.limit           = 0xfffff,
+			.seg_32bit       = 1,
+			.contents        = 0, /* Data */
+			.read_exec_only  = 0,
+			.limit_in_pages  = 1,
+			.seg_not_present = 0,
+			.useable         = 0
+		};
+
+		if (safe_modify_ldt(0x11, &desc, sizeof(desc)) != 0) {
+			if (errno != ENOSYS)
+				err(1, "modify_ldt");
+			printf("[SKIP]\tmodify_ldt unavailable\n");
+			break;
+		}
+
+		/* Arm the thread. */
+		ftx = 1;
+		syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+		asm volatile ("mov %0, %%ss" : : "r" (0x7));
+
+		/* Go! */
+		ftx = 2;
+
+		while (ftx != 0)
+			;
+
+		/*
+		 * On success, modify_ldt will segfault us synchronously,
+		 * and we'll escape via siglongjmp.
+		 */
+
+		failures++;
+		asm volatile ("mov %0, %%ss" : : "rm" (orig_ss));
+	};
+
+	ftx = 100;  /* Kill the thread. */
+	syscall(SYS_futex, &ftx, FUTEX_WAKE, 0, NULL, NULL, 0);
+
+	if (pthread_join(thread, NULL) != 0)
+		err(1, "pthread_join");
+
+	if (failures) {
+		printf("[FAIL]\t%d of %d iterations failed\n", failures, iters);
+		nerrs++;
+	} else {
+		printf("[OK]\tAll %d iterations succeeded\n", iters);
+	}
+}
+
+static int finish_exec_test(void)
+{
+	/*
+	 * In a sensible world, this would be check_invalid_segment(0, 1);
+	 * For better or for worse, though, the LDT is inherited across exec.
+	 * We can probably change this safely, but for now we test it.
+	 */
+	check_valid_segment(0, 1,
+			    AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB,
+			    42, true);
+
+	return nerrs ? 1 : 0;
+}
+
+static void do_exec_test(void)
+{
+	printf("[RUN]\tTest exec\n");
+
+	struct user_desc desc = {
+		.entry_number    = 0,
+		.base_addr       = 0,
+		.limit           = 42,
+		.seg_32bit       = 1,
+		.contents        = 2, /* Code, not conforming */
+		.read_exec_only  = 0,
+		.limit_in_pages  = 0,
+		.seg_not_present = 0,
+		.useable         = 0
+	};
+	install_valid(&desc, AR_DPL3 | AR_TYPE_XRCODE | AR_S | AR_P | AR_DB);
+
+	pid_t child = fork();
+	if (child == 0) {
+		execl("/proc/self/exe", "ldt_gdt_test_exec", NULL);
+		printf("[FAIL]\tCould not exec self\n");
+		exit(1);	/* exec failed */
+	} else {
+		int status;
+		if (waitpid(child, &status, 0) != child ||
+		    !WIFEXITED(status)) {
+			printf("[FAIL]\tChild died\n");
+			nerrs++;
+		} else if (WEXITSTATUS(status) != 0) {
+			printf("[FAIL]\tChild failed\n");
+			nerrs++;
+		} else {
+			printf("[OK]\tChild succeeded\n");
+		}
+	}
+}
+
+int main(int argc, char **argv)
+{
+	if (argc == 1 && !strcmp(argv[0], "ldt_gdt_test_exec"))
+		return finish_exec_test();
+
+	do_simple_tests();
+
+	do_multicpu_tests();
+
+	do_exec_test();
+
+	return nerrs ? 1 : 0;
+}
-- 
cgit v0.10.2


From a5b9e5a2f14f25a8dae987494d50ad3aac7366b6 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 30 Jul 2015 14:31:34 -0700
Subject: x86/ldt: Make modify_ldt() optional

The modify_ldt syscall exposes a large attack surface and is
unnecessary for modern userspace.  Make it optional.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: Andrew Cooper <andrew.cooper3@citrix.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Jan Beulich <jbeulich@suse.com>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: security@kernel.org <security@kernel.org>
Cc: xen-devel <xen-devel@lists.xen.org>
Link: http://lkml.kernel.org/r/a605166a771c343fd64802dece77a903507333bd.1438291540.git.luto@kernel.org
[ Made MATH_EMULATION dependent on MODIFY_LDT_SYSCALL. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e26fe7a..7986580 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1036,6 +1036,7 @@ config VM86
 config X86_16BIT
 	bool "Enable support for 16-bit segments" if EXPERT
 	default y
+	depends on MODIFY_LDT_SYSCALL
 	---help---
 	  This option is required by programs like Wine to run 16-bit
 	  protected mode legacy code on x86 processors.  Disabling
@@ -1530,6 +1531,7 @@ config X86_RESERVE_LOW
 
 config MATH_EMULATION
 	bool
+	depends on MODIFY_LDT_SYSCALL
 	prompt "Math emulation" if X86_32
 	---help---
 	  Linux can emulate a math coprocessor (used for floating point
@@ -2074,6 +2076,22 @@ config CMDLINE_OVERRIDE
 	  This is used to work around broken boot loaders.  This should
 	  be set to 'N' under normal conditions.
 
+config MODIFY_LDT_SYSCALL
+	bool "Enable the LDT (local descriptor table)" if EXPERT
+	default y
+	---help---
+	  Linux can allow user programs to install a per-process x86
+	  Local Descriptor Table (LDT) using the modify_ldt(2) system
+	  call.  This is required to run 16-bit or segmented code such as
+	  DOSEMU or some Wine programs.  It is also used by some very old
+	  threading libraries.
+
+	  Enabling this feature adds a small amount of overhead to
+	  context switches and increases the low-level kernel attack
+	  surface.  Disabling it removes the modify_ldt(2) system call.
+
+	  Saying 'N' here may make sense for embedded or server kernels.
+
 source "kernel/livepatch/Kconfig"
 
 endmenu
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 364d274..55234d5 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -9,7 +9,9 @@
  * we put the segment information here.
  */
 typedef struct {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
+#endif
 
 #ifdef CONFIG_X86_64
 	/* True if mm supports a task running in 32 bit compatibility mode. */
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 984abfe..379cd36 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -33,6 +33,7 @@ static inline void load_mm_cr4(struct mm_struct *mm)
 static inline void load_mm_cr4(struct mm_struct *mm) {}
 #endif
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 /*
  * ldt_structs can be allocated, used, and freed, but they are never
  * modified while live.
@@ -48,8 +49,23 @@ struct ldt_struct {
 	int size;
 };
 
+/*
+ * Used for LDT copy/destruction.
+ */
+int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
+void destroy_context(struct mm_struct *mm);
+#else	/* CONFIG_MODIFY_LDT_SYSCALL */
+static inline int init_new_context(struct task_struct *tsk,
+				   struct mm_struct *mm)
+{
+	return 0;
+}
+static inline void destroy_context(struct mm_struct *mm) {}
+#endif
+
 static inline void load_mm_ldt(struct mm_struct *mm)
 {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
 
 	/* lockless_dereference synchronizes with smp_store_release */
@@ -73,17 +89,13 @@ static inline void load_mm_ldt(struct mm_struct *mm)
 		set_ldt(ldt->entries, ldt->size);
 	else
 		clear_LDT();
+#else
+	clear_LDT();
+#endif
 
 	DEBUG_LOCKS_WARN_ON(preemptible());
 }
 
-/*
- * Used for LDT copy/destruction.
- */
-int init_new_context(struct task_struct *tsk, struct mm_struct *mm);
-void destroy_context(struct mm_struct *mm);
-
-
 static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 {
 #ifdef CONFIG_SMP
@@ -114,6 +126,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		/* Load per-mm CR4 state */
 		load_mm_cr4(next);
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		/*
 		 * Load the LDT, if the LDT is different.
 		 *
@@ -128,6 +141,7 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		 */
 		if (unlikely(prev->context.ldt != next->context.ldt))
 			load_mm_ldt(next);
+#endif
 	}
 #ifdef CONFIG_SMP
 	  else {
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index dc19730..5140648 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,7 +25,8 @@ CFLAGS_irq.o := -I$(src)/../include/asm/trace
 obj-y			:= process_$(BITS).o signal.o
 obj-$(CONFIG_COMPAT)	+= signal_compat.o
 obj-y			+= traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
-obj-y			+= time.o ioport.o ldt.o dumpstack.o nmi.o
+obj-y			+= time.o ioport.o dumpstack.o nmi.o
+obj-$(CONFIG_MODIFY_LDT_SYSCALL)	+= ldt.o
 obj-y			+= setup.o x86_init.o i8259.o irqinit.o jump_label.o
 obj-$(CONFIG_IRQ_WORK)  += irq_work.o
 obj-y			+= probe_roms.o
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 09f9ff2..cc25c8f 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2179,6 +2179,7 @@ static unsigned long get_segment_base(unsigned int segment)
 	int idx = segment >> 3;
 
 	if ((segment & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		struct ldt_struct *ldt;
 
 		if (idx > LDT_ENTRIES)
@@ -2190,6 +2191,9 @@ static unsigned long get_segment_base(unsigned int segment)
 			return 0;
 
 		desc = &ldt->entries[idx];
+#else
+		return 0;
+#endif
 	} else {
 		if (idx > GDT_ENTRIES)
 			return 0;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 7ff035c..3c1bbcf 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -121,6 +121,7 @@ void __show_regs(struct pt_regs *regs, int all)
 void release_thread(struct task_struct *dead_task)
 {
 	if (dead_task->mm) {
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 		if (dead_task->mm->context.ldt) {
 			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
 				dead_task->comm,
@@ -128,6 +129,7 @@ void release_thread(struct task_struct *dead_task)
 				dead_task->mm->context.ldt->size);
 			BUG();
 		}
+#endif
 	}
 }
 
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 6273324..fd88e15 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -18,6 +18,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		return addr;
 	}
 
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
 	/*
 	 * We'll assume that the code segments in the GDT
 	 * are all zero-based. That is largely true: the
@@ -45,6 +46,7 @@ unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *re
 		}
 		mutex_unlock(&child->mm->context.lock);
 	}
+#endif
 
 	return addr;
 }
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 7995ef5..ca7d84f 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -140,6 +140,7 @@ cond_syscall(sys_sgetmask);
 cond_syscall(sys_ssetmask);
 cond_syscall(sys_vm86old);
 cond_syscall(sys_vm86);
+cond_syscall(sys_modify_ldt);
 cond_syscall(sys_ipc);
 cond_syscall(compat_sys_ipc);
 cond_syscall(compat_sys_sysctl);
-- 
cgit v0.10.2


From e800eb39e3f586e46a2007f72d3b609f6e3b888d Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 29 Jul 2015 14:35:43 -0700
Subject: selftests/x86/vm86: Fix entry_from_vm86 test on 64-bit kernels

The test failed due to an oversight on my part when run on a
64-bit kernel.  vm86 isn't expected to work at all, and I
mistakenly failed one part of the test because no signal was
delivered.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/502c8bef877b33fe4943885ded6125dfcc7892db.1438205722.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c
index f004b2a..9a43a59 100644
--- a/tools/testing/selftests/x86/entry_from_vm86.c
+++ b/tools/testing/selftests/x86/entry_from_vm86.c
@@ -105,7 +105,8 @@ extern unsigned char vmcode[], end_vmcode[];
 extern unsigned char vmcode_bound[], vmcode_sysenter[], vmcode_syscall[],
 	vmcode_sti[], vmcode_int3[], vmcode_int80[];
 
-static void do_test(struct vm86plus_struct *v86, unsigned long eip,
+/* Returns false if the test was skipped. */
+static bool do_test(struct vm86plus_struct *v86, unsigned long eip,
 		    unsigned int rettype, unsigned int retarg,
 		    const char *text)
 {
@@ -117,7 +118,7 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
 
 	if (ret == -1 && errno == ENOSYS) {
 		printf("[SKIP]\tvm86 not supported\n");
-		return;
+		return false;
 	}
 
 	if (VM86_TYPE(ret) == VM86_INTx) {
@@ -154,6 +155,8 @@ static void do_test(struct vm86plus_struct *v86, unsigned long eip,
 		printf("[FAIL]\tIncorrect return reason\n");
 		nerrs++;
 	}
+
+	return true;
 }
 
 int main(void)
@@ -219,8 +222,8 @@ int main(void)
 	v86.regs.ss = 0;
 	sethandler(SIGSEGV, sighandler, 0);
 	got_signal = 0;
-	do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer");
-	if (!got_signal) {
+	if (do_test(&v86, 0, VM86_SIGNAL, 0, "Execute null pointer") &&
+	    !got_signal) {
 		printf("[FAIL]\tDid not receive SIGSEGV\n");
 		nerrs++;
 	}
-- 
cgit v0.10.2


From 9fda6a0681e070b496235b132bc70ceb80300211 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:16 -0400
Subject: x86/vm86: Move vm86 fields out of 'thread_struct'

Allocate a separate structure for the vm86 fields.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-2-git-send-email-brgerst@gmail.com
[ Build fixes. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index befc134..9615a4e 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -6,8 +6,8 @@
 /* Forward declaration, a strange C thing */
 struct task_struct;
 struct mm_struct;
+struct vm86;
 
-#include <asm/vm86.h>
 #include <asm/math_emu.h>
 #include <asm/segment.h>
 #include <asm/types.h>
@@ -400,13 +400,9 @@ struct thread_struct {
 	unsigned long		cr2;
 	unsigned long		trap_nr;
 	unsigned long		error_code;
-#ifdef CONFIG_X86_32
+#ifdef CONFIG_VM86
 	/* Virtual 86 mode info */
-	struct vm86plus_struct __user *vm86_info;
-	unsigned long		screen_bitmap;
-	unsigned long		v86flags;
-	unsigned long		v86mask;
-	unsigned long		saved_sp0;
+	struct vm86		*vm86;
 #endif
 	/* IO permissions: */
 	unsigned long		*io_bitmap_ptr;
@@ -718,7 +714,6 @@ static inline void spin_lock_prefetch(const void *x)
 
 #define INIT_THREAD  {							  \
 	.sp0			= TOP_OF_INIT_STACK,			  \
-	.vm86_info		= NULL,					  \
 	.sysenter_cs		= __KERNEL_CS,				  \
 	.io_bitmap_ptr		= NULL,					  \
 }
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 1d8de3f..20b43b7 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -1,7 +1,6 @@
 #ifndef _ASM_X86_VM86_H
 #define _ASM_X86_VM86_H
 
-
 #include <asm/ptrace.h>
 #include <uapi/asm/vm86.h>
 
@@ -58,6 +57,14 @@ struct kernel_vm86_struct {
  */
 };
 
+struct vm86 {
+	struct vm86plus_struct __user *vm86_info;
+	unsigned long screen_bitmap;
+	unsigned long v86flags;
+	unsigned long v86mask;
+	unsigned long saved_sp0;
+};
+
 #ifdef CONFIG_VM86
 
 void handle_vm86_fault(struct kernel_vm86_regs *, long);
@@ -67,6 +74,14 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
 struct task_struct;
 void release_vm86_irqs(struct task_struct *);
 
+#define free_vm86(t) do {				\
+	struct thread_struct *__t = (t);		\
+	if (__t->vm86 != NULL) {			\
+		kfree(__t->vm86);			\
+		__t->vm86 = NULL;			\
+	}						\
+} while (0)
+
 #else
 
 #define handle_vm86_fault(a, b)
@@ -77,6 +92,8 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
 	return 0;
 }
 
+#define free_vm86(t) do { } while(0)
+
 #endif /* CONFIG_VM86 */
 
 #endif /* _ASM_X86_VM86_H */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 397688b..2199d9b 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -29,6 +29,7 @@
 #include <asm/debugreg.h>
 #include <asm/nmi.h>
 #include <asm/tlbflush.h>
+#include <asm/vm86.h>
 
 /*
  * per-CPU TSS segments. Threads are completely 'soft' on Linux,
@@ -110,6 +111,8 @@ void exit_thread(void)
 		kfree(bp);
 	}
 
+	free_vm86(t);
+
 	fpu__drop(fpu);
 }
 
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e6c2b47..bfa59b1 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -44,6 +44,7 @@
 #include <linux/ptrace.h>
 #include <linux/audit.h>
 #include <linux/stddef.h>
+#include <linux/slab.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -81,8 +82,8 @@
 /*
  * virtual flags (16 and 32-bit versions)
  */
-#define VFLAGS	(*(unsigned short *)&(current->thread.v86flags))
-#define VEFLAGS	(current->thread.v86flags)
+#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->v86flags))
+#define VEFLAGS	(current->thread.vm86->v86flags)
 
 #define set_flags(X, new, mask) \
 ((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -96,6 +97,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	struct pt_regs *ret;
 	struct task_struct *tsk = current;
 	struct vm86plus_struct __user *user;
+	struct vm86 *vm86 = current->thread.vm86;
 	long err = 0;
 
 	/*
@@ -105,12 +107,12 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	 */
 	local_irq_enable();
 
-	if (!tsk->thread.vm86_info) {
+	if (!vm86 || !vm86->vm86_info) {
 		pr_alert("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | tsk->thread.v86mask);
-	user = tsk->thread.vm86_info;
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask);
+	user = vm86->vm86_info;
 
 	if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ?
 		       sizeof(struct vm86plus_struct) :
@@ -137,7 +139,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 		put_user_ex(regs->fs, &user->regs.fs);
 		put_user_ex(regs->gs, &user->regs.gs);
 
-		put_user_ex(tsk->thread.screen_bitmap, &user->screen_bitmap);
+		put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
 	} put_user_catch(err);
 	if (err) {
 		pr_alert("could not access userspace vm86_info\n");
@@ -145,10 +147,10 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	}
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	tsk->thread.sp0 = tsk->thread.saved_sp0;
+	tsk->thread.sp0 = vm86->saved_sp0;
 	tsk->thread.sysenter_cs = __KERNEL_CS;
 	load_sp0(tss, &tsk->thread);
-	tsk->thread.saved_sp0 = 0;
+	vm86->saved_sp0 = 0;
 	put_cpu();
 
 	ret = KVM86->regs32;
@@ -242,9 +244,15 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 {
 	struct tss_struct *tss;
 	struct task_struct *tsk = current;
+	struct vm86 *vm86 = tsk->thread.vm86;
 	unsigned long err = 0;
 
-	if (tsk->thread.saved_sp0)
+	if (!vm86) {
+		if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL)))
+			return -ENOMEM;
+		tsk->thread.vm86 = vm86;
+	}
+	if (vm86->saved_sp0)
 		return -EPERM;
 
 	if (!access_ok(VERIFY_READ, v86, plus ?
@@ -295,7 +303,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 	}
 
 	info->regs32 = current_pt_regs();
-	tsk->thread.vm86_info = v86;
+	vm86->vm86_info = v86;
 
 /*
  * The flags register is also special: we cannot trust that the user
@@ -311,16 +319,16 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 
 	switch (info->cpu_type) {
 	case CPU_286:
-		tsk->thread.v86mask = 0;
+		vm86->v86mask = 0;
 		break;
 	case CPU_386:
-		tsk->thread.v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	case CPU_486:
-		tsk->thread.v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	default:
-		tsk->thread.v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	}
 
@@ -328,7 +336,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
  * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
  */
 	info->regs32->ax = VM86_SIGNAL;
-	tsk->thread.saved_sp0 = tsk->thread.sp0;
+	vm86->saved_sp0 = tsk->thread.sp0;
 	lazy_save_gs(info->regs32->gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
@@ -338,7 +346,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 	load_sp0(tss, &tsk->thread);
 	put_cpu();
 
-	tsk->thread.screen_bitmap = info->screen_bitmap;
+	vm86->screen_bitmap = info->screen_bitmap;
 	if (info->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
@@ -408,7 +416,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
 
 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VEFLAGS, flags, current->thread.v86mask);
+	set_flags(VEFLAGS, flags, current->thread.vm86->v86mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -418,7 +426,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
 
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VFLAGS, flags, current->thread.v86mask);
+	set_flags(VFLAGS, flags, current->thread.vm86->v86mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -433,7 +441,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
 	if (VEFLAGS & X86_EFLAGS_VIF)
 		flags |= X86_EFLAGS_IF;
 	flags |= X86_EFLAGS_IOPL;
-	return flags | (VEFLAGS & current->thread.v86mask);
+	return flags | (VEFLAGS & current->thread.vm86->v86mask);
 }
 
 static inline int is_revectored(int nr, struct revectored_struct *bitmap)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 9dc9098..34a368d 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -301,14 +301,16 @@ static inline void
 check_v8086_mode(struct pt_regs *regs, unsigned long address,
 		 struct task_struct *tsk)
 {
+#ifdef CONFIG_VM86
 	unsigned long bit;
 
-	if (!v8086_mode(regs))
+	if (!v8086_mode(regs) || !tsk->thread.vm86)
 		return;
 
 	bit = (address - 0xA0000) >> PAGE_SHIFT;
 	if (bit < 32)
-		tsk->thread.screen_bitmap |= 1 << bit;
+		tsk->thread.vm86->screen_bitmap |= 1 << bit;
+#endif
 }
 
 static bool low_pfn(unsigned long pfn)
-- 
cgit v0.10.2


From d4ce0f26c790af8e829d3fad0a6787f40f98e24f Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:17 -0400
Subject: x86/vm86: Move fields from 'struct kernel_vm86_struct' to 'struct
 vm86'

Move the non-regs fields to the off-stack data.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-3-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 20b43b7..47c7648 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -37,13 +37,7 @@ struct kernel_vm86_struct {
  * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
  * in kernelspace, hence we need not reget the data from userspace.
  */
-#define VM86_TSS_ESP0 flags
-	unsigned long flags;
-	unsigned long screen_bitmap;
-	unsigned long cpu_type;
-	struct revectored_struct int_revectored;
-	struct revectored_struct int21_revectored;
-	struct vm86plus_info_struct vm86plus;
+#define VM86_TSS_ESP0 regs32
 	struct pt_regs *regs32;   /* here we save the pointer to the old regs */
 /*
  * The below is not part of the structure, but the stack layout continues
@@ -59,10 +53,16 @@ struct kernel_vm86_struct {
 
 struct vm86 {
 	struct vm86plus_struct __user *vm86_info;
-	unsigned long screen_bitmap;
 	unsigned long v86flags;
 	unsigned long v86mask;
 	unsigned long saved_sp0;
+
+	unsigned long flags;
+	unsigned long screen_bitmap;
+	unsigned long cpu_type;
+	struct revectored_struct int_revectored;
+	struct revectored_struct int21_revectored;
+	struct vm86plus_info_struct vm86plus;
 };
 
 #ifdef CONFIG_VM86
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index bfa59b1..f71b4b9 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -68,7 +68,6 @@
 
 
 #define KVM86	((struct kernel_vm86_struct *)regs)
-#define VMPI	KVM86->vm86plus
 
 
 /*
@@ -114,7 +113,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask);
 	user = vm86->vm86_info;
 
-	if (!access_ok(VERIFY_WRITE, user, VMPI.is_vm86pus ?
+	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
 		       sizeof(struct vm86plus_struct) :
 		       sizeof(struct vm86_struct))) {
 		pr_alert("could not access userspace vm86_info\n");
@@ -282,25 +281,27 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 		get_user_ex(info->regs.fs, &v86->regs.fs);
 		get_user_ex(info->regs.gs, &v86->regs.gs);
 
-		get_user_ex(info->flags, &v86->flags);
-		get_user_ex(info->screen_bitmap, &v86->screen_bitmap);
-		get_user_ex(info->cpu_type, &v86->cpu_type);
+		get_user_ex(vm86->flags, &v86->flags);
+		get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap);
+		get_user_ex(vm86->cpu_type, &v86->cpu_type);
 	} get_user_catch(err);
 	if (err)
 		return err;
 
-	if (copy_from_user(&info->int_revectored, &v86->int_revectored,
+	if (copy_from_user(&vm86->int_revectored, &v86->int_revectored,
 			   sizeof(struct revectored_struct)))
 		return -EFAULT;
-	if (copy_from_user(&info->int21_revectored, &v86->int21_revectored,
+	if (copy_from_user(&vm86->int21_revectored, &v86->int21_revectored,
 			   sizeof(struct revectored_struct)))
 		return -EFAULT;
 	if (plus) {
-		if (copy_from_user(&info->vm86plus, &v86->vm86plus,
+		if (copy_from_user(&vm86->vm86plus, &v86->vm86plus,
 				   sizeof(struct vm86plus_info_struct)))
 			return -EFAULT;
-		info->vm86plus.is_vm86pus = 1;
-	}
+		vm86->vm86plus.is_vm86pus = 1;
+	} else
+		memset(&vm86->vm86plus, 0,
+		       sizeof(struct vm86plus_info_struct));
 
 	info->regs32 = current_pt_regs();
 	vm86->vm86_info = v86;
@@ -317,7 +318,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 
 	info->regs.pt.orig_ax = info->regs32->orig_ax;
 
-	switch (info->cpu_type) {
+	switch (vm86->cpu_type) {
 	case CPU_286:
 		vm86->v86mask = 0;
 		break;
@@ -346,8 +347,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 	load_sp0(tss, &tsk->thread);
 	put_cpu();
 
-	vm86->screen_bitmap = info->screen_bitmap;
-	if (info->flags & VM86_SCREEN_BITMAP)
+	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
 	/*call __audit_syscall_exit since we do not exit via the normal paths */
@@ -539,12 +539,13 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 {
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
+	struct kernel_vm86_info *vm86 = current->thread.vm86;
 
 	if (regs->pt.cs == BIOSSEG)
 		goto cannot_handle;
-	if (is_revectored(i, &KVM86->int_revectored))
+	if (is_revectored(i, &vm86->int_revectored))
 		goto cannot_handle;
-	if (i == 0x21 && is_revectored(AH(regs), &KVM86->int21_revectored))
+	if (i == 0x21 && is_revectored(AH(regs), &vm86->int21_revectored))
 		goto cannot_handle;
 	intr_ptr = (unsigned long __user *) (i << 2);
 	if (get_user(segoffs, intr_ptr))
@@ -568,7 +569,7 @@ cannot_handle:
 
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
-	if (VMPI.is_vm86pus) {
+	if (current->thread.vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
 			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
 			/* setting this flag forces the code in entry_32.S to
@@ -595,12 +596,13 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	unsigned char __user *ssp;
 	unsigned short ip, sp, orig_flags;
 	int data32, pref_done;
+	struct vm86plus_info_struct *vmpi = &current->thread.vm86->vm86plus;
 
 #define CHECK_IF_IN_TRAP \
-	if (VMPI.vm86dbg_active && VMPI.vm86dbg_TFpendig) \
+	if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
 		newflags |= X86_EFLAGS_TF
 #define VM86_FAULT_RETURN do { \
-	if (VMPI.force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
+	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
 		return_to_32bit(regs, VM86_PICRETURN); \
 	if (orig_flags & X86_EFLAGS_TF) \
 		handle_vm86_trap(regs, 0, 1); \
@@ -670,8 +672,8 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	case 0xcd: {
 		int intno = popb(csp, ip, simulate_sigsegv);
 		IP(regs) = ip;
-		if (VMPI.vm86dbg_active) {
-			if ((1 << (intno & 7)) & VMPI.vm86dbg_intxxtab[intno >> 3])
+		if (vmpi->vm86dbg_active) {
+			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3])
 				return_to_32bit(regs, VM86_INTx + (intno << 8));
 		}
 		do_int(regs, intno, ssp, sp);
-- 
cgit v0.10.2


From 90c6085a248f8f964588617f51329688bcc9f2bc Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:18 -0400
Subject: x86/vm86: Eliminate 'struct kernel_vm86_struct'

Now there is no vm86-specific data left on the kernel stack
while in userspace, except for the 32-bit regs.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-4-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 47c7648..226d6c1 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -27,32 +27,9 @@ struct kernel_vm86_regs {
 	unsigned short gs, __gsh;
 };
 
-struct kernel_vm86_struct {
-	struct kernel_vm86_regs regs;
-/*
- * the below part remains on the kernel stack while we are in VM86 mode.
- * 'tss.esp0' then contains the address of VM86_TSS_ESP0 below, and when we
- * get forced back from VM86, the CPU and "SAVE_ALL" will restore the above
- * 'struct kernel_vm86_regs' with the then actual values.
- * Therefore, pt_regs in fact points to a complete 'kernel_vm86_struct'
- * in kernelspace, hence we need not reget the data from userspace.
- */
-#define VM86_TSS_ESP0 regs32
-	struct pt_regs *regs32;   /* here we save the pointer to the old regs */
-/*
- * The below is not part of the structure, but the stack layout continues
- * this way. In front of 'return-eip' may be some data, depending on
- * compilation, so we don't rely on this and save the pointer to 'oldregs'
- * in 'regs32' above.
- * However, with GCC-2.7.2 and the current CFLAGS you see exactly this:
-
-	long return-eip;        from call to vm86()
-	struct pt_regs oldregs;  user space registers as saved by syscall
- */
-};
-
 struct vm86 {
 	struct vm86plus_struct __user *vm86_info;
+	struct pt_regs *regs32;
 	unsigned long v86flags;
 	unsigned long v86mask;
 	unsigned long saved_sp0;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index f71b4b9..696ef76 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -67,9 +67,6 @@
  */
 
 
-#define KVM86	((struct kernel_vm86_struct *)regs)
-
-
 /*
  * 8- and 16-bit register defines..
  */
@@ -152,7 +149,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	vm86->saved_sp0 = 0;
 	put_cpu();
 
-	ret = KVM86->regs32;
+	ret = vm86->regs32;
 
 	lazy_load_gs(ret->gs);
 
@@ -194,29 +191,16 @@ out:
 
 
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
-			struct kernel_vm86_struct *info);
+static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus);
 
 SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-
-	return do_sys_vm86((struct vm86plus_struct __user *) v86, false, &info);
+	return do_sys_vm86((struct vm86plus_struct __user *) v86, false);
 }
 
 
 SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 {
-	struct kernel_vm86_struct info; /* declare this _on top_,
-					 * this avoids wasting of stack space.
-					 * This remains on the stack until we
-					 * return to 32 bit user space.
-					 */
-
 	switch (cmd) {
 	case VM86_REQUEST_IRQ:
 	case VM86_FREE_IRQ:
@@ -234,16 +218,17 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 	}
 
 	/* we come here only for functions VM86_ENTER, VM86_ENTER_NO_BYPASS */
-	return do_sys_vm86((struct vm86plus_struct __user *) arg, true, &info);
+	return do_sys_vm86((struct vm86plus_struct __user *) arg, true);
 }
 
 
-static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
-			struct kernel_vm86_struct *info)
+static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 {
 	struct tss_struct *tss;
 	struct task_struct *tsk = current;
 	struct vm86 *vm86 = tsk->thread.vm86;
+	struct kernel_vm86_regs vm86regs;
+	struct pt_regs *regs32 = current_pt_regs();
 	unsigned long err = 0;
 
 	if (!vm86) {
@@ -259,27 +244,27 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 		       sizeof(struct vm86plus_struct)))
 		return -EFAULT;
 
-	memset(info, 0, sizeof(*info));
+	memset(&vm86regs, 0, sizeof(vm86regs));
 	get_user_try {
 		unsigned short seg;
-		get_user_ex(info->regs.pt.bx, &v86->regs.ebx);
-		get_user_ex(info->regs.pt.cx, &v86->regs.ecx);
-		get_user_ex(info->regs.pt.dx, &v86->regs.edx);
-		get_user_ex(info->regs.pt.si, &v86->regs.esi);
-		get_user_ex(info->regs.pt.di, &v86->regs.edi);
-		get_user_ex(info->regs.pt.bp, &v86->regs.ebp);
-		get_user_ex(info->regs.pt.ax, &v86->regs.eax);
-		get_user_ex(info->regs.pt.ip, &v86->regs.eip);
+		get_user_ex(vm86regs.pt.bx, &v86->regs.ebx);
+		get_user_ex(vm86regs.pt.cx, &v86->regs.ecx);
+		get_user_ex(vm86regs.pt.dx, &v86->regs.edx);
+		get_user_ex(vm86regs.pt.si, &v86->regs.esi);
+		get_user_ex(vm86regs.pt.di, &v86->regs.edi);
+		get_user_ex(vm86regs.pt.bp, &v86->regs.ebp);
+		get_user_ex(vm86regs.pt.ax, &v86->regs.eax);
+		get_user_ex(vm86regs.pt.ip, &v86->regs.eip);
 		get_user_ex(seg, &v86->regs.cs);
-		info->regs.pt.cs = seg;
-		get_user_ex(info->regs.pt.flags, &v86->regs.eflags);
-		get_user_ex(info->regs.pt.sp, &v86->regs.esp);
+		vm86regs.pt.cs = seg;
+		get_user_ex(vm86regs.pt.flags, &v86->regs.eflags);
+		get_user_ex(vm86regs.pt.sp, &v86->regs.esp);
 		get_user_ex(seg, &v86->regs.ss);
-		info->regs.pt.ss = seg;
-		get_user_ex(info->regs.es, &v86->regs.es);
-		get_user_ex(info->regs.ds, &v86->regs.ds);
-		get_user_ex(info->regs.fs, &v86->regs.fs);
-		get_user_ex(info->regs.gs, &v86->regs.gs);
+		vm86regs.pt.ss = seg;
+		get_user_ex(vm86regs.es, &v86->regs.es);
+		get_user_ex(vm86regs.ds, &v86->regs.ds);
+		get_user_ex(vm86regs.fs, &v86->regs.fs);
+		get_user_ex(vm86regs.gs, &v86->regs.gs);
 
 		get_user_ex(vm86->flags, &v86->flags);
 		get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap);
@@ -302,8 +287,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 	} else
 		memset(&vm86->vm86plus, 0,
 		       sizeof(struct vm86plus_info_struct));
-
-	info->regs32 = current_pt_regs();
+	vm86->regs32 = regs32;
 	vm86->vm86_info = v86;
 
 /*
@@ -311,12 +295,12 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
  * has set it up safely, so this makes sure interrupt etc flags are
  * inherited from protected mode.
  */
-	VEFLAGS = info->regs.pt.flags;
-	info->regs.pt.flags &= SAFE_MASK;
-	info->regs.pt.flags |= info->regs32->flags & ~SAFE_MASK;
-	info->regs.pt.flags |= X86_VM_MASK;
+	VEFLAGS = vm86regs.pt.flags;
+	vm86regs.pt.flags &= SAFE_MASK;
+	vm86regs.pt.flags |= regs32->flags & ~SAFE_MASK;
+	vm86regs.pt.flags |= X86_VM_MASK;
 
-	info->regs.pt.orig_ax = info->regs32->orig_ax;
+	vm86regs.pt.orig_ax = regs32->orig_ax;
 
 	switch (vm86->cpu_type) {
 	case CPU_286:
@@ -336,12 +320,13 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 /*
  * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
  */
-	info->regs32->ax = VM86_SIGNAL;
+	regs32->ax = VM86_SIGNAL;
 	vm86->saved_sp0 = tsk->thread.sp0;
-	lazy_save_gs(info->regs32->gs);
+	lazy_save_gs(regs32->gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
+	/* Set new sp0 right below 32-bit regs */
+	tsk->thread.sp0 = (unsigned long) regs32;
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
 	load_sp0(tss, &tsk->thread);
@@ -364,7 +349,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus,
 #endif
 		"jmp resume_userspace"
 		: /* no outputs */
-		:"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
+		:"r" (&vm86regs), "r" (task_thread_info(tsk)), "r" (0));
 	unreachable();	/* we never return here */
 }
 
@@ -539,7 +524,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 {
 	unsigned long __user *intr_ptr;
 	unsigned long segoffs;
-	struct kernel_vm86_info *vm86 = current->thread.vm86;
+	struct vm86 *vm86 = current->thread.vm86;
 
 	if (regs->pt.cs == BIOSSEG)
 		goto cannot_handle;
@@ -569,12 +554,14 @@ cannot_handle:
 
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 {
-	if (current->thread.vm86->vm86plus.is_vm86pus) {
+	struct vm86 *vm86 = current->thread.vm86;
+
+	if (vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
-			KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
+			vm86->regs32->ax = VM86_TRAP + (trapno << 8);
 			/* setting this flag forces the code in entry_32.S to
 			   the path where we call save_v86_state() and change
-			   the stack pointer to KVM86->regs32 */
+			   the stack pointer to regs32 */
 			set_thread_flag(TIF_NOTIFY_RESUME);
 			return 0;
 		}
-- 
cgit v0.10.2


From 5ed92a8ab71f8865ba07811429c988c72299b315 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:19 -0400
Subject: x86/vm86: Use the normal pt_regs area for vm86

Change to use the normal pt_regs area to enter and exit vm86
mode.  This is done by increasing the padding at the top of the
stack to make room for the extra vm86 segment slots in the IRET
frame.  It then saves the 32-bit regs in the off-stack vm86
data, and copies in the vm86 regs.  Exiting back to 32-bit mode
does the reverse.  This allows removing the hacks to jump
directly into the exit asm code due to having to change the
stack pointer.  Returning normally from the vm86 syscall and the
exception handlers allows things like ptrace and auditing to work properly.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-5-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 21dc60a..f940e24 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -525,34 +525,12 @@ work_resched:
 
 work_notifysig:					# deal with pending signals and
 						# notify-resume requests
-#ifdef CONFIG_VM86
-	testl	$X86_EFLAGS_VM, PT_EFLAGS(%esp)
-	movl	%esp, %eax
-	jnz	work_notifysig_v86		# returning to kernel-space or
-						# vm86-space
-1:
-#else
-	movl	%esp, %eax
-#endif
 	TRACE_IRQS_ON
 	ENABLE_INTERRUPTS(CLBR_NONE)
-	movb	PT_CS(%esp), %bl
-	andb	$SEGMENT_RPL_MASK, %bl
-	cmpb	$USER_RPL, %bl
-	jb	resume_kernel
+	movl	%esp, %eax
 	xorl	%edx, %edx
 	call	do_notify_resume
 	jmp	resume_userspace
-
-#ifdef CONFIG_VM86
-	ALIGN
-work_notifysig_v86:
-	pushl	%ecx				# save ti_flags for do_notify_resume
-	call	save_v86_state			# %eax contains pt_regs pointer
-	popl	%ecx
-	movl	%eax, %esp
-	jmp	1b
-#endif
 END(work_pending)
 
 	# perform syscall exit tracing
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 225ee54..fdad5c2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -27,14 +27,17 @@
  * Without this offset, that can result in a page fault.  (We are
  * careful that, in this case, the value we read doesn't matter.)
  *
- * In vm86 mode, the hardware frame is much longer still, but we neither
- * access the extra members from NMI context, nor do we write such a
- * frame at sp0 at all.
+ * In vm86 mode, the hardware frame is much longer still, so add 16
+ * bytes to make room for the real-mode segments.
  *
  * x86_64 has a fixed-length stack frame.
  */
 #ifdef CONFIG_X86_32
-# define TOP_OF_KERNEL_STACK_PADDING 8
+# ifdef CONFIG_VM86
+#  define TOP_OF_KERNEL_STACK_PADDING 16
+# else
+#  define TOP_OF_KERNEL_STACK_PADDING 8
+# endif
 #else
 # define TOP_OF_KERNEL_STACK_PADDING 0
 #endif
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index 226d6c1..e45386e 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -29,7 +29,7 @@ struct kernel_vm86_regs {
 
 struct vm86 {
 	struct vm86plus_struct __user *vm86_info;
-	struct pt_regs *regs32;
+	struct pt_regs regs32;
 	unsigned long v86flags;
 	unsigned long v86mask;
 	unsigned long saved_sp0;
@@ -46,7 +46,7 @@ struct vm86 {
 
 void handle_vm86_fault(struct kernel_vm86_regs *, long);
 int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *);
+void save_v86_state(struct kernel_vm86_regs *, int);
 
 struct task_struct;
 void release_vm86_irqs(struct task_struct *);
@@ -69,6 +69,8 @@ static inline int handle_vm86_trap(struct kernel_vm86_regs *a, long b, int c)
 	return 0;
 }
 
+static inline void save_v86_state(struct kernel_vm86_regs *a, int b) { }
+
 #define free_vm86(t) do { } while(0)
 
 #endif /* CONFIG_VM86 */
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 7e88cc7..bfd736e 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -635,6 +635,9 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
 	bool stepping, failed;
 	struct fpu *fpu = &current->thread.fpu;
 
+	if (v8086_mode(regs))
+		save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL);
+
 	/* Are we from a system call? */
 	if (syscall_get_nr(current, regs) >= 0) {
 		/* If so, check system call restarting.. */
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 696ef76..ffe98ec 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -50,6 +50,7 @@
 #include <asm/io.h>
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
+#include <asm/traps.h>
 
 /*
  * Known problems:
@@ -87,10 +88,9 @@
 #define SAFE_MASK	(0xDD5)
 #define RETURN_MASK	(0xDFF)
 
-struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
+void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 {
 	struct tss_struct *tss;
-	struct pt_regs *ret;
 	struct task_struct *tsk = current;
 	struct vm86plus_struct __user *user;
 	struct vm86 *vm86 = current->thread.vm86;
@@ -149,11 +149,11 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
 	vm86->saved_sp0 = 0;
 	put_cpu();
 
-	ret = vm86->regs32;
+	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
 
-	lazy_load_gs(ret->gs);
+	lazy_load_gs(vm86->regs32.gs);
 
-	return ret;
+	regs->pt.ax = retval;
 }
 
 static void mark_screen_rdonly(struct mm_struct *mm)
@@ -228,7 +228,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	struct task_struct *tsk = current;
 	struct vm86 *vm86 = tsk->thread.vm86;
 	struct kernel_vm86_regs vm86regs;
-	struct pt_regs *regs32 = current_pt_regs();
+	struct pt_regs *regs = current_pt_regs();
 	unsigned long err = 0;
 
 	if (!vm86) {
@@ -287,7 +287,8 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	} else
 		memset(&vm86->vm86plus, 0,
 		       sizeof(struct vm86plus_info_struct));
-	vm86->regs32 = regs32;
+
+	memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
 	vm86->vm86_info = v86;
 
 /*
@@ -297,10 +298,10 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
  */
 	VEFLAGS = vm86regs.pt.flags;
 	vm86regs.pt.flags &= SAFE_MASK;
-	vm86regs.pt.flags |= regs32->flags & ~SAFE_MASK;
+	vm86regs.pt.flags |= regs->flags & ~SAFE_MASK;
 	vm86regs.pt.flags |= X86_VM_MASK;
 
-	vm86regs.pt.orig_ax = regs32->orig_ax;
+	vm86regs.pt.orig_ax = regs->orig_ax;
 
 	switch (vm86->cpu_type) {
 	case CPU_286:
@@ -318,15 +319,14 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	}
 
 /*
- * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
+ * Save old state
  */
-	regs32->ax = VM86_SIGNAL;
 	vm86->saved_sp0 = tsk->thread.sp0;
-	lazy_save_gs(regs32->gs);
+	lazy_save_gs(vm86->regs32.gs);
 
 	tss = &per_cpu(cpu_tss, get_cpu());
-	/* Set new sp0 right below 32-bit regs */
-	tsk->thread.sp0 = (unsigned long) regs32;
+	/* make room for real-mode segments */
+	tsk->thread.sp0 += 16;
 	if (cpu_has_sep)
 		tsk->thread.sysenter_cs = 0;
 	load_sp0(tss, &tsk->thread);
@@ -335,41 +335,14 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
 
-	/*call __audit_syscall_exit since we do not exit via the normal paths */
-#ifdef CONFIG_AUDITSYSCALL
-	if (unlikely(current->audit_context))
-		__audit_syscall_exit(1, 0);
-#endif
-
-	__asm__ __volatile__(
-		"movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-#ifdef CONFIG_X86_32_LAZY_GS
-		"mov  %2, %%gs\n\t"
-#endif
-		"jmp resume_userspace"
-		: /* no outputs */
-		:"r" (&vm86regs), "r" (task_thread_info(tsk)), "r" (0));
-	unreachable();	/* we never return here */
-}
-
-static inline void return_to_32bit(struct kernel_vm86_regs *regs16, int retval)
-{
-	struct pt_regs *regs32;
-
-	regs32 = save_v86_state(regs16);
-	regs32->ax = retval;
-	__asm__ __volatile__("movl %0,%%esp\n\t"
-		"movl %1,%%ebp\n\t"
-		"jmp resume_userspace"
-		: : "r" (regs32), "r" (current_thread_info()));
+	memcpy((struct kernel_vm86_regs *)regs, &vm86regs, sizeof(vm86regs));
+	force_iret();
+	return regs->ax;
 }
 
 static inline void set_IF(struct kernel_vm86_regs *regs)
 {
 	VEFLAGS |= X86_EFLAGS_VIF;
-	if (VEFLAGS & X86_EFLAGS_VIP)
-		return_to_32bit(regs, VM86_STI);
 }
 
 static inline void clear_IF(struct kernel_vm86_regs *regs)
@@ -549,7 +522,7 @@ static void do_int(struct kernel_vm86_regs *regs, int i,
 	return;
 
 cannot_handle:
-	return_to_32bit(regs, VM86_INTx + (i << 8));
+	save_v86_state(regs, VM86_INTx + (i << 8));
 }
 
 int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
@@ -558,11 +531,7 @@ int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
 
 	if (vm86->vm86plus.is_vm86pus) {
 		if ((trapno == 3) || (trapno == 1)) {
-			vm86->regs32->ax = VM86_TRAP + (trapno << 8);
-			/* setting this flag forces the code in entry_32.S to
-			   the path where we call save_v86_state() and change
-			   the stack pointer to regs32 */
-			set_thread_flag(TIF_NOTIFY_RESUME);
+			save_v86_state(regs, VM86_TRAP + (trapno << 8));
 			return 0;
 		}
 		do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
@@ -588,12 +557,6 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 #define CHECK_IF_IN_TRAP \
 	if (vmpi->vm86dbg_active && vmpi->vm86dbg_TFpendig) \
 		newflags |= X86_EFLAGS_TF
-#define VM86_FAULT_RETURN do { \
-	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) \
-		return_to_32bit(regs, VM86_PICRETURN); \
-	if (orig_flags & X86_EFLAGS_TF) \
-		handle_vm86_trap(regs, 0, 1); \
-	return; } while (0)
 
 	orig_flags = *(unsigned short *)&regs->pt.flags;
 
@@ -632,7 +595,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 			SP(regs) -= 2;
 		}
 		IP(regs) = ip;
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;
 
 	/* popf */
 	case 0x9d:
@@ -652,7 +615,7 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		else
 			set_vflags_short(newflags, regs);
 
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}
 
 	/* int xx */
@@ -660,8 +623,10 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		int intno = popb(csp, ip, simulate_sigsegv);
 		IP(regs) = ip;
 		if (vmpi->vm86dbg_active) {
-			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3])
-				return_to_32bit(regs, VM86_INTx + (intno << 8));
+			if ((1 << (intno & 7)) & vmpi->vm86dbg_intxxtab[intno >> 3]) {
+				save_v86_state(regs, VM86_INTx + (intno << 8));
+				return;
+			}
 		}
 		do_int(regs, intno, ssp, sp);
 		return;
@@ -692,14 +657,14 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 		} else {
 			set_vflags_short(newflags, regs);
 		}
-		VM86_FAULT_RETURN;
+		goto check_vip;
 		}
 
 	/* cli */
 	case 0xfa:
 		IP(regs) = ip;
 		clear_IF(regs);
-		VM86_FAULT_RETURN;
+		goto vm86_fault_return;
 
 	/* sti */
 	/*
@@ -711,12 +676,27 @@ void handle_vm86_fault(struct kernel_vm86_regs *regs, long error_code)
 	case 0xfb:
 		IP(regs) = ip;
 		set_IF(regs);
-		VM86_FAULT_RETURN;
+		goto check_vip;
 
 	default:
-		return_to_32bit(regs, VM86_UNKNOWN);
+		save_v86_state(regs, VM86_UNKNOWN);
+	}
+
+	return;
+
+check_vip:
+	if (VEFLAGS & X86_EFLAGS_VIP) {
+		save_v86_state(regs, VM86_STI);
+		return;
 	}
 
+vm86_fault_return:
+	if (vmpi->force_return_for_pic  && (VEFLAGS & (X86_EFLAGS_IF | X86_EFLAGS_VIF))) {
+		save_v86_state(regs, VM86_PICRETURN);
+		return;
+	}
+	if (orig_flags & X86_EFLAGS_TF)
+		handle_vm86_trap(regs, 0, X86_TRAP_DB);
 	return;
 
 simulate_sigsegv:
@@ -730,7 +710,7 @@ simulate_sigsegv:
 	 *        should be a mixture of the two, but how do we
 	 *        get the information? [KD]
 	 */
-	return_to_32bit(regs, VM86_UNKNOWN);
+	save_v86_state(regs, VM86_UNKNOWN);
 }
 
 /* ---------------- vm86 special IRQ passing stuff ----------------- */
-- 
cgit v0.10.2


From af3e565a8542c4be699a0403b88fd6c691f5914f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 31 Jul 2015 10:59:20 +0200
Subject: x86/vm86: Move the vm86 IRQ definitions to vm86.h

Move vm86 specific definitions from irq_vectors.h to vm86.h.

Based on patch from Brian Gerst.

Originally-from: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-6-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 4c2d2eb..6ca9fd6 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -117,16 +117,6 @@
 
 #define FPU_IRQ				  13
 
-#define	FIRST_VM86_IRQ			   3
-#define LAST_VM86_IRQ			  15
-
-#ifndef __ASSEMBLY__
-static inline int invalid_vm86_irq(int irq)
-{
-	return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
-}
-#endif
-
 /*
  * Size the maximum number of interrupts.
  *
diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index e45386e..b063196 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -49,7 +49,6 @@ int handle_vm86_trap(struct kernel_vm86_regs *, long, int);
 void save_v86_state(struct kernel_vm86_regs *, int);
 
 struct task_struct;
-void release_vm86_irqs(struct task_struct *);
 
 #define free_vm86(t) do {				\
 	struct thread_struct *__t = (t);		\
@@ -59,6 +58,20 @@ void release_vm86_irqs(struct task_struct *);
 	}						\
 } while (0)
 
+/*
+ * Support for VM86 programs to request interrupts for
+ * real mode hardware drivers:
+ */
+#define FIRST_VM86_IRQ		 3
+#define LAST_VM86_IRQ		15
+
+static inline int invalid_vm86_irq(int irq)
+{
+	return irq < FIRST_VM86_IRQ || irq > LAST_VM86_IRQ;
+}
+
+void release_vm86_irqs(struct task_struct *);
+
 #else
 
 #define handle_vm86_fault(a, b)
-- 
cgit v0.10.2


From ba3e127ec105e790eeec4034d9769e018e4a1b54 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:21 -0400
Subject: x86/vm86: Clean up vm86.h includes

vm86.h was being implicitly included in alot of places via
processor.h, which in turn got it from math_emu.h.  Break that
chain and explicitly include vm86.h in all files that need it.
Also remove unused vm86 field from math_emu_info.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-7-git-send-email-brgerst@gmail.com
[ Fixed build failure. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/math_emu.h b/arch/x86/include/asm/math_emu.h
index 031f626..0d9b14f 100644
--- a/arch/x86/include/asm/math_emu.h
+++ b/arch/x86/include/asm/math_emu.h
@@ -2,7 +2,6 @@
 #define _ASM_X86_MATH_EMU_H
 
 #include <asm/ptrace.h>
-#include <asm/vm86.h>
 
 /* This structure matches the layout of the data saved to the stack
    following a device-not-present interrupt, part of it saved
@@ -10,9 +9,6 @@
    */
 struct math_emu_info {
 	long ___orig_eip;
-	union {
-		struct pt_regs *regs;
-		struct kernel_vm86_regs *vm86;
-	};
+	struct pt_regs *regs;
 };
 #endif /* _ASM_X86_MATH_EMU_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 592a6a6..91dfcaf 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -37,6 +37,7 @@ asmlinkage long sys_get_thread_area(struct user_desc __user *);
 asmlinkage unsigned long sys_sigreturn(void);
 
 /* kernel/vm86_32.c */
+struct vm86_struct;
 asmlinkage long sys_vm86old(struct vm86_struct __user *);
 asmlinkage long sys_vm86(unsigned long, unsigned long);
 
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f73c962..c13df2c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -53,6 +53,7 @@
 #include <asm/syscalls.h>
 #include <asm/debugreg.h>
 #include <asm/switch_to.h>
+#include <asm/vm86.h>
 
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 asmlinkage void ret_from_kernel_thread(void) __asm__("ret_from_kernel_thread");
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index bfd736e..07eb844 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -31,6 +31,7 @@
 #include <asm/vdso.h>
 #include <asm/mce.h>
 #include <asm/sighandling.h>
+#include <asm/vm86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/proto.h>
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 8e65d8a..86a82ea 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -62,6 +62,7 @@
 #include <asm/fpu/xstate.h>
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
+#include <asm/vm86.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index ffe98ec..0de1f66 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -51,6 +51,7 @@
 #include <asm/tlbflush.h>
 #include <asm/irq.h>
 #include <asm/traps.h>
+#include <asm/vm86.h>
 
 /*
  * Known problems:
diff --git a/arch/x86/math-emu/get_address.c b/arch/x86/math-emu/get_address.c
index 6ef5e99..a2eefb1 100644
--- a/arch/x86/math-emu/get_address.c
+++ b/arch/x86/math-emu/get_address.c
@@ -21,6 +21,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/desc.h>
+#include <asm/vm86.h>
 
 #include "fpu_system.h"
 #include "exception.h"
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 34a368d..eef44d9 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -20,6 +20,7 @@
 #include <asm/kmemcheck.h>		/* kmemcheck_*(), ...		*/
 #include <asm/fixmap.h>			/* VSYSCALL_ADDR		*/
 #include <asm/vsyscall.h>		/* emulate_vsyscall		*/
+#include <asm/vm86.h>			/* struct vm86			*/
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/exceptions.h>
diff --git a/drivers/scsi/dpt_i2o.c b/drivers/scsi/dpt_i2o.c
index f35ed53..d4cda5e 100644
--- a/drivers/scsi/dpt_i2o.c
+++ b/drivers/scsi/dpt_i2o.c
@@ -1924,6 +1924,9 @@ static void adpt_alpha_info(sysInfo_S* si)
 #endif
 
 #if defined __i386__
+
+#include <uapi/asm/vm86.h>
+
 static void adpt_i386_info(sysInfo_S* si)
 {
 	// This is all the info we need for now
-- 
cgit v0.10.2


From 1342635638cba9b7c8eac776da5e54390d14d313 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:22 -0400
Subject: x86/vm86: Rename vm86->vm86_info to user_vm86

Make it clearer that this is the pointer to the userspace vm86
state area.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-8-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index b063196..c93ae73 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -28,7 +28,7 @@ struct kernel_vm86_regs {
 };
 
 struct vm86 {
-	struct vm86plus_struct __user *vm86_info;
+	struct vm86plus_struct __user *user_vm86;
 	struct pt_regs regs32;
 	unsigned long v86flags;
 	unsigned long v86mask;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 0de1f66..52aa33e 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -104,17 +104,17 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 	 */
 	local_irq_enable();
 
-	if (!vm86 || !vm86->vm86_info) {
-		pr_alert("no vm86_info: BAD\n");
+	if (!vm86 || !vm86->user_vm86) {
+		pr_alert("no user_vm86: BAD\n");
 		do_exit(SIGSEGV);
 	}
 	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask);
-	user = vm86->vm86_info;
+	user = vm86->user_vm86;
 
 	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
 		       sizeof(struct vm86plus_struct) :
 		       sizeof(struct vm86_struct))) {
-		pr_alert("could not access userspace vm86_info\n");
+		pr_alert("could not access userspace vm86 info\n");
 		do_exit(SIGSEGV);
 	}
 
@@ -139,7 +139,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 		put_user_ex(vm86->screen_bitmap, &user->screen_bitmap);
 	} put_user_catch(err);
 	if (err) {
-		pr_alert("could not access userspace vm86_info\n");
+		pr_alert("could not access userspace vm86 info\n");
 		do_exit(SIGSEGV);
 	}
 
@@ -192,11 +192,11 @@ out:
 
 
 static int do_vm86_irq_handling(int subfunction, int irqnumber);
-static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus);
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus);
 
-SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, v86)
+SYSCALL_DEFINE1(vm86old, struct vm86_struct __user *, user_vm86)
 {
-	return do_sys_vm86((struct vm86plus_struct __user *) v86, false);
+	return do_sys_vm86((struct vm86plus_struct __user *) user_vm86, false);
 }
 
 
@@ -223,7 +223,7 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 }
 
 
-static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
+static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 {
 	struct tss_struct *tss;
 	struct task_struct *tsk = current;
@@ -240,7 +240,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	if (vm86->saved_sp0)
 		return -EPERM;
 
-	if (!access_ok(VERIFY_READ, v86, plus ?
+	if (!access_ok(VERIFY_READ, user_vm86, plus ?
 		       sizeof(struct vm86_struct) :
 		       sizeof(struct vm86plus_struct)))
 		return -EFAULT;
@@ -248,40 +248,42 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 	memset(&vm86regs, 0, sizeof(vm86regs));
 	get_user_try {
 		unsigned short seg;
-		get_user_ex(vm86regs.pt.bx, &v86->regs.ebx);
-		get_user_ex(vm86regs.pt.cx, &v86->regs.ecx);
-		get_user_ex(vm86regs.pt.dx, &v86->regs.edx);
-		get_user_ex(vm86regs.pt.si, &v86->regs.esi);
-		get_user_ex(vm86regs.pt.di, &v86->regs.edi);
-		get_user_ex(vm86regs.pt.bp, &v86->regs.ebp);
-		get_user_ex(vm86regs.pt.ax, &v86->regs.eax);
-		get_user_ex(vm86regs.pt.ip, &v86->regs.eip);
-		get_user_ex(seg, &v86->regs.cs);
+		get_user_ex(vm86regs.pt.bx, &user_vm86->regs.ebx);
+		get_user_ex(vm86regs.pt.cx, &user_vm86->regs.ecx);
+		get_user_ex(vm86regs.pt.dx, &user_vm86->regs.edx);
+		get_user_ex(vm86regs.pt.si, &user_vm86->regs.esi);
+		get_user_ex(vm86regs.pt.di, &user_vm86->regs.edi);
+		get_user_ex(vm86regs.pt.bp, &user_vm86->regs.ebp);
+		get_user_ex(vm86regs.pt.ax, &user_vm86->regs.eax);
+		get_user_ex(vm86regs.pt.ip, &user_vm86->regs.eip);
+		get_user_ex(seg, &user_vm86->regs.cs);
 		vm86regs.pt.cs = seg;
-		get_user_ex(vm86regs.pt.flags, &v86->regs.eflags);
-		get_user_ex(vm86regs.pt.sp, &v86->regs.esp);
-		get_user_ex(seg, &v86->regs.ss);
+		get_user_ex(vm86regs.pt.flags, &user_vm86->regs.eflags);
+		get_user_ex(vm86regs.pt.sp, &user_vm86->regs.esp);
+		get_user_ex(seg, &user_vm86->regs.ss);
 		vm86regs.pt.ss = seg;
-		get_user_ex(vm86regs.es, &v86->regs.es);
-		get_user_ex(vm86regs.ds, &v86->regs.ds);
-		get_user_ex(vm86regs.fs, &v86->regs.fs);
-		get_user_ex(vm86regs.gs, &v86->regs.gs);
-
-		get_user_ex(vm86->flags, &v86->flags);
-		get_user_ex(vm86->screen_bitmap, &v86->screen_bitmap);
-		get_user_ex(vm86->cpu_type, &v86->cpu_type);
+		get_user_ex(vm86regs.es, &user_vm86->regs.es);
+		get_user_ex(vm86regs.ds, &user_vm86->regs.ds);
+		get_user_ex(vm86regs.fs, &user_vm86->regs.fs);
+		get_user_ex(vm86regs.gs, &user_vm86->regs.gs);
+
+		get_user_ex(vm86->flags, &user_vm86->flags);
+		get_user_ex(vm86->screen_bitmap, &user_vm86->screen_bitmap);
+		get_user_ex(vm86->cpu_type, &user_vm86->cpu_type);
 	} get_user_catch(err);
 	if (err)
 		return err;
 
-	if (copy_from_user(&vm86->int_revectored, &v86->int_revectored,
+	if (copy_from_user(&vm86->int_revectored,
+			   &user_vm86->int_revectored,
 			   sizeof(struct revectored_struct)))
 		return -EFAULT;
-	if (copy_from_user(&vm86->int21_revectored, &v86->int21_revectored,
+	if (copy_from_user(&vm86->int21_revectored,
+			   &user_vm86->int21_revectored,
 			   sizeof(struct revectored_struct)))
 		return -EFAULT;
 	if (plus) {
-		if (copy_from_user(&vm86->vm86plus, &v86->vm86plus,
+		if (copy_from_user(&vm86->vm86plus, &user_vm86->vm86plus,
 				   sizeof(struct vm86plus_info_struct)))
 			return -EFAULT;
 		vm86->vm86plus.is_vm86pus = 1;
@@ -290,7 +292,7 @@ static long do_sys_vm86(struct vm86plus_struct __user *v86, bool plus)
 		       sizeof(struct vm86plus_info_struct));
 
 	memcpy(&vm86->regs32, regs, sizeof(struct pt_regs));
-	vm86->vm86_info = v86;
+	vm86->user_vm86 = user_vm86;
 
 /*
  * The flags register is also special: we cannot trust that the user
-- 
cgit v0.10.2


From decd275e62d5eef4b947fab89652fa6afdadf2f2 Mon Sep 17 00:00:00 2001
From: Brian Gerst <brgerst@gmail.com>
Date: Wed, 29 Jul 2015 01:41:23 -0400
Subject: x86/vm86: Rename vm86->v86flags and v86mask

Rename v86flags to veflags, and v86mask to veflags_mask.

Signed-off-by: Brian Gerst <brgerst@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1438148483-11932-9-git-send-email-brgerst@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/vm86.h b/arch/x86/include/asm/vm86.h
index c93ae73..1e491f3 100644
--- a/arch/x86/include/asm/vm86.h
+++ b/arch/x86/include/asm/vm86.h
@@ -30,8 +30,8 @@ struct kernel_vm86_regs {
 struct vm86 {
 	struct vm86plus_struct __user *user_vm86;
 	struct pt_regs regs32;
-	unsigned long v86flags;
-	unsigned long v86mask;
+	unsigned long veflags;
+	unsigned long veflags_mask;
 	unsigned long saved_sp0;
 
 	unsigned long flags;
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 52aa33e..abd8b856 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -80,8 +80,8 @@
 /*
  * virtual flags (16 and 32-bit versions)
  */
-#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->v86flags))
-#define VEFLAGS	(current->thread.vm86->v86flags)
+#define VFLAGS	(*(unsigned short *)&(current->thread.vm86->veflags))
+#define VEFLAGS	(current->thread.vm86->veflags)
 
 #define set_flags(X, new, mask) \
 ((X) = ((X) & ~(mask)) | ((new) & (mask)))
@@ -108,7 +108,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 		pr_alert("no user_vm86: BAD\n");
 		do_exit(SIGSEGV);
 	}
-	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->v86mask);
+	set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask);
 	user = vm86->user_vm86;
 
 	if (!access_ok(VERIFY_WRITE, user, vm86->vm86plus.is_vm86pus ?
@@ -308,16 +308,16 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 
 	switch (vm86->cpu_type) {
 	case CPU_286:
-		vm86->v86mask = 0;
+		vm86->veflags_mask = 0;
 		break;
 	case CPU_386:
-		vm86->v86mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	case CPU_486:
-		vm86->v86mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	default:
-		vm86->v86mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
+		vm86->veflags_mask = X86_EFLAGS_ID | X86_EFLAGS_AC | X86_EFLAGS_NT | X86_EFLAGS_IOPL;
 		break;
 	}
 
@@ -377,7 +377,7 @@ static inline void clear_AC(struct kernel_vm86_regs *regs)
 
 static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VEFLAGS, flags, current->thread.vm86->v86mask);
+	set_flags(VEFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -387,7 +387,7 @@ static inline void set_vflags_long(unsigned long flags, struct kernel_vm86_regs
 
 static inline void set_vflags_short(unsigned short flags, struct kernel_vm86_regs *regs)
 {
-	set_flags(VFLAGS, flags, current->thread.vm86->v86mask);
+	set_flags(VFLAGS, flags, current->thread.vm86->veflags_mask);
 	set_flags(regs->pt.flags, flags, SAFE_MASK);
 	if (flags & X86_EFLAGS_IF)
 		set_IF(regs);
@@ -402,7 +402,7 @@ static inline unsigned long get_vflags(struct kernel_vm86_regs *regs)
 	if (VEFLAGS & X86_EFLAGS_VIF)
 		flags |= X86_EFLAGS_IF;
 	flags |= X86_EFLAGS_IOPL;
-	return flags | (VEFLAGS & current->thread.vm86->v86mask);
+	return flags | (VEFLAGS & current->thread.vm86->veflags_mask);
 }
 
 static inline int is_revectored(int nr, struct revectored_struct *bitmap)
-- 
cgit v0.10.2


From c5f69fde26d1581ee495f68bb9de4049c8168a04 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 31 Jul 2015 14:41:08 -0700
Subject: x86/entry/32: Remove 32-bit syscall audit optimizations

The asm audit optimizations are ugly and obfuscate the code too
much. Remove them.

This will regress performance if syscall auditing is enabled on
32-bit kernels and SYSENTER is in use. If this becomes a
problem, interested parties are encouraged to implement the
equivalent of the 64-bit opportunistic SYSRET optimization.

Alternatively, a case could be made that, on 32-bit kernels, a
less messy asm audit optimization could be done. 32-bit kernels
don't have the complicated partial register saving tricks that
64-bit kernels have, so the SYSENTER post-syscall path could
just call the audit hooks directly.  Any reimplementation of
this ought to demonstrate that it only calls the audit hook once
per syscall, though, which does not currently appear to be true.

Someone would have to make the case that doing so would be
better than implementing opportunistic SYSEXIT, though.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eric Paris <eparis@parisplace.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/212be39dd8c90b44c4b7bbc678128d6b88bdb9912.1438378274.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index f940e24..a3c307a 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -45,16 +45,6 @@
 #include <asm/asm.h>
 #include <asm/smap.h>
 
-/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this.  */
-#include <linux/elf-em.h>
-#define AUDIT_ARCH_I386		(EM_386|__AUDIT_ARCH_LE)
-#define __AUDIT_ARCH_LE		0x40000000
-
-#ifndef CONFIG_AUDITSYSCALL
-# define sysenter_audit		syscall_trace_entry
-# define sysexit_audit		syscall_exit_work
-#endif
-
 	.section .entry.text, "ax"
 
 /*
@@ -339,7 +329,7 @@ sysenter_past_esp:
 	GET_THREAD_INFO(%ebp)
 
 	testl	$_TIF_WORK_SYSCALL_ENTRY, TI_flags(%ebp)
-	jnz	sysenter_audit
+	jnz	syscall_trace_entry
 sysenter_do_call:
 	cmpl	$(NR_syscalls), %eax
 	jae	sysenter_badsys
@@ -351,7 +341,7 @@ sysenter_after_call:
 	TRACE_IRQS_OFF
 	movl	TI_flags(%ebp), %ecx
 	testl	$_TIF_ALLWORK_MASK, %ecx
-	jnz	sysexit_audit
+	jnz	syscall_exit_work
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
 	movl	PT_EIP(%esp), %edx
@@ -362,40 +352,6 @@ sysenter_exit:
 	PTGS_TO_GS
 	ENABLE_INTERRUPTS_SYSEXIT
 
-#ifdef CONFIG_AUDITSYSCALL
-sysenter_audit:
-	testl	$(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT), TI_flags(%ebp)
-	jnz	syscall_trace_entry
-	/* movl	PT_EAX(%esp), %eax already set, syscall number: 1st arg to audit */
-	movl	PT_EBX(%esp), %edx		/* ebx/a0: 2nd arg to audit */
-	/* movl	PT_ECX(%esp), %ecx already set, a1: 3nd arg to audit */
-	pushl	PT_ESI(%esp)			/* a3: 5th arg */
-	pushl	PT_EDX+4(%esp)			/* a2: 4th arg */
-	call	__audit_syscall_entry
-	popl	%ecx				/* get that remapped edx off the stack */
-	popl	%ecx				/* get that remapped esi off the stack */
-	movl	PT_EAX(%esp), %eax		/* reload syscall number */
-	jmp	sysenter_do_call
-
-sysexit_audit:
-	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jnz	syscall_exit_work
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)
-	movl	%eax, %edx			/* second arg, syscall return value */
-	cmpl	$-MAX_ERRNO, %eax		/* is it an error ? */
-	setbe %al				/* 1 if so, 0 if not */
-	movzbl %al, %eax			/* zero-extend that */
-	call	__audit_syscall_exit
-	DISABLE_INTERRUPTS(CLBR_ANY)
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	testl	$(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
-	jnz	syscall_exit_work
-	movl	PT_EAX(%esp), %eax		/* reload syscall return value */
-	jmp	sysenter_exit
-#endif
-
 .pushsection .fixup, "ax"
 2:	movl	$0, PT_FS(%esp)
 	jmp	1b
-- 
cgit v0.10.2


From 5d73fc70996d9de0d1b2fc87e62dc51153204eba Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 31 Jul 2015 14:41:09 -0700
Subject: x86/entry/32: Migrate to C exit path

This removes the hybrid asm-and-C implementation of exit work.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eric Paris <eparis@parisplace.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/2baa438619ea6c027b40ec9fceacca52f09c74d09.1438378274.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a3c307a..b2909bf 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -256,14 +256,10 @@ ret_from_intr:
 
 ENTRY(resume_userspace)
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
+	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done on
-						# int/exception return?
-	jne	work_pending
+	movl	%esp, %eax
+	call	prepare_exit_to_usermode
 	jmp	restore_all
 END(ret_from_exception)
 
@@ -341,7 +337,7 @@ sysenter_after_call:
 	TRACE_IRQS_OFF
 	movl	TI_flags(%ebp), %ecx
 	testl	$_TIF_ALLWORK_MASK, %ecx
-	jnz	syscall_exit_work
+	jnz	syscall_exit_work_irqs_off
 sysenter_exit:
 /* if something modifies registers it must also disable sysexit */
 	movl	PT_EIP(%esp), %edx
@@ -377,13 +373,7 @@ syscall_after_call:
 	movl	%eax, PT_EAX(%esp)		# store the return value
 syscall_exit:
 	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	testl	$_TIF_ALLWORK_MASK, %ecx	# current->work
-	jnz	syscall_exit_work
+	jmp	syscall_exit_work
 
 restore_all:
 	TRACE_IRQS_IRET
@@ -460,35 +450,6 @@ ldt_ss:
 #endif
 ENDPROC(entry_INT80_32)
 
-	# perform work that needs to be done immediately before resumption
-	ALIGN
-work_pending:
-	testb	$_TIF_NEED_RESCHED, %cl
-	jz	work_notifysig
-work_resched:
-	call	schedule
-	LOCKDEP_SYS_EXIT
-	DISABLE_INTERRUPTS(CLBR_ANY)		# make sure we don't miss an interrupt
-						# setting need_resched or sigpending
-						# between sampling and the iret
-	TRACE_IRQS_OFF
-	movl	TI_flags(%ebp), %ecx
-	andl	$_TIF_WORK_MASK, %ecx		# is there any work to be done other
-						# than syscall tracing?
-	jz	restore_all
-	testb	$_TIF_NEED_RESCHED, %cl
-	jnz	work_resched
-
-work_notifysig:					# deal with pending signals and
-						# notify-resume requests
-	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_NONE)
-	movl	%esp, %eax
-	xorl	%edx, %edx
-	call	do_notify_resume
-	jmp	resume_userspace
-END(work_pending)
-
 	# perform syscall exit tracing
 	ALIGN
 syscall_trace_entry:
@@ -503,15 +464,14 @@ END(syscall_trace_entry)
 
 	# perform syscall exit tracing
 	ALIGN
-syscall_exit_work:
-	testl	$_TIF_WORK_SYSCALL_EXIT, %ecx
-	jz	work_pending
+syscall_exit_work_irqs_off:
 	TRACE_IRQS_ON
-	ENABLE_INTERRUPTS(CLBR_ANY)		# could let syscall_trace_leave() call
-						# schedule() instead
+	ENABLE_INTERRUPTS(CLBR_ANY)
+
+syscall_exit_work:
 	movl	%esp, %eax
-	call	syscall_trace_leave
-	jmp	resume_userspace
+	call	syscall_return_slowpath
+	jmp	restore_all
 END(syscall_exit_work)
 
 syscall_fault:
-- 
cgit v0.10.2


From 88cd622f9299c4c9e61e978bb9ef9d7599769ed0 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Fri, 31 Jul 2015 14:41:10 -0700
Subject: x86/entry: Remove do_notify_resume(), syscall_trace_leave(), and
 their TIF masks

They are no longer used. Good riddance!

Deleting the TIF_ macros is really nice. It was never clear why
there were so many variants.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Eric Paris <eparis@parisplace.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/22c61682f446628573dde0f1d573ab821677e06da.1438378274.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index a3e9c7f..80dcc92 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -207,37 +207,6 @@ long syscall_trace_enter(struct pt_regs *regs)
 		return syscall_trace_enter_phase2(regs, arch, phase1_result);
 }
 
-/* Deprecated. */
-void syscall_trace_leave(struct pt_regs *regs)
-{
-	bool step;
-
-	/*
-	 * We may come here right after calling schedule_user()
-	 * or do_notify_resume(), in which case we can be in RCU
-	 * user mode.
-	 */
-	user_exit();
-
-	audit_syscall_exit(regs);
-
-	if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
-		trace_sys_exit(regs, regs->ax);
-
-	/*
-	 * If TIF_SYSCALL_EMU is set, we only get here because of
-	 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
-	 * We already reported this syscall instruction in
-	 * syscall_trace_enter().
-	 */
-	step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
-			!test_thread_flag(TIF_SYSCALL_EMU);
-	if (step || test_thread_flag(TIF_SYSCALL_TRACE))
-		tracehook_report_syscall_exit(regs, step);
-
-	user_enter();
-}
-
 static struct thread_info *pt_regs_to_thread_info(struct pt_regs *regs)
 {
 	unsigned long top_of_stack =
@@ -347,29 +316,3 @@ __visible void syscall_return_slowpath(struct pt_regs *regs)
 	local_irq_disable();
 	prepare_exit_to_usermode(regs);
 }
-
-/*
- * Deprecated notification of userspace execution resumption
- * - triggered by the TIF_WORK_MASK flags
- */
-__visible void
-do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
-{
-	user_exit();
-
-	if (thread_info_flags & _TIF_UPROBE)
-		uprobe_notify_resume(regs);
-
-	/* deal with pending signal delivery */
-	if (thread_info_flags & _TIF_SIGPENDING)
-		do_signal(regs);
-
-	if (thread_info_flags & _TIF_NOTIFY_RESUME) {
-		clear_thread_flag(TIF_NOTIFY_RESUME);
-		tracehook_notify_resume(regs);
-	}
-	if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
-		fire_user_return_notifiers();
-
-	user_enter();
-}
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 5fabf13..6271281 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -88,7 +88,6 @@ extern long syscall_trace_enter_phase2(struct pt_regs *, u32 arch,
 				       unsigned long phase1_result);
 
 extern long syscall_trace_enter(struct pt_regs *);
-extern void syscall_trace_leave(struct pt_regs *);
 
 static inline unsigned long regs_return_value(struct pt_regs *regs)
 {
diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h
index b42408b..c481be7 100644
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -31,7 +31,6 @@ typedef sigset_t compat_sigset_t;
 #include <uapi/asm/signal.h>
 #ifndef __ASSEMBLY__
 extern void do_signal(struct pt_regs *regs);
-extern void do_notify_resume(struct pt_regs *, void *, __u32);
 
 #define __ARCH_HAS_SA_RESTORER
 
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index fdad5c2..8afdc3e 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -143,27 +143,11 @@ struct thread_info {
 	 _TIF_SECCOMP | _TIF_SINGLESTEP | _TIF_SYSCALL_TRACEPOINT |	\
 	 _TIF_NOHZ)
 
-/* work to do in syscall_trace_leave() */
-#define _TIF_WORK_SYSCALL_EXIT	\
-	(_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP |	\
-	 _TIF_SYSCALL_TRACEPOINT | _TIF_NOHZ)
-
-/* work to do on interrupt/exception return */
-#define _TIF_WORK_MASK							\
-	(0x0000FFFF &							\
-	 ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|			\
-	   _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
-
 /* work to do on any return to user space */
 #define _TIF_ALLWORK_MASK						\
 	((0x0000FFFF & ~_TIF_SECCOMP) | _TIF_SYSCALL_TRACEPOINT |	\
 	_TIF_NOHZ)
 
-/* Only used for 64 bit */
-#define _TIF_DO_NOTIFY_MASK						\
-	(_TIF_SIGPENDING | _TIF_NOTIFY_RESUME |				\
-	 _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE)
-
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW							\
 	(_TIF_IO_BITMAP|_TIF_NOTSC|_TIF_BLOCKSTEP)
-- 
cgit v0.10.2


From 6b7e26547fad7ace3dcb27a5babd2317fb9d1e12 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@amacapital.net>
Date: Thu, 6 Aug 2015 14:45:45 -0700
Subject: x86/vdso: Emit a GNU hash

Some dynamic loaders may be slightly faster if a GNU hash is
available.  Strangely, this seems to have no effect at all on
the vdso size.

This is unlikely to have any measurable effect on the time it
takes to resolve vdso symbols (since there are so few of them).
In some contexts, it can be a win for a different reason: if
every DSO has a GNU hash section, then libc can avoid
calculating SysV hashes at all.  Both musl and glibc appear to
have this optimization.

It's plausible that this breaks some ancient glibc version.  If
so, then, depending on what glibc versions break, we could
either require COMPAT_VDSO for them or consider reverting.

Signed-off-by: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Isaac Dunham <ibid.ag@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nathan Lynch <nathan_lynch@mentor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rich Felker <dalias@libc.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: musl@lists.openwall.com <musl@lists.openwall.com>
Link: http://lkml.kernel.org/r/fd56cc057a2d62ab31c56a48d04fccb435b3fd4f.1438897382.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile
index 96c0617..a3d0767 100644
--- a/arch/x86/entry/vdso/Makefile
+++ b/arch/x86/entry/vdso/Makefile
@@ -175,7 +175,7 @@ quiet_cmd_vdso = VDSO    $@
 		       -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \
 		 sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@'
 
-VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \
+VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=both) \
 	$(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS)
 GCOV_PROFILE := n
 
-- 
cgit v0.10.2


From 33f3df41d03879ab86c7f2d650e67b655e0b85c8 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 17 Aug 2015 12:22:51 -0700
Subject: selftests/x86: Disable sigreturn_64

sigreturn_64 was broken by ed596cde9425 ("Revert x86 sigcontext
cleanups").  Turn it off until we have a better fix.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/a184e75ff170a0bcd76bf376c41cad2c402fe9f7.1439838962.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index b70da4a..986e7cb 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,8 +4,8 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := sigreturn single_step_syscall sysret_ss_attrs ldt_gdt
-TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt
+TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
-- 
cgit v0.10.2


From a9c909ce8c7853b4fc16055c50eb50d91e20cb93 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 17 Aug 2015 12:22:52 -0700
Subject: selftests/x86: Add syscall_nt selftest

I've had this sitting around for a while.  Add it to the
selftests tree.  Far Cry running under Wine depends on this
behavior.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/ee4d63799a9e5294b70930618b71d04d2770eb2d.1439838962.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 986e7cb..29089b2 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,7 +4,7 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
diff --git a/tools/testing/selftests/x86/syscall_nt.c b/tools/testing/selftests/x86/syscall_nt.c
new file mode 100644
index 0000000..60c06af4
--- /dev/null
+++ b/tools/testing/selftests/x86/syscall_nt.c
@@ -0,0 +1,54 @@
+/*
+ * syscall_nt.c - checks syscalls with NT set
+ * Copyright (c) 2014-2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * Some obscure user-space code requires the ability to make system calls
+ * with FLAGS.NT set.  Make sure it works.
+ */
+
+#include <stdio.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#include <asm/processor-flags.h>
+
+#ifdef __x86_64__
+# define WIDTH "q"
+#else
+# define WIDTH "l"
+#endif
+
+static unsigned long get_eflags(void)
+{
+	unsigned long eflags;
+	asm volatile ("pushf" WIDTH "\n\tpop" WIDTH " %0" : "=rm" (eflags));
+	return eflags;
+}
+
+static void set_eflags(unsigned long eflags)
+{
+	asm volatile ("push" WIDTH " %0\n\tpopf" WIDTH
+		      : : "rm" (eflags) : "flags");
+}
+
+int main()
+{
+	printf("[RUN]\tSet NT and issue a syscall\n");
+	set_eflags(get_eflags() | X86_EFLAGS_NT);
+	syscall(SYS_getpid);
+	if (get_eflags() & X86_EFLAGS_NT) {
+		printf("[OK]\tThe syscall worked and NT is still set\n");
+		return 0;
+	} else {
+		printf("[FAIL]\tThe syscall worked but NT was cleared\n");
+		return 1;
+	}
+}
-- 
cgit v0.10.2


From 99770737ca7e3ebc14e66460a69b7032de9421e1 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@kernel.org>
Date: Fri, 21 Aug 2015 08:33:53 +0200
Subject: x86/asm/tsc: Add rdtscll() merge helper

Some in-flight code makes use of the old rdtscll() (now removed), provide a wrapper
for a kernel cycle to smooth the transition to rdtsc().

( We use the safest variant, rdtsc_ordered(), which has barriers - this adds another
  incentive to remove the wrapper in the future. )

Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Rui <ray.huang@amd.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: kvm ML <kvm@vger.kernel.org>
Link: http://lkml.kernel.org/r/dddbf98a2af53312e9aa73a5a2b1622fe5d6f52b.1434501121.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 131eec2..54e9f08 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -152,6 +152,9 @@ static __always_inline unsigned long long rdtsc_ordered(void)
 	return rdtsc();
 }
 
+/* Deprecated, keep it for a cycle for easier merging: */
+#define rdtscll(now)	do { (now) = rdtsc_ordered(); } while (0)
+
 static inline unsigned long long native_read_pmc(int counter)
 {
 	DECLARE_ARGS(val, low, high);
-- 
cgit v0.10.2


From f0a97af83f6287357dcc100c859ec0066f164f32 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 20 Aug 2015 22:03:21 -0700
Subject: x86/traps: Weaken context tracking entry assertions

We were asserting that we were all the way in CONTEXT_KERNEL
when exception handlers were called.  While having this be true
is, I think, a nice goal (or maybe a variant in which we assert
that we're in CONTEXT_KERNEL or some new IRQ context), we're not
quite there.

In particular, if an IRQ interrupts the SYSCALL prologue and the
IRQ handler in turn causes an exception, the exception entry
will be called in RCU IRQ mode but with CONTEXT_USER.

This is okay (nothing goes wrong), but until we fix up the
SYSCALL prologue, we need to avoid warning.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/c81faf3916346c0e04346c441392974f49cd7184.1440133286.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 86a82ea..45e8d98 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
 void ist_enter(struct pt_regs *regs)
 {
 	if (user_mode(regs)) {
-		CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+		rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	} else {
 		/*
 		 * We might have interrupted pretty much anything.  In
@@ -282,7 +282,7 @@ static void do_error_trap(struct pt_regs *regs, long error_code, char *str,
 {
 	siginfo_t info;
 
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 
 	if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, signr) !=
 			NOTIFY_STOP) {
@@ -364,7 +364,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
 	const struct bndcsr *bndcsr;
 	siginfo_t *info;
 
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	if (notify_die(DIE_TRAP, "bounds", regs, error_code,
 			X86_TRAP_BR, SIGSEGV) == NOTIFY_STOP)
 		return;
@@ -442,7 +442,7 @@ do_general_protection(struct pt_regs *regs, long error_code)
 {
 	struct task_struct *tsk;
 
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	conditional_sti(regs);
 
 	if (v8086_mode(regs)) {
@@ -496,7 +496,7 @@ dotraplinkage void notrace do_int3(struct pt_regs *regs, long error_code)
 		return;
 
 	ist_enter(regs);
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 #ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
 	if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, X86_TRAP_BP,
 				SIGTRAP) == NOTIFY_STOP)
@@ -729,14 +729,14 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
 
 dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_MF);
 }
 
 dotraplinkage void
 do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
 {
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	math_error(regs, error_code, X86_TRAP_XF);
 }
 
@@ -749,7 +749,7 @@ do_spurious_interrupt_bug(struct pt_regs *regs, long error_code)
 dotraplinkage void
 do_device_not_available(struct pt_regs *regs, long error_code)
 {
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	BUG_ON(use_eager_fpu());
 
 #ifdef CONFIG_MATH_EMULATION
@@ -775,7 +775,7 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
 {
 	siginfo_t info;
 
-	CT_WARN_ON(ct_state() != CONTEXT_KERNEL);
+	rcu_lockdep_assert(rcu_is_watching(), "entry code didn't wake RCU");
 	local_irq_enable();
 
 	info.si_signo = SIGILL;
-- 
cgit v0.10.2


From f96756746c7909de37db3d03ac5fd5cfb2757f38 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Mon, 10 Aug 2015 12:19:53 +0200
Subject: x86/asm: Add MONITORX/MWAITX instruction support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

AMD Carrizo processors (Family 15h, Models 60h-6fh) added a new
feature called MWAITX (MWAIT with extensions) as an extension to
MONITOR/MWAIT.

This new instruction controls a configurable timer which causes
the core to exit wait state on timer expiration, in addition to
"normal" MWAIT condition of reading from a monitored VA.

Compared to MONITOR/MWAIT, there are minor differences in opcode
and input parameters:

MWAITX ECX[1]: enable timer if set
MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks ==
TSC. The software P0 frequency is the same as the TSC frequency.

                MWAIT                           MWAITX
opcode          0f 01 c9           |            0f 01 fb
ECX[0]                  value of RFLAGS.IF seen by instruction
ECX[1]          unused/#GP if set  |            enable timer if set
ECX[31:2]                     unused/#GP if set
EAX                           unused (reserve for hint)
EBX[31:0]       unused             |            max wait time (SW P0 == TSC)

                MONITOR                         MONITORX
opcode          0f 01 c8           |            0f 01 fa
EAX                     (logical) address to monitor
ECX                     #GP if not zero

Max timeout = EBX/(TSC frequency)

Signed-off-by: Huang Rui <ray.huang@amd.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Andreas Herrmann <herrmann.der.user@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dirk Brandewie <dirk.j.brandewie@intel.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Josh Triplett <josh@joshtriplett.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <bitbucket@online.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Ross Zwisler <ross.zwisler@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Li <tony.li@amd.com>
Link: http://lkml.kernel.org/r/1439201994-28067-3-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3d6606f..a39e570 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -176,6 +176,7 @@
 #define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
 #define X86_FEATURE_BPEXT	(6*32+26) /* data breakpoint extension */
 #define X86_FEATURE_PERFCTR_L2	( 6*32+28) /* L2 performance counter extensions */
+#define X86_FEATURE_MWAITX	( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
index 653dfa7..c70689b 100644
--- a/arch/x86/include/asm/mwait.h
+++ b/arch/x86/include/asm/mwait.h
@@ -14,6 +14,9 @@
 #define CPUID5_ECX_INTERRUPT_BREAK	0x2
 
 #define MWAIT_ECX_INTERRUPT_BREAK	0x1
+#define MWAITX_ECX_TIMER_ENABLE		BIT(1)
+#define MWAITX_MAX_LOOPS		((u32)-1)
+#define MWAITX_DISABLE_CSTATES		0xf
 
 static inline void __monitor(const void *eax, unsigned long ecx,
 			     unsigned long edx)
@@ -23,6 +26,14 @@ static inline void __monitor(const void *eax, unsigned long ecx,
 		     :: "a" (eax), "c" (ecx), "d"(edx));
 }
 
+static inline void __monitorx(const void *eax, unsigned long ecx,
+			      unsigned long edx)
+{
+	/* "monitorx %eax, %ecx, %edx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xfa;"
+		     :: "a" (eax), "c" (ecx), "d"(edx));
+}
+
 static inline void __mwait(unsigned long eax, unsigned long ecx)
 {
 	/* "mwait %eax, %ecx;" */
@@ -30,6 +41,40 @@ static inline void __mwait(unsigned long eax, unsigned long ecx)
 		     :: "a" (eax), "c" (ecx));
 }
 
+/*
+ * MWAITX allows for a timer expiration to get the core out a wait state in
+ * addition to the default MWAIT exit condition of a store appearing at a
+ * monitored virtual address.
+ *
+ * Registers:
+ *
+ * MWAITX ECX[1]: enable timer if set
+ * MWAITX EBX[31:0]: max wait time expressed in SW P0 clocks. The software P0
+ * frequency is the same as the TSC frequency.
+ *
+ * Below is a comparison between MWAIT and MWAITX on AMD processors:
+ *
+ *                 MWAIT                           MWAITX
+ * opcode          0f 01 c9           |            0f 01 fb
+ * ECX[0]                  value of RFLAGS.IF seen by instruction
+ * ECX[1]          unused/#GP if set  |            enable timer if set
+ * ECX[31:2]                     unused/#GP if set
+ * EAX                           unused (reserve for hint)
+ * EBX[31:0]       unused             |            max wait time (P0 clocks)
+ *
+ *                 MONITOR                         MONITORX
+ * opcode          0f 01 c8           |            0f 01 fa
+ * EAX                     (logical) address to monitor
+ * ECX                     #GP if not zero
+ */
+static inline void __mwaitx(unsigned long eax, unsigned long ebx,
+			    unsigned long ecx)
+{
+	/* "mwaitx %eax, %ebx, %ecx;" */
+	asm volatile(".byte 0x0f, 0x01, 0xfb;"
+		     :: "a" (eax), "b" (ebx), "c" (ecx));
+}
+
 static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
 {
 	trace_hardirqs_on();
-- 
cgit v0.10.2


From b466bdb614823aaaa7188e85516177d2850f4782 Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Mon, 10 Aug 2015 12:19:54 +0200
Subject: x86/asm/delay: Introduce an MWAITX-based delay with a configurable
 timer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

MWAITX can enable a timer and a corresponding timer value
specified in SW P0 clocks. The SW P0 frequency is the same as
TSC. The timer provides an upper bound on how long the
instruction waits before exiting.

This way, a delay function in the kernel can leverage that
MWAITX timer of MWAITX.

When a CPU core executes MWAITX, it will be quiesced in a
waiting phase, diminishing its power consumption. This way, we
can save power in comparison to our default TSC-based delays.

A simple test shows that:

	$ cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc
	$ sleep 10000s
	$ cat /sys/bus/pci/devices/0000\:00\:18.4/hwmon/hwmon0/power1_acc

Results:

	* TSC-based default delay:      485115 uWatts average power
	* MWAITX-based delay:           252738 uWatts average power

Thus, that's about 240 milliWatts less power consumption. The
test method relies on the support of AMD CPU accumulated power
algorithm in fam15h_power for which patches are forthcoming.

Suggested-by: Andy Lutomirski <luto@amacapital.net>
Suggested-by: Borislav Petkov <bp@suse.de>
Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Huang Rui <ray.huang@amd.com>
[ Fix delay truncation. ]
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Aaron Lu <aaron.lu@intel.com>
Cc: Andreas Herrmann <herrmann.der.user@gmail.com>
Cc: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Jacob Shin <jacob.w.shin@gmail.com>
Cc: Jiri Olsa <jolsa@kernel.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Len Brown <lenb@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Rafael J. Wysocki <rjw@rjwysocki.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Li <tony.li@amd.com>
Link: http://lkml.kernel.org/r/1438744732-1459-3-git-send-email-ray.huang@amd.com
Link: http://lkml.kernel.org/r/1439201994-28067-4-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/delay.h b/arch/x86/include/asm/delay.h
index 9b3b4f2..36a760b 100644
--- a/arch/x86/include/asm/delay.h
+++ b/arch/x86/include/asm/delay.h
@@ -4,5 +4,6 @@
 #include <asm-generic/delay.h>
 
 void use_tsc_delay(void);
+void use_mwaitx_delay(void);
 
 #endif /* _ASM_X86_DELAY_H */
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 51ad2af..4a70fc6 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -11,6 +11,7 @@
 #include <asm/cpu.h>
 #include <asm/smp.h>
 #include <asm/pci-direct.h>
+#include <asm/delay.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/mmconfig.h>
@@ -506,6 +507,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
 		/* A random value per boot for bit slice [12:upper_bit) */
 		va_align.bits = get_random_int() & va_align.mask;
 	}
+
+	if (cpu_has(c, X86_FEATURE_MWAITX))
+		use_mwaitx_delay();
 }
 
 static void early_init_amd(struct cpuinfo_x86 *c)
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index 4453d52..e912b2f 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -20,6 +20,7 @@
 #include <asm/processor.h>
 #include <asm/delay.h>
 #include <asm/timer.h>
+#include <asm/mwait.h>
 
 #ifdef CONFIG_SMP
 # include <asm/smp.h>
@@ -84,6 +85,44 @@ static void delay_tsc(unsigned long __loops)
 }
 
 /*
+ * On some AMD platforms, MWAITX has a configurable 32-bit timer, that
+ * counts with TSC frequency. The input value is the loop of the
+ * counter, it will exit when the timer expires.
+ */
+static void delay_mwaitx(unsigned long __loops)
+{
+	u64 start, end, delay, loops = __loops;
+
+	start = rdtsc_ordered();
+
+	for (;;) {
+		delay = min_t(u64, MWAITX_MAX_LOOPS, loops);
+
+		/*
+		 * Use cpu_tss as a cacheline-aligned, seldomly
+		 * accessed per-cpu variable as the monitor target.
+		 */
+		__monitorx(this_cpu_ptr(&cpu_tss), 0, 0);
+
+		/*
+		 * AMD, like Intel, supports the EAX hint and EAX=0xf
+		 * means, do not enter any deep C-state and we use it
+		 * here in delay() to minimize wakeup latency.
+		 */
+		__mwaitx(MWAITX_DISABLE_CSTATES, delay, MWAITX_ECX_TIMER_ENABLE);
+
+		end = rdtsc_ordered();
+
+		if (loops <= end - start)
+			break;
+
+		loops -= end - start;
+
+		start = end;
+	}
+}
+
+/*
  * Since we calibrate only once at boot, this
  * function should be set once at boot and not changed
  */
@@ -91,7 +130,13 @@ static void (*delay_fn)(unsigned long) = delay_loop;
 
 void use_tsc_delay(void)
 {
-	delay_fn = delay_tsc;
+	if (delay_fn == delay_loop)
+		delay_fn = delay_tsc;
+}
+
+void use_mwaitx_delay(void)
+{
+	delay_fn = delay_mwaitx;
 }
 
 int read_current_timer(unsigned long *timer_val)
-- 
cgit v0.10.2


From 47edb65178cb7056c2eea0b6c41a7d8c84547192 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Thu, 23 Jul 2015 12:14:40 -0700
Subject: x86/asm/msr: Make wrmsrl() a function

As of cf991de2f614 ("x86/asm/msr: Make wrmsrl_safe() a
function"), wrmsrl_safe is a function, but wrmsrl is still a
macro.  The wrmsrl macro performs invalid shifts if the value
argument is 32 bits. This makes it unnecessarily awkward to
write code that puts an unsigned long into an MSR.

To make this work, syscall_init needs tweaking to stop passing
a function pointer to wrmsrl.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Willy Tarreau <w@1wt.eu>
Link: http://lkml.kernel.org/r/690f0c629a1085d054e2d1ef3da073cfb3f7db92.1437678821.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 54e9f08..77d8b28 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -188,8 +188,10 @@ static inline void wrmsr(unsigned msr, unsigned low, unsigned high)
 #define rdmsrl(msr, val)			\
 	((val) = native_read_msr((msr)))
 
-#define wrmsrl(msr, val)						\
-	native_write_msr((msr), (u32)((u64)(val)), (u32)((u64)(val) >> 32))
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+	native_write_msr(msr, (u32)val, (u32)(val >> 32));
+}
 
 /* wrmsr with exception handling */
 static inline int wrmsr_safe(unsigned msr, unsigned low, unsigned high)
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c2be037..10d0596 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -153,7 +153,11 @@ do {						\
 	val = paravirt_read_msr(msr, &_err);	\
 } while (0)
 
-#define wrmsrl(msr, val)	wrmsr(msr, (u32)((u64)(val)), ((u64)(val))>>32)
+static inline void wrmsrl(unsigned msr, u64 val)
+{
+	wrmsr(msr, (u32)val, (u32)(val>>32));
+}
+
 #define wrmsr_safe(msr, a, b)	paravirt_write_msr(msr, a, b)
 
 /* rdmsr with exception handling */
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cb9e5df..b128808 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1185,10 +1185,10 @@ void syscall_init(void)
 	 * set CS/DS but only a 32bit target. LSTAR sets the 64bit rip.
 	 */
 	wrmsrl(MSR_STAR,  ((u64)__USER32_CS)<<48  | ((u64)__KERNEL_CS)<<32);
-	wrmsrl(MSR_LSTAR, entry_SYSCALL_64);
+	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
 
 #ifdef CONFIG_IA32_EMULATION
-	wrmsrl(MSR_CSTAR, entry_SYSCALL_compat);
+	wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat);
 	/*
 	 * This only works on Intel CPUs.
 	 * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
@@ -1199,7 +1199,7 @@ void syscall_init(void)
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
 #else
-	wrmsrl(MSR_CSTAR, ignore_sysret);
+	wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret);
 	wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
 	wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
 	wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
-- 
cgit v0.10.2


From 7e01ebffffedec22cea86ebe94802f909e4579ca Mon Sep 17 00:00:00 2001
From: Huang Rui <ray.huang@amd.com>
Date: Thu, 27 Aug 2015 18:04:04 +0800
Subject: x86/asm: Drop repeated macro of X86_EFLAGS_AC definition

We just need one macro of X86_EFLAGS_AC_BIT and X86_EFLAGS_AC.

Signed-off-by: Huang Rui <ray.huang@amd.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@suse.de>
Cc: Fengguang Wu <fengguang.wu@intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tony Li <tony.li@amd.com>
Cc: Tony Luck <tony.luck@intel.com>
Link: http://lkml.kernel.org/r/1440669844-21535-1-git-send-email-ray.huang@amd.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 180a0c3..79887ab 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -37,8 +37,6 @@
 #define X86_EFLAGS_VM		_BITUL(X86_EFLAGS_VM_BIT)
 #define X86_EFLAGS_AC_BIT	18 /* Alignment Check/Access Control */
 #define X86_EFLAGS_AC		_BITUL(X86_EFLAGS_AC_BIT)
-#define X86_EFLAGS_AC_BIT	18 /* Alignment Check/Access Control */
-#define X86_EFLAGS_AC		_BITUL(X86_EFLAGS_AC_BIT)
 #define X86_EFLAGS_VIF_BIT	19 /* Virtual Interrupt Flag */
 #define X86_EFLAGS_VIF		_BITUL(X86_EFLAGS_VIF_BIT)
 #define X86_EFLAGS_VIP_BIT	20 /* Virtual Interrupt Pending */
-- 
cgit v0.10.2