From 71b3c126e61177eb693423f2e18a1914205b165e Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 6 Jan 2016 12:21:01 -0800
Subject: x86/mm: Add barriers and document switch_mm()-vs-flush
 synchronization

When switch_mm() activates a new PGD, it also sets a bit that
tells other CPUs that the PGD is in use so that TLB flush IPIs
will be sent.  In order for that to work correctly, the bit
needs to be visible prior to loading the PGD and therefore
starting to fill the local TLB.

Document all the barriers that make this work correctly and add
a couple that were missing.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 379cd36..1edc9cd 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -116,8 +116,34 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 #endif
 		cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		/* Re-load page tables */
+		/*
+		 * Re-load page tables.
+		 *
+		 * This logic has an ordering constraint:
+		 *
+		 *  CPU 0: Write to a PTE for 'next'
+		 *  CPU 0: load bit 1 in mm_cpumask.  if nonzero, send IPI.
+		 *  CPU 1: set bit 1 in next's mm_cpumask
+		 *  CPU 1: load from the PTE that CPU 0 writes (implicit)
+		 *
+		 * We need to prevent an outcome in which CPU 1 observes
+		 * the new PTE value and CPU 0 observes bit 1 clear in
+		 * mm_cpumask.  (If that occurs, then the IPI will never
+		 * be sent, and CPU 0's TLB will contain a stale entry.)
+		 *
+		 * The bad outcome can occur if either CPU's load is
+		 * reordered before that CPU's store, so both CPUs much
+		 * execute full barriers to prevent this from happening.
+		 *
+		 * Thus, switch_mm needs a full barrier between the
+		 * store to mm_cpumask and any operation that could load
+		 * from next->pgd.  This barrier synchronizes with
+		 * remote TLB flushers.  Fortunately, load_cr3 is
+		 * serializing and thus acts as a full barrier.
+		 *
+		 */
 		load_cr3(next->pgd);
+
 		trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
 
 		/* Stop flush ipis for the previous mm */
@@ -156,10 +182,15 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			 * schedule, protecting us from simultaneous changes.
 			 */
 			cpumask_set_cpu(cpu, mm_cpumask(next));
+
 			/*
 			 * We were in lazy tlb mode and leave_mm disabled
 			 * tlb flush IPI delivery. We must reload CR3
 			 * to make sure to use no freed page tables.
+			 *
+			 * As above, this is a barrier that forces
+			 * TLB repopulation to be ordered after the
+			 * store to mm_cpumask.
 			 */
 			load_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0..8f4cc3d 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -161,7 +161,10 @@ void flush_tlb_current_task(void)
 	preempt_disable();
 
 	count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
+
+	/* This is an implicit full barrier that synchronizes with switch_mm. */
 	local_flush_tlb();
+
 	trace_tlb_flush(TLB_LOCAL_SHOOTDOWN, TLB_FLUSH_ALL);
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
 		flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL);
@@ -188,17 +191,29 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	unsigned long base_pages_to_flush = TLB_FLUSH_ALL;
 
 	preempt_disable();
-	if (current->active_mm != mm)
+	if (current->active_mm != mm) {
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
+	}
 
 	if (!current->mm) {
 		leave_mm(smp_processor_id());
+
+		/* Synchronize with switch_mm. */
+		smp_mb();
+
 		goto out;
 	}
 
 	if ((end != TLB_FLUSH_ALL) && !(vmflag & VM_HUGETLB))
 		base_pages_to_flush = (end - start) >> PAGE_SHIFT;
 
+	/*
+	 * Both branches below are implicit full barriers (MOV to CR or
+	 * INVLPG) that synchronize with switch_mm.
+	 */
 	if (base_pages_to_flush > tlb_single_page_flush_ceiling) {
 		base_pages_to_flush = TLB_FLUSH_ALL;
 		count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL);
@@ -228,10 +243,18 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start)
 	preempt_disable();
 
 	if (current->active_mm == mm) {
-		if (current->mm)
+		if (current->mm) {
+			/*
+			 * Implicit full barrier (INVLPG) that synchronizes
+			 * with switch_mm.
+			 */
 			__flush_tlb_one(start);
-		else
+		} else {
 			leave_mm(smp_processor_id());
+
+			/* Synchronize with switch_mm. */
+			smp_mb();
+		}
 	}
 
 	if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids)
-- 
cgit v0.10.2


From 8c31902cffc4d716450be549c66a67a8a3dd479c Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 4 Jan 2016 10:17:09 -0800
Subject: x86/boot: Double BOOT_HEAP_SIZE to 64KB

When decompressing kernel image during x86 bootup, malloc memory
for ELF program headers may run out of heap space, which leads
to system halt.  This patch doubles BOOT_HEAP_SIZE to 64KB.

Tested with 32-bit kernel which failed to boot without this patch.

Signed-off-by: H.J. Lu <hjl.tools@gmail.com>
Acked-by: H. Peter Anvin <hpa@zytor.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Cc: <stable@vger.kernel.org>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 4fa687a..6b8d6e8 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -27,7 +27,7 @@
 #define BOOT_HEAP_SIZE             0x400000
 #else /* !CONFIG_KERNEL_BZIP2 */
 
-#define BOOT_HEAP_SIZE	0x8000
+#define BOOT_HEAP_SIZE	0x10000
 
 #endif /* !CONFIG_KERNEL_BZIP2 */
 
-- 
cgit v0.10.2


From c9e0d39126af9e338495b719e9d565ca9ebcd4c5 Mon Sep 17 00:00:00 2001
From: Dave Jones <davej@codemonkey.org.uk>
Date: Mon, 11 Jan 2016 12:04:28 -0500
Subject: x86/mm/pat: Make split_page_count() check for empty levels to fix
 /proc/meminfo output

In CONFIG_PAGEALLOC_DEBUG=y builds, we disable 2M pages.

Unfortunatly when we split up mappings during boot,
split_page_count() doesn't take this into account, and
starts decrementing an empty direct_pages_count[] level.

This results in /proc/meminfo showing crazy things like:

  DirectMap2M:    18446744073709543424 kB

Signed-off-by: Dave Jones <davej@codemonkey.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6000ad7..fc6a4c8 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -66,6 +66,9 @@ void update_page_count(int level, unsigned long pages)
 
 static void split_page_count(int level)
 {
+	if (direct_pages_count[level] == 0)
+		return;
+
 	direct_pages_count[level]--;
 	direct_pages_count[level - 1] += PTRS_PER_PTE;
 }
-- 
cgit v0.10.2


From 0f672809f91abd0aee01624ccad9199a62e3da7a Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 11 Jan 2016 17:23:32 -0800
Subject: selftests/x86: Disable the ldt_gdt_64 test for now

ldt_gdt.c relies on cross-cpu invalidation of SS to do one of
its tests.  On 32-bit builds, this works fine, but on 64-bit
builds, it only works if the kernel has proper SS sigcontext
handling for 64-bit user programs.

Since the SS fixes are currently reverted, restrict the test
case to 32 bits for now.

In principle, I could change the test to use a different segment
register, but it would be messy: CS can't point to the LDT for
64-bit code, and the other registers don't result in immediate
faults because they aren't reloaded on kernel -> user
transitions.

When we fix sigcontext (in 4.6?), we can revert this.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/231591d9122d282402d8f53175134f8db5b3bc73.1452561752.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index eabcff4..398bb0e 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -4,9 +4,10 @@ include ../lib.mk
 
 .PHONY: all all_32 all_64 warn_32bit_failure clean
 
-TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs ldt_gdt syscall_nt ptrace_syscall
+TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
-			test_FCMOV test_FCOMI test_FISTTP
+			test_FCMOV test_FCOMI test_FISTTP \
+			ldt_gdt
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
-- 
cgit v0.10.2


From b500f77baea576b74c3961613b67eaffcdf65c57 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Tue, 12 Jan 2016 10:19:30 +0800
Subject: x86/mm: Use PAGE_ALIGNED instead of IS_ALIGNED

Use PAGE_ALIGEND macro in <linux/mm.h> to simplify code.

Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Cc: <guohanjun@huawei.com>
Cc: Alexander Kuleshov <kuleshovmail@gmail.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1452565170-11083-1-git-send-email-wangkefeng.wang@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index ec081fe..8829482 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -814,8 +814,7 @@ remove_pte_table(pte_t *pte_start, unsigned long addr, unsigned long end,
 		if (phys_addr < (phys_addr_t)0x40000000)
 			return;
 
-		if (IS_ALIGNED(addr, PAGE_SIZE) &&
-		    IS_ALIGNED(next, PAGE_SIZE)) {
+		if (PAGE_ALIGNED(addr) && PAGE_ALIGNED(next)) {
 			/*
 			 * Do not free direct mapping pages since they were
 			 * freed when offlining, or simplely not in use.
-- 
cgit v0.10.2


From 4f81cbafcce2c603db7865e9d0e461f7947d77d4 Mon Sep 17 00:00:00 2001
From: yu-cheng yu <yu-cheng.yu@intel.com>
Date: Wed, 6 Jan 2016 14:24:51 -0800
Subject: x86/fpu: Fix early FPU command-line parsing

The function fpu__init_system() is executed before
parse_early_param(). This causes wrong FPU configuration. This
patch fixes this issue by parsing boot_command_line in the
beginning of fpu__init_system().

With all four patches in this series, each parameter disables
features as the following:

eagerfpu=off: eagerfpu, avx, avx2, avx512, mpx
no387: fpu
nofxsr: fxsr, fxsropt, xmm
noxsave: xsave, xsaveopt, xsaves, xsavec, avx, avx2, avx512,
mpx, xgetbv1 noxsaveopt: xsaveopt
noxsaves: xsaves

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi V. Shankar <ravi.v.shankar@intel.com>
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: yu-cheng yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/1452119094-7252-2-git-send-email-yu-cheng.yu@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7b2978a..3a45fcd 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -3,8 +3,11 @@
  */
 #include <asm/fpu/internal.h>
 #include <asm/tlbflush.h>
+#include <asm/setup.h>
+#include <asm/cmdline.h>
 
 #include <linux/sched.h>
+#include <linux/init.h>
 
 /*
  * Initialize the TS bit in CR0 according to the style of context-switches
@@ -270,18 +273,6 @@ static void __init fpu__init_system_xstate_size_legacy(void)
  */
 static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
 
-static int __init eager_fpu_setup(char *s)
-{
-	if (!strcmp(s, "on"))
-		eagerfpu = ENABLE;
-	else if (!strcmp(s, "off"))
-		eagerfpu = DISABLE;
-	else if (!strcmp(s, "auto"))
-		eagerfpu = AUTO;
-	return 1;
-}
-__setup("eagerfpu=", eager_fpu_setup);
-
 /*
  * Pick the FPU context switching strategy:
  */
@@ -316,11 +307,46 @@ static void __init fpu__init_system_ctx_switch(void)
 }
 
 /*
+ * We parse fpu parameters early because fpu__init_system() is executed
+ * before parse_early_param().
+ */
+static void __init fpu__init_parse_early_param(void)
+{
+	/*
+	 * No need to check "eagerfpu=auto" again, since it is the
+	 * initial default.
+	 */
+	if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off"))
+		eagerfpu = DISABLE;
+	else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on"))
+		eagerfpu = ENABLE;
+
+	if (cmdline_find_option_bool(boot_command_line, "no387"))
+		setup_clear_cpu_cap(X86_FEATURE_FPU);
+
+	if (cmdline_find_option_bool(boot_command_line, "nofxsr")) {
+		setup_clear_cpu_cap(X86_FEATURE_FXSR);
+		setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
+		setup_clear_cpu_cap(X86_FEATURE_XMM);
+	}
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsave"))
+		fpu__xstate_clear_all_cpu_caps();
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsaveopt"))
+		setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+
+	if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
+		setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+}
+
+/*
  * Called on the boot CPU once per system bootup, to set up the initial
  * FPU state that is later cloned into all processes:
  */
 void __init fpu__init_system(struct cpuinfo_x86 *c)
 {
+	fpu__init_parse_early_param();
 	fpu__init_system_early_generic(c);
 
 	/*
@@ -344,62 +370,3 @@ void __init fpu__init_system(struct cpuinfo_x86 *c)
 
 	fpu__init_system_ctx_switch();
 }
-
-/*
- * Boot parameter to turn off FPU support and fall back to math-emu:
- */
-static int __init no_387(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FPU);
-	return 1;
-}
-__setup("no387", no_387);
-
-/*
- * Disable all xstate CPU features:
- */
-static int __init x86_noxsave_setup(char *s)
-{
-	if (strlen(s))
-		return 0;
-
-	fpu__xstate_clear_all_cpu_caps();
-
-	return 1;
-}
-__setup("noxsave", x86_noxsave_setup);
-
-/*
- * Disable the XSAVEOPT instruction specifically:
- */
-static int __init x86_noxsaveopt_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-
-	return 1;
-}
-__setup("noxsaveopt", x86_noxsaveopt_setup);
-
-/*
- * Disable the XSAVES instruction:
- */
-static int __init x86_noxsaves_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-
-	return 1;
-}
-__setup("noxsaves", x86_noxsaves_setup);
-
-/*
- * Disable FX save/restore and SSE support:
- */
-static int __init x86_nofxsr_setup(char *s)
-{
-	setup_clear_cpu_cap(X86_FEATURE_FXSR);
-	setup_clear_cpu_cap(X86_FEATURE_FXSR_OPT);
-	setup_clear_cpu_cap(X86_FEATURE_XMM);
-
-	return 1;
-}
-__setup("nofxsr", x86_nofxsr_setup);
-- 
cgit v0.10.2


From eb7c5f872e697b0aebd846cf3a3328d71e9decb2 Mon Sep 17 00:00:00 2001
From: yu-cheng yu <yu-cheng.yu@intel.com>
Date: Wed, 6 Jan 2016 14:24:52 -0800
Subject: x86/fpu: Disable XGETBV1 when no XSAVE

When "noxsave" is given as a command-line input, the kernel
should disable XGETBV1. This issue currently does not cause any
actual problems. XGETBV1 is only useful if we have something
using the 'init optimization' (i.e. xsaveopt, xsaves). We
already clear both of those in fpu__xstate_clear_all_cpu_caps().
But this is good for completeness.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Reviewed-by: Dave Hansen <dave.hansen@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi V. Shankar <ravi.v.shankar@intel.com>
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: yu-cheng yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/1452119094-7252-3-git-send-email-yu-cheng.yu@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 40f1002..d489f27 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -52,6 +52,7 @@ void fpu__xstate_clear_all_cpu_caps(void)
 	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
 	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
 	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
 }
 
 /*
-- 
cgit v0.10.2


From a5fe93a549c54838063d2952dd9643b0b18aa67f Mon Sep 17 00:00:00 2001
From: yu-cheng yu <yu-cheng.yu@intel.com>
Date: Wed, 6 Jan 2016 14:24:53 -0800
Subject: x86/fpu: Disable MPX when eagerfpu is off

This issue is a fallout from the command-line parsing move.

When "eagerfpu=off" is given as a command-line input, the kernel
should disable MPX support. The decision for turning off MPX was
made in fpu__init_system_ctx_switch(), which is after the
selection of the XSAVE format. This patch fixes it by getting
that decision done earlier in fpu__init_system_xstate().

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi V. Shankar <ravi.v.shankar@intel.com>
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: yu-cheng yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/1452119094-7252-4-git-send-email-yu-cheng.yu@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index eadcdd5..0fd440d 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -42,6 +42,7 @@ extern void fpu__init_cpu_xstate(void);
 extern void fpu__init_system(struct cpuinfo_x86 *c);
 extern void fpu__init_check_bugs(void);
 extern void fpu__resume_cpu(void);
+extern u64 fpu__get_supported_xfeatures_mask(void);
 
 /*
  * Debugging facility:
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 3a45fcd..f0ab368 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -274,7 +274,45 @@ static void __init fpu__init_system_xstate_size_legacy(void)
 static enum { AUTO, ENABLE, DISABLE } eagerfpu = AUTO;
 
 /*
+ * Find supported xfeatures based on cpu features and command-line input.
+ * This must be called after fpu__init_parse_early_param() is called and
+ * xfeatures_mask is enumerated.
+ */
+u64 __init fpu__get_supported_xfeatures_mask(void)
+{
+	/* Support all xfeatures known to us */
+	if (eagerfpu != DISABLE)
+		return XCNTXT_MASK;
+
+	/* Warning of xfeatures being disabled for no eagerfpu mode */
+	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
+		pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
+			xfeatures_mask & XFEATURE_MASK_EAGER);
+	}
+
+	/* Return a mask that masks out all features requiring eagerfpu mode */
+	return ~XFEATURE_MASK_EAGER;
+}
+
+/*
+ * Disable features dependent on eagerfpu.
+ */
+static void __init fpu__clear_eager_fpu_features(void)
+{
+	setup_clear_cpu_cap(X86_FEATURE_MPX);
+}
+
+/*
  * Pick the FPU context switching strategy:
+ *
+ * When eagerfpu is AUTO or ENABLE, we ensure it is ENABLE if either of
+ * the following is true:
+ *
+ * (1) the cpu has xsaveopt, as it has the optimization and doing eager
+ *     FPU switching has a relatively low cost compared to a plain xsave;
+ * (2) the cpu has xsave features (e.g. MPX) that depend on eager FPU
+ *     switching. Should the kernel boot with noxsaveopt, we support MPX
+ *     with eager FPU switching at a higher cost.
  */
 static void __init fpu__init_system_ctx_switch(void)
 {
@@ -286,19 +324,11 @@ static void __init fpu__init_system_ctx_switch(void)
 	WARN_ON_FPU(current->thread.fpu.fpstate_active);
 	current_thread_info()->status = 0;
 
-	/* Auto enable eagerfpu for xsaveopt */
 	if (boot_cpu_has(X86_FEATURE_XSAVEOPT) && eagerfpu != DISABLE)
 		eagerfpu = ENABLE;
 
-	if (xfeatures_mask & XFEATURE_MASK_EAGER) {
-		if (eagerfpu == DISABLE) {
-			pr_err("x86/fpu: eagerfpu switching disabled, disabling the following xstate features: 0x%llx.\n",
-			       xfeatures_mask & XFEATURE_MASK_EAGER);
-			xfeatures_mask &= ~XFEATURE_MASK_EAGER;
-		} else {
-			eagerfpu = ENABLE;
-		}
-	}
+	if (xfeatures_mask & XFEATURE_MASK_EAGER)
+		eagerfpu = ENABLE;
 
 	if (eagerfpu == ENABLE)
 		setup_force_cpu_cap(X86_FEATURE_EAGER_FPU);
@@ -316,10 +346,12 @@ static void __init fpu__init_parse_early_param(void)
 	 * No need to check "eagerfpu=auto" again, since it is the
 	 * initial default.
 	 */
-	if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off"))
+	if (cmdline_find_option_bool(boot_command_line, "eagerfpu=off")) {
 		eagerfpu = DISABLE;
-	else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on"))
+		fpu__clear_eager_fpu_features();
+	} else if (cmdline_find_option_bool(boot_command_line, "eagerfpu=on")) {
 		eagerfpu = ENABLE;
+	}
 
 	if (cmdline_find_option_bool(boot_command_line, "no387"))
 		setup_clear_cpu_cap(X86_FEATURE_FPU);
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index d489f27..d425cda5 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -633,8 +633,7 @@ void __init fpu__init_system_xstate(void)
 		BUG();
 	}
 
-	/* Support only the state known to the OS: */
-	xfeatures_mask = xfeatures_mask & XCNTXT_MASK;
+	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
 
 	/* Enable xstate instructions to be able to continue with initialization: */
 	fpu__init_cpu_xstate();
-- 
cgit v0.10.2


From 394db20ca240741a08d472173db13d6f6a6e5a28 Mon Sep 17 00:00:00 2001
From: yu-cheng yu <yu-cheng.yu@intel.com>
Date: Wed, 6 Jan 2016 14:24:54 -0800
Subject: x86/fpu: Disable AVX when eagerfpu is off

When "eagerfpu=off" is given as a command-line input, the kernel
should disable AVX support.

The Task Switched bit used for lazy context switching does not
support AVX. If AVX is enabled without eagerfpu context
switching, one task's AVX state could become corrupted or leak
to other tasks. This is a bug and has bad security implications.

This only affects systems that have AVX/AVX2/AVX512 and this
issue will be found only when one actually uses AVX/AVX2/AVX512
_AND_ does eagerfpu=off.

Reference: Intel Software Developer's Manual Vol. 3A

Sec. 2.5 Control Registers:
TS Task Switched bit (bit 3 of CR0) -- Allows the saving of the
x87 FPU/ MMX/SSE/SSE2/SSE3/SSSE3/SSE4 context on a task switch
to be delayed until an x87 FPU/MMX/SSE/SSE2/SSE3/SSSE3/SSE4
instruction is actually executed by the new task.

Sec. 13.4.1 Using the TS Flag to Control the Saving of the X87
FPU and SSE State
When the TS flag is set, the processor monitors the instruction
stream for x87 FPU, MMX, SSE instructions. When the processor
detects one of these instructions, it raises a
device-not-available exeception (#NM) prior to executing the
instruction.

Signed-off-by: Yu-cheng Yu <yu-cheng.yu@intel.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi V. Shankar <ravi.v.shankar@intel.com>
Cc: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: yu-cheng yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/1452119094-7252-5-git-send-email-yu-cheng.yu@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/fpu/xstate.h b/arch/x86/include/asm/fpu/xstate.h
index 3a6c89b..af30fde 100644
--- a/arch/x86/include/asm/fpu/xstate.h
+++ b/arch/x86/include/asm/fpu/xstate.h
@@ -20,15 +20,16 @@
 
 /* Supported features which support lazy state saving */
 #define XFEATURE_MASK_LAZY	(XFEATURE_MASK_FP | \
-				 XFEATURE_MASK_SSE | \
+				 XFEATURE_MASK_SSE)
+
+/* Supported features which require eager state saving */
+#define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | \
+				 XFEATURE_MASK_BNDCSR | \
 				 XFEATURE_MASK_YMM | \
-				 XFEATURE_MASK_OPMASK |	\
+				 XFEATURE_MASK_OPMASK | \
 				 XFEATURE_MASK_ZMM_Hi256 | \
 				 XFEATURE_MASK_Hi16_ZMM)
 
-/* Supported features which require eager state saving */
-#define XFEATURE_MASK_EAGER	(XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)
-
 /* All currently supported features */
 #define XCNTXT_MASK	(XFEATURE_MASK_LAZY | XFEATURE_MASK_EAGER)
 
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index f0ab368..6d9f0a7 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -300,6 +300,12 @@ u64 __init fpu__get_supported_xfeatures_mask(void)
 static void __init fpu__clear_eager_fpu_features(void)
 {
 	setup_clear_cpu_cap(X86_FEATURE_MPX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX);
+	setup_clear_cpu_cap(X86_FEATURE_AVX2);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
+	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
 }
 
 /*
-- 
cgit v0.10.2


From aa0421410fc540832f0fd267ff0f2657c3276053 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@suse.de>
Date: Sun, 3 Jan 2016 23:38:53 +0100
Subject: x86/boot: Hide local labels in verify_cpu()

... from the final ELF image's symbol table as they're not
really needed there.

Before:

$ readelf -a vmlinux | grep verify_cpu
    43: ffffffff810001a9     0 NOTYPE  LOCAL  DEFAULT    1 verify_cpu
    45: ffffffff8100028f     0 NOTYPE  LOCAL  DEFAULT    1 verify_cpu_no_longmode
    46: ffffffff810001de     0 NOTYPE  LOCAL  DEFAULT    1 verify_cpu_noamd
    47: ffffffff8100022b     0 NOTYPE  LOCAL  DEFAULT    1 verify_cpu_check
    48: ffffffff8100021c     0 NOTYPE  LOCAL  DEFAULT    1 verify_cpu_clear_xd
    49: ffffffff81000263     0 NOTYPE  LOCAL  DEFAULT    1 verify_cpu_sse_test
    50: ffffffff81000296     0 NOTYPE  LOCAL  DEFAULT    1 verify_cpu_sse_ok

After:

$ readelf -a vmlinux | grep verify_cpu
    43: ffffffff810001a9     0 NOTYPE  LOCAL  DEFAULT    1 verify_cpu

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1451860733-21163-1-git-send-email-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 4cf401f..07efb35 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -48,31 +48,31 @@ verify_cpu:
 	pushfl
 	popl	%eax
 	cmpl	%eax,%ebx
-	jz	verify_cpu_no_longmode	# cpu has no cpuid
+	jz	.Lverify_cpu_no_longmode	# cpu has no cpuid
 #endif
 
 	movl	$0x0,%eax		# See if cpuid 1 is implemented
 	cpuid
 	cmpl	$0x1,%eax
-	jb	verify_cpu_no_longmode	# no cpuid 1
+	jb	.Lverify_cpu_no_longmode	# no cpuid 1
 
 	xor	%di,%di
 	cmpl	$0x68747541,%ebx	# AuthenticAMD
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	cmpl	$0x69746e65,%edx
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	cmpl	$0x444d4163,%ecx
-	jnz	verify_cpu_noamd
+	jnz	.Lverify_cpu_noamd
 	mov	$1,%di			# cpu is from AMD
-	jmp	verify_cpu_check
+	jmp	.Lverify_cpu_check
 
-verify_cpu_noamd:
+.Lverify_cpu_noamd:
 	cmpl	$0x756e6547,%ebx        # GenuineIntel?
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 	cmpl	$0x49656e69,%edx
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 	cmpl	$0x6c65746e,%ecx
-	jnz	verify_cpu_check
+	jnz	.Lverify_cpu_check
 
 	# only call IA32_MISC_ENABLE when:
 	# family > 6 || (family == 6 && model >= 0xd)
@@ -83,59 +83,59 @@ verify_cpu_noamd:
 	andl	$0x0ff00f00, %eax	# mask family and extended family
 	shrl	$8, %eax
 	cmpl	$6, %eax
-	ja	verify_cpu_clear_xd	# family > 6, ok
-	jb	verify_cpu_check	# family < 6, skip
+	ja	.Lverify_cpu_clear_xd	# family > 6, ok
+	jb	.Lverify_cpu_check	# family < 6, skip
 
 	andl	$0x000f00f0, %ecx	# mask model and extended model
 	shrl	$4, %ecx
 	cmpl	$0xd, %ecx
-	jb	verify_cpu_check	# family == 6, model < 0xd, skip
+	jb	.Lverify_cpu_check	# family == 6, model < 0xd, skip
 
-verify_cpu_clear_xd:
+.Lverify_cpu_clear_xd:
 	movl	$MSR_IA32_MISC_ENABLE, %ecx
 	rdmsr
 	btrl	$2, %edx		# clear MSR_IA32_MISC_ENABLE_XD_DISABLE
-	jnc	verify_cpu_check	# only write MSR if bit was changed
+	jnc	.Lverify_cpu_check	# only write MSR if bit was changed
 	wrmsr
 
-verify_cpu_check:
+.Lverify_cpu_check:
 	movl    $0x1,%eax		# Does the cpu have what it takes
 	cpuid
 	andl	$REQUIRED_MASK0,%edx
 	xorl	$REQUIRED_MASK0,%edx
-	jnz	verify_cpu_no_longmode
+	jnz	.Lverify_cpu_no_longmode
 
 	movl    $0x80000000,%eax	# See if extended cpuid is implemented
 	cpuid
 	cmpl    $0x80000001,%eax
-	jb      verify_cpu_no_longmode	# no extended cpuid
+	jb      .Lverify_cpu_no_longmode	# no extended cpuid
 
 	movl    $0x80000001,%eax	# Does the cpu have what it takes
 	cpuid
 	andl    $REQUIRED_MASK1,%edx
 	xorl    $REQUIRED_MASK1,%edx
-	jnz     verify_cpu_no_longmode
+	jnz     .Lverify_cpu_no_longmode
 
-verify_cpu_sse_test:
+.Lverify_cpu_sse_test:
 	movl	$1,%eax
 	cpuid
 	andl	$SSE_MASK,%edx
 	cmpl	$SSE_MASK,%edx
-	je	verify_cpu_sse_ok
+	je	.Lverify_cpu_sse_ok
 	test	%di,%di
-	jz	verify_cpu_no_longmode	# only try to force SSE on AMD
+	jz	.Lverify_cpu_no_longmode	# only try to force SSE on AMD
 	movl	$MSR_K7_HWCR,%ecx
 	rdmsr
 	btr	$15,%eax		# enable SSE
 	wrmsr
 	xor	%di,%di			# don't loop
-	jmp	verify_cpu_sse_test	# try again
+	jmp	.Lverify_cpu_sse_test	# try again
 
-verify_cpu_no_longmode:
+.Lverify_cpu_no_longmode:
 	popf				# Restore caller passed flags
 	movl $1,%eax
 	ret
-verify_cpu_sse_ok:
+.Lverify_cpu_sse_ok:
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	ret
-- 
cgit v0.10.2


From e27d90e8be08d96b5e047cd01daf7e62c4fcab78 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 22 Dec 2015 10:39:54 +1030
Subject: lguest: Map switcher text R/O

Pavel noted that lguest maps the switcher code executable and
read-write.  This is a bad idea for any kernel text, but
particularly for text mapped at a fixed address.

Create two vmas, one for the text (PAGE_KERNEL_RX) and another
for the stacks (PAGE_KERNEL).  Use VM_NO_GUARD to map them
adjacent (as expected by the rest of the code).

Reported-by: Pavel Machek <pavel@ucw.cz>
Tested-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h
index 3bbc07a..73d0c9b 100644
--- a/arch/x86/include/asm/lguest.h
+++ b/arch/x86/include/asm/lguest.h
@@ -12,7 +12,9 @@
 #define GUEST_PL 1
 
 /* Page for Switcher text itself, then two pages per cpu */
-#define TOTAL_SWITCHER_PAGES (1 + 2 * nr_cpu_ids)
+#define SWITCHER_TEXT_PAGES (1)
+#define SWITCHER_STACK_PAGES (2 * nr_cpu_ids)
+#define TOTAL_SWITCHER_PAGES (SWITCHER_TEXT_PAGES + SWITCHER_STACK_PAGES)
 
 /* Where we map the Switcher, in both Host and Guest. */
 extern unsigned long switcher_addr;
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 312ffd3..9e385b3 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -22,7 +22,8 @@
 
 unsigned long switcher_addr;
 struct page **lg_switcher_pages;
-static struct vm_struct *switcher_vma;
+static struct vm_struct *switcher_text_vma;
+static struct vm_struct *switcher_stacks_vma;
 
 /* This One Big lock protects all inter-guest data structures. */
 DEFINE_MUTEX(lguest_lock);
@@ -83,54 +84,80 @@ static __init int map_switcher(void)
 	}
 
 	/*
+	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
+	 * It goes in the first page, which we map in momentarily.
+	 */
+	memcpy(kmap(lg_switcher_pages[0]), start_switcher_text,
+	       end_switcher_text - start_switcher_text);
+	kunmap(lg_switcher_pages[0]);
+
+	/*
 	 * We place the Switcher underneath the fixmap area, which is the
 	 * highest virtual address we can get.  This is important, since we
 	 * tell the Guest it can't access this memory, so we want its ceiling
 	 * as high as possible.
 	 */
-	switcher_addr = FIXADDR_START - (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE;
+	switcher_addr = FIXADDR_START - TOTAL_SWITCHER_PAGES*PAGE_SIZE;
 
 	/*
-	 * Now we reserve the "virtual memory area" we want.  We might
-	 * not get it in theory, but in practice it's worked so far.
-	 * The end address needs +1 because __get_vm_area allocates an
-	 * extra guard page, so we need space for that.
+	 * Now we reserve the "virtual memory area"s we want.  We might
+	 * not get them in theory, but in practice it's worked so far.
+	 *
+	 * We want the switcher text to be read-only and executable, and
+	 * the stacks to be read-write and non-executable.
 	 */
-	switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE,
-				     VM_ALLOC, switcher_addr, switcher_addr
-				     + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE);
-	if (!switcher_vma) {
+	switcher_text_vma = __get_vm_area(PAGE_SIZE, VM_ALLOC|VM_NO_GUARD,
+					  switcher_addr,
+					  switcher_addr + PAGE_SIZE);
+
+	if (!switcher_text_vma) {
 		err = -ENOMEM;
 		printk("lguest: could not map switcher pages high\n");
 		goto free_pages;
 	}
 
+	switcher_stacks_vma = __get_vm_area(SWITCHER_STACK_PAGES * PAGE_SIZE,
+					    VM_ALLOC|VM_NO_GUARD,
+					    switcher_addr + PAGE_SIZE,
+					    switcher_addr + TOTAL_SWITCHER_PAGES * PAGE_SIZE);
+	if (!switcher_stacks_vma) {
+		err = -ENOMEM;
+		printk("lguest: could not map switcher pages high\n");
+		goto free_text_vma;
+	}
+
 	/*
 	 * This code actually sets up the pages we've allocated to appear at
 	 * switcher_addr.  map_vm_area() takes the vma we allocated above, the
-	 * kind of pages we're mapping (kernel pages), and a pointer to our
-	 * array of struct pages.
+	 * kind of pages we're mapping (kernel text pages and kernel writable
+	 * pages respectively), and a pointer to our array of struct pages.
 	 */
-	err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, lg_switcher_pages);
+	err = map_vm_area(switcher_text_vma, PAGE_KERNEL_RX, lg_switcher_pages);
+	if (err) {
+		printk("lguest: text map_vm_area failed: %i\n", err);
+		goto free_vmas;
+	}
+
+	err = map_vm_area(switcher_stacks_vma, PAGE_KERNEL,
+			  lg_switcher_pages + SWITCHER_TEXT_PAGES);
 	if (err) {
-		printk("lguest: map_vm_area failed: %i\n", err);
-		goto free_vma;
+		printk("lguest: stacks map_vm_area failed: %i\n", err);
+		goto free_vmas;
 	}
 
 	/*
 	 * Now the Switcher is mapped at the right address, we can't fail!
-	 * Copy in the compiled-in Switcher code (from x86/switcher_32.S).
 	 */
-	memcpy(switcher_vma->addr, start_switcher_text,
-	       end_switcher_text - start_switcher_text);
-
 	printk(KERN_INFO "lguest: mapped switcher at %p\n",
-	       switcher_vma->addr);
+	       switcher_text_vma->addr);
 	/* And we succeeded... */
 	return 0;
 
-free_vma:
-	vunmap(switcher_vma->addr);
+free_vmas:
+	/* Undoes map_vm_area and __get_vm_area */
+	vunmap(switcher_stacks_vma->addr);
+free_text_vma:
+	vunmap(switcher_text_vma->addr);
 free_pages:
 	i = TOTAL_SWITCHER_PAGES;
 free_some_pages:
@@ -148,7 +175,8 @@ static void unmap_switcher(void)
 	unsigned int i;
 
 	/* vunmap() undoes *both* map_vm_area() and __get_vm_area(). */
-	vunmap(switcher_vma->addr);
+	vunmap(switcher_text_vma->addr);
+	vunmap(switcher_stacks_vma->addr);
 	/* Now we just need to free the pages we copied the switcher into */
 	for (i = 0; i < TOTAL_SWITCHER_PAGES; i++)
 		__free_pages(lg_switcher_pages[i], 0);
-- 
cgit v0.10.2


From 2f0c0b2d96b1205efb14347009748d786c2d9ba5 Mon Sep 17 00:00:00 2001
From: Mario Kleiner <mario.kleiner.de@gmail.com>
Date: Fri, 18 Dec 2015 20:24:06 +0100
Subject: x86/reboot/quirks: Add iMac10,1 to pci_reboot_dmi_table[]

Without the reboot=pci method, the iMac 10,1 simply
hangs after printing "Restarting system" at the point
when it should reboot. This fixes it.

Signed-off-by: Mario Kleiner <mario.kleiner.de@gmail.com>
Cc: <stable@vger.kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Jones <davej@codemonkey.org.uk>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1450466646-26663-1-git-send-email-mario.kleiner.de@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index d64889a..ab0adc0 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -182,6 +182,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
 			DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
 		},
 	},
+	{	/* Handle problems with rebooting on the iMac10,1. */
+		.callback = set_pci_reboot,
+		.ident = "Apple iMac10,1",
+		.matches = {
+		    DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
+		    DMI_MATCH(DMI_PRODUCT_NAME, "iMac10,1"),
+		},
+	},
 
 	/* ASRock */
 	{	/* Handle problems with rebooting on ASRock Q1900DC-ITX */
-- 
cgit v0.10.2


From 65cacec1ba908a153cfb19c4de596a108f95970c Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Jan 2016 11:56:15 -0800
Subject: selftests/x86: Test __kernel_sigreturn and __kernel_rt_sigreturn

The vdso-based sigreturn mechanism is fragile and isn't used by
modern glibc so, if we break it, we'll only notice when someone
tests an unusual libc.

Add an explicit selftest.

[ I wrote this while debugging a Bionic breakage -- my first guess
  was that I had somehow messed up sigreturn.  I've caused problems in
  that code before, and it's really easy to fail to notice it because
  there's nothing on a modern distro that needs vdso-based sigreturn. ]

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/32946d714156879cd8e5d8eab044cd07557ed558.1452628504.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/tools/testing/selftests/x86/Makefile b/tools/testing/selftests/x86/Makefile
index 398bb0e..d0c473f 100644
--- a/tools/testing/selftests/x86/Makefile
+++ b/tools/testing/selftests/x86/Makefile
@@ -7,7 +7,8 @@ include ../lib.mk
 TARGETS_C_BOTHBITS := single_step_syscall sysret_ss_attrs syscall_nt ptrace_syscall
 TARGETS_C_32BIT_ONLY := entry_from_vm86 syscall_arg_fault sigreturn test_syscall_vdso unwind_vdso \
 			test_FCMOV test_FCOMI test_FISTTP \
-			ldt_gdt
+			ldt_gdt \
+			vdso_restorer
 
 TARGETS_C_32BIT_ALL := $(TARGETS_C_BOTHBITS) $(TARGETS_C_32BIT_ONLY)
 BINARIES_32 := $(TARGETS_C_32BIT_ALL:%=%_32)
diff --git a/tools/testing/selftests/x86/vdso_restorer.c b/tools/testing/selftests/x86/vdso_restorer.c
new file mode 100644
index 0000000..cb03842
--- /dev/null
+++ b/tools/testing/selftests/x86/vdso_restorer.c
@@ -0,0 +1,88 @@
+/*
+ * vdso_restorer.c - tests vDSO-based signal restore
+ * Copyright (c) 2015 Andrew Lutomirski
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * This makes sure that sa_restorer == NULL keeps working on 32-bit
+ * configurations.  Modern glibc doesn't use it under any circumstances,
+ * so it's easy to overlook breakage.
+ *
+ * 64-bit userspace has never supported sa_restorer == NULL, so this is
+ * 32-bit only.
+ */
+
+#define _GNU_SOURCE
+
+#include <err.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <unistd.h>
+#include <syscall.h>
+#include <sys/syscall.h>
+
+/* Open-code this -- the headers are too messy to easily use them. */
+struct real_sigaction {
+	void *handler;
+	unsigned long flags;
+	void *restorer;
+	unsigned int mask[2];
+};
+
+static volatile sig_atomic_t handler_called;
+
+static void handler_with_siginfo(int sig, siginfo_t *info, void *ctx_void)
+{
+	handler_called = 1;
+}
+
+static void handler_without_siginfo(int sig)
+{
+	handler_called = 1;
+}
+
+int main()
+{
+	int nerrs = 0;
+	struct real_sigaction sa;
+
+	memset(&sa, 0, sizeof(sa));
+	sa.handler = handler_with_siginfo;
+	sa.flags = SA_SIGINFO;
+	sa.restorer = NULL;	/* request kernel-provided restorer */
+
+	if (syscall(SYS_rt_sigaction, SIGUSR1, &sa, NULL, 8) != 0)
+		err(1, "raw rt_sigaction syscall");
+
+	raise(SIGUSR1);
+
+	if (handler_called) {
+		printf("[OK]\tSA_SIGINFO handler returned successfully\n");
+	} else {
+		printf("[FAIL]\tSA_SIGINFO handler was not called\n");
+		nerrs++;
+	}
+
+	sa.flags = 0;
+	sa.handler = handler_without_siginfo;
+	if (syscall(SYS_sigaction, SIGUSR1, &sa, 0) != 0)
+		err(1, "raw sigaction syscall");
+	handler_called = 0;
+
+	raise(SIGUSR1);
+
+	if (handler_called) {
+		printf("[OK]\t!SA_SIGINFO handler returned successfully\n");
+	} else {
+		printf("[FAIL]\t!SA_SIGINFO handler was not called\n");
+		nerrs++;
+	}
+}
-- 
cgit v0.10.2


From 4eaffdd5a5fe6ff9f95e1ab4de1ac904d5e0fa8b Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Tue, 12 Jan 2016 12:47:40 -0800
Subject: x86/mm: Improve switch_mm() barrier comments

My previous comments were still a bit confusing and there was a
typo. Fix it up.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Rik van Riel <riel@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: stable@vger.kernel.org
Fixes: 71b3c126e611 ("x86/mm: Add barriers and document switch_mm()-vs-flush synchronization")
Link: http://lkml.kernel.org/r/0a0b43cdcdd241c5faaaecfbcc91a155ddedc9a1.1452631609.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 1edc9cd..bfd9b2a 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -132,14 +132,16 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 		 * be sent, and CPU 0's TLB will contain a stale entry.)
 		 *
 		 * The bad outcome can occur if either CPU's load is
-		 * reordered before that CPU's store, so both CPUs much
+		 * reordered before that CPU's store, so both CPUs must
 		 * execute full barriers to prevent this from happening.
 		 *
 		 * Thus, switch_mm needs a full barrier between the
 		 * store to mm_cpumask and any operation that could load
-		 * from next->pgd.  This barrier synchronizes with
-		 * remote TLB flushers.  Fortunately, load_cr3 is
-		 * serializing and thus acts as a full barrier.
+		 * from next->pgd.  TLB fills are special and can happen
+		 * due to instruction fetches or for no reason at all,
+		 * and neither LOCK nor MFENCE orders them.
+		 * Fortunately, load_cr3() is serializing and gives the
+		 * ordering guarantee we need.
 		 *
 		 */
 		load_cr3(next->pgd);
@@ -188,9 +190,8 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
 			 * tlb flush IPI delivery. We must reload CR3
 			 * to make sure to use no freed page tables.
 			 *
-			 * As above, this is a barrier that forces
-			 * TLB repopulation to be ordered after the
-			 * store to mm_cpumask.
+			 * As above, load_cr3() is serializing and orders TLB
+			 * fills with respect to the mm_cpumask write.
 			 */
 			load_cr3(next->pgd);
 			trace_tlb_flush(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-- 
cgit v0.10.2


From 78fd8c7288e0a4bba3ad1d69caf9396a6b69cb00 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Mon, 4 Jan 2016 15:14:28 -0800
Subject: x86/vdso/pvclock: Protect STABLE check with the seqcount

If the clock becomes unstable while we're reading it, we need to
bail.  We can do this by simply moving the check into the
seqcount loop.

Reported-by: Marcelo Tosatti <mtosatti@redhat.com>
Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Alexander Graf <agraf@suse.de>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paolo Bonzini <pbonzini@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Radim Krcmar <rkrcmar@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/755dcedb17269e1d7ce12a9a713dea303835137e.1451949191.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/vdso/vclock_gettime.c b/arch/x86/entry/vdso/vclock_gettime.c
index 8602f06..1a50e09 100644
--- a/arch/x86/entry/vdso/vclock_gettime.c
+++ b/arch/x86/entry/vdso/vclock_gettime.c
@@ -126,23 +126,23 @@ static notrace cycle_t vread_pvclock(int *mode)
 	 *
 	 * On Xen, we don't appear to have that guarantee, but Xen still
 	 * supplies a valid seqlock using the version field.
-
+	 *
 	 * We only do pvclock vdso timing at all if
 	 * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
 	 * mean that all vCPUs have matching pvti and that the TSC is
 	 * synced, so we can just look at vCPU 0's pvti.
 	 */
 
-	if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
-		*mode = VCLOCK_NONE;
-		return 0;
-	}
-
 	do {
 		version = pvti->version;
 
 		smp_rmb();
 
+		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+			*mode = VCLOCK_NONE;
+			return 0;
+		}
+
 		tsc = rdtsc_ordered();
 		pvti_tsc_to_system_mul = pvti->tsc_to_system_mul;
 		pvti_tsc_shift = pvti->tsc_shift;
-- 
cgit v0.10.2


From 7030a7e9321166eef44c811fe4af4d460360d424 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Wed, 13 Jan 2016 15:39:40 +0300
Subject: x86/cpu/amd: Remove an unneeded condition in srat_detect_node()

Originally we calculated ht_nodeid as "ht_nodeid = apicid -
boot_cpu_id;" so presumably it could be negative.

But after commit:

  01aaea1afbcd ('x86: introduce initial apicid')

we use c->initial_apicid which is an unsigned short and thus always >= 0.

It causes a static checker warning to test for impossible
conditions so let's remove it.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Aravind Gopalakrishnan <Aravind.Gopalakrishnan@amd.com>
Cc: Borislav Petkov <bp@suse.de>
Cc: Hector Marco-Gisbert <hecmargi@upv.es>
Cc: Huang Rui <ray.huang@amd.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
Link: http://lkml.kernel.org/r/20160113123940.GE19993@mwanda
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e678dde..a07956a 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -434,8 +434,7 @@ static void srat_detect_node(struct cpuinfo_x86 *c)
 		 */
 		int ht_nodeid = c->initial_apicid;
 
-		if (ht_nodeid >= 0 &&
-		    __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
+		if (__apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
 			node = __apicid_to_node[ht_nodeid];
 		/* Pick a nearby node */
 		if (!node_online(node))
-- 
cgit v0.10.2