From 469f00231278da68062a809306df0bac95a27507 Mon Sep 17 00:00:00 2001
From: Alexander Potapenko <glider@google.com>
Date: Fri, 15 Jul 2016 11:42:43 +0200
Subject: x86, kasan, ftrace: Put APIC interrupt handlers into .irqentry.text

Dmitry Vyukov has reported unexpected KASAN stackdepot growth:

  https://github.com/google/kasan/issues/36

... which is caused by the APIC handlers not being present in .irqentry.text:

When building with CONFIG_FUNCTION_GRAPH_TRACER=y or CONFIG_KASAN=y, put the
APIC interrupt handlers into the .irqentry.text section. This is needed
because both KASAN and function graph tracer use __irqentry_text_start and
__irqentry_text_end to determine whether a function is an IRQ entry point.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Alexander Potapenko <glider@google.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: aryabinin@virtuozzo.com
Cc: kasan-dev@googlegroups.com
Cc: kcc@google.com
Cc: rostedt@goodmis.org
Link: http://lkml.kernel.org/r/1468575763-144889-1-git-send-email-glider@google.com
[ Minor edits. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index b846875..9f85827 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -601,9 +601,20 @@ apicinterrupt3 \num trace(\sym) smp_trace(\sym)
 .endm
 #endif
 
+/* Make sure APIC interrupt handlers end up in the irqentry section: */
+#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN)
+# define PUSH_SECTION_IRQENTRY	.pushsection .irqentry.text, "ax"
+# define POP_SECTION_IRQENTRY	.popsection
+#else
+# define PUSH_SECTION_IRQENTRY
+# define POP_SECTION_IRQENTRY
+#endif
+
 .macro apicinterrupt num sym do_sym
+PUSH_SECTION_IRQENTRY
 apicinterrupt3 \num \sym \do_sym
 trace_apicinterrupt \num \sym
+POP_SECTION_IRQENTRY
 .endm
 
 #ifdef CONFIG_SMP
-- 
cgit v0.10.2


From c7d2361f7524f365c1ae42f47880e3fa9efb2c2a Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Tue, 9 Aug 2016 10:11:04 -0700
Subject: x86/mm/KASLR: Fix physical memory calculation on KASLR memory
 randomization

Initialize KASLR memory randomization after max_pfn is initialized. Also
ensure the size is rounded up. It could create problems on machines
with more than 1Tb of memory on certain random addresses.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
Cc: Aleksey Makarov <aleksey.makarov@linaro.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lv Zheng <lv.zheng@intel.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: kernel-hardening@lists.openwall.com
Fixes: 021182e52fe0 ("Enable KASLR for physical mapping memory regions")
Link: http://lkml.kernel.org/r/1470762665-88032-1-git-send-email-thgarnie@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 991b779..95cf31c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -936,8 +936,6 @@ void __init setup_arch(char **cmdline_p)
 
 	x86_init.oem.arch_setup();
 
-	kernel_randomize_memory();
-
 	iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
 	setup_memory_map();
 	parse_setup_data();
@@ -1055,6 +1053,12 @@ void __init setup_arch(char **cmdline_p)
 
 	max_possible_pfn = max_pfn;
 
+	/*
+	 * Define random base addresses for memory sections after max_pfn is
+	 * defined and before each memory section base is used.
+	 */
+	kernel_randomize_memory();
+
 #ifdef CONFIG_X86_32
 	/* max_low_pfn get updated here */
 	find_low_pfn_range();
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 26dccd6..ec8654f 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -97,7 +97,7 @@ void __init kernel_randomize_memory(void)
 	 * add padding if needed (especially for memory hotplug support).
 	 */
 	BUG_ON(kaslr_regions[0].base != &page_offset_base);
-	memory_tb = ((max_pfn << PAGE_SHIFT) >> TB_SHIFT) +
+	memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) +
 		CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 
 	/* Adapt phyiscal memory region size based on available memory */
-- 
cgit v0.10.2


From fb754f958f8e46202c1efd7f66d5b3db1208117d Mon Sep 17 00:00:00 2001
From: Thomas Garnier <thgarnie@google.com>
Date: Tue, 9 Aug 2016 10:11:05 -0700
Subject: x86/mm/KASLR: Increase BRK pages for KASLR memory randomization

Default implementation expects 6 pages maximum are needed for low page
allocations. If KASLR memory randomization is enabled, the worse case
of e820 layout would require 12 pages (no large pages). It is due to the
PUD level randomization and the variable e820 memory layout.

This bug was found while doing extensive testing of KASLR memory
randomization on different type of hardware.

Signed-off-by: Thomas Garnier <thgarnie@google.com>
Cc: Aleksey Makarov <aleksey.makarov@linaro.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Baoquan He <bhe@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dan Williams <dan.j.williams@intel.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Young <dyoung@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: Fabian Frederick <fabf@skynet.be>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Joerg Roedel <jroedel@suse.de>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Lv Zheng <lv.zheng@intel.com>
Cc: Mark Salter <msalter@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Rafael J . Wysocki <rafael.j.wysocki@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Cc: kernel-hardening@lists.openwall.com
Fixes: 021182e52fe0 ("Enable KASLR for physical mapping memory regions")
Link: http://lkml.kernel.org/r/1470762665-88032-2-git-send-email-thgarnie@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 6209289..d28a2d7 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -122,8 +122,18 @@ __ref void *alloc_low_pages(unsigned int num)
 	return __va(pfn << PAGE_SHIFT);
 }
 
-/* need 3 4k for initial PMD_SIZE,  3 4k for 0-ISA_END_ADDRESS */
-#define INIT_PGT_BUF_SIZE	(6 * PAGE_SIZE)
+/*
+ * By default need 3 4k for initial PMD_SIZE,  3 4k for 0-ISA_END_ADDRESS.
+ * With KASLR memory randomization, depending on the machine e820 memory
+ * and the PUD alignment. We may need twice more pages when KASLR memory
+ * randomization is enabled.
+ */
+#ifndef CONFIG_RANDOMIZE_MEMORY
+#define INIT_PGD_PAGE_COUNT      6
+#else
+#define INIT_PGD_PAGE_COUNT      12
+#endif
+#define INIT_PGT_BUF_SIZE	(INIT_PGD_PAGE_COUNT * PAGE_SIZE)
 RESERVE_BRK(early_pgt_alloc, INIT_PGT_BUF_SIZE);
 void  __init early_alloc_pgt_buf(void)
 {
-- 
cgit v0.10.2


From 5cf0791da5c162ebc14b01eb01631cfa7ed4fa6e Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 5 Aug 2016 15:37:39 +0200
Subject: x86/mm: Disable preemption during CR3 read+write
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

There's a subtle preemption race on UP kernels:

Usually current->mm (and therefore mm->pgd) stays the same during the
lifetime of a task so it does not matter if a task gets preempted during
the read and write of the CR3.

But then, there is this scenario on x86-UP:

TaskA is in do_exit() and exit_mm() sets current->mm = NULL followed by:

 -> mmput()
 -> exit_mmap()
 -> tlb_finish_mmu()
 -> tlb_flush_mmu()
 -> tlb_flush_mmu_tlbonly()
 -> tlb_flush()
 -> flush_tlb_mm_range()
 -> __flush_tlb_up()
 -> __flush_tlb()
 ->  __native_flush_tlb()

At this point current->mm is NULL but current->active_mm still points to
the "old" mm.

Let's preempt taskA _after_ native_read_cr3() by taskB. TaskB has its
own mm so CR3 has changed.

Now preempt back to taskA. TaskA has no ->mm set so it borrows taskB's
mm and so CR3 remains unchanged. Once taskA gets active it continues
where it was interrupted and that means it writes its old CR3 value
back. Everything is fine because userland won't need its memory
anymore.

Now the fun part:

Let's preempt taskA one more time and get back to taskB. This
time switch_mm() won't do a thing because oldmm (->active_mm)
is the same as mm (as per context_switch()). So we remain
with a bad CR3 / PGD and return to userland.

The next thing that happens is handle_mm_fault() with an address for
the execution of its code in userland. handle_mm_fault() realizes that
it has a PTE with proper rights so it returns doing nothing. But the
CPU looks at the wrong PGD and insists that something is wrong and
faults again. And again. And one more time…

This pagefault circle continues until the scheduler gets tired of it and
puts another task on the CPU. It gets little difficult if the task is a
RT task with a high priority. The system will either freeze or it gets
fixed by the software watchdog thread which usually runs at RT-max prio.
But waiting for the watchdog will increase the latency of the RT task
which is no good.

Fix this by disabling preemption across the critical code section.

Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Rik van Riel <riel@redhat.com>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Borislav Petkov <bp@suse.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mel Gorman <mgorman@suse.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-mm@kvack.org
Cc: stable@vger.kernel.org
Link: http://lkml.kernel.org/r/1470404259-26290-1-git-send-email-bigeasy@linutronix.de
[ Prettified the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 4e5be94..6fa8594 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -135,7 +135,14 @@ static inline void cr4_set_bits_and_update_boot(unsigned long mask)
 
 static inline void __native_flush_tlb(void)
 {
+	/*
+	 * If current->mm == NULL then we borrow a mm which may change during a
+	 * task switch and therefore we must not be preempted while we write CR3
+	 * back:
+	 */
+	preempt_disable();
 	native_write_cr3(native_read_cr3());
+	preempt_enable();
 }
 
 static inline void __native_flush_tlb_global_irq_disabled(void)
-- 
cgit v0.10.2


From 3e035305875cfa8a58c1ca573d0cfa6a7f201f27 Mon Sep 17 00:00:00 2001
From: Borislav Petkov <bp@alien8.de>
Date: Wed, 3 Aug 2016 19:14:29 +0200
Subject: x86/entry: Clarify the RF saving/restoring situation with
 SYSCALL/SYSRET

Clarify why exactly RF cannot be restored properly by SYSRET to avoid
confusion.

No functionality change.

Signed-off-by: Borislav Petkov <bp@suse.de>
Acked-by: Andy Lutomirski <luto@amacapital.net>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160803171429.GA2590@nazgul.tnic
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 9f85827..d172c61 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -288,11 +288,15 @@ return_from_SYSCALL_64:
 	jne	opportunistic_sysret_failed
 
 	/*
-	 * SYSRET can't restore RF.  SYSRET can restore TF, but unlike IRET,
-	 * restoring TF results in a trap from userspace immediately after
-	 * SYSRET.  This would cause an infinite loop whenever #DB happens
-	 * with register state that satisfies the opportunistic SYSRET
-	 * conditions.  For example, single-stepping this user code:
+	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
+	 * restore RF properly. If the slowpath sets it for whatever reason, we
+	 * need to restore it correctly.
+	 *
+	 * SYSRET can restore TF, but unlike IRET, restoring TF results in a
+	 * trap from userspace immediately after SYSRET.  This would cause an
+	 * infinite loop whenever #DB happens with register state that satisfies
+	 * the opportunistic SYSRET conditions.  For example, single-stepping
+	 * this user code:
 	 *
 	 *           movq	$stuck_here, %rcx
 	 *           pushfq
-- 
cgit v0.10.2


From 054f621fd5b1c7245710f5d3935c94ce6ae4b3b7 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 1 Aug 2016 13:40:50 -0500
Subject: x86/platform/UV: Fix problem with UV4 Socket IDs not being contiguous

The UV4 Socket IDs are not guaranteed to equate to Node values which
can cause the GAM (Global Addressable Memory) table lookups to fail.
Fix this by using an independent index into the GAM table instead of
the Socket ID to reference the base address.

Tested-by: Frank Ramsay <framsay@sgi.com>
Tested-by: John Estabrook <estabrook@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Dimitri Sivanich <sivanich@sgi.com>
Reviewed-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160801184050.048755337@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 09b59ad..d918733 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -325,7 +325,7 @@ static __init void build_uv_gr_table(void)
 	struct uv_gam_range_entry *gre = uv_gre_table;
 	struct uv_gam_range_s *grt;
 	unsigned long last_limit = 0, ram_limit = 0;
-	int bytes, i, sid, lsid = -1;
+	int bytes, i, sid, lsid = -1, indx = 0, lindx = -1;
 
 	if (!gre)
 		return;
@@ -356,11 +356,12 @@ static __init void build_uv_gr_table(void)
 		}
 		sid = gre->sockid - _min_socket;
 		if (lsid < sid) {		/* new range */
-			grt = &_gr_table[sid];
-			grt->base = lsid;
+			grt = &_gr_table[indx];
+			grt->base = lindx;
 			grt->nasid = gre->nasid;
 			grt->limit = last_limit = gre->limit;
 			lsid = sid;
+			lindx = indx++;
 			continue;
 		}
 		if (lsid == sid && !ram_limit) {	/* update range */
@@ -371,7 +372,7 @@ static __init void build_uv_gr_table(void)
 		}
 		if (!ram_limit) {		/* non-contiguous ram range */
 			grt++;
-			grt->base = sid - 1;
+			grt->base = lindx;
 			grt->nasid = gre->nasid;
 			grt->limit = last_limit = gre->limit;
 			continue;
-- 
cgit v0.10.2


From e363d24c2b997c421476c6aa00547edadf678efe Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 1 Aug 2016 13:40:51 -0500
Subject: x86/platform/UV: Fix bug with iounmap() of the UV4 EFI System Table
 causing a crash

Save the uv_systab::size field before doing the iounmap()
of the struct pointer, to avoid a NULL dereference crash.

Tested-by: Frank Ramsay <framsay@sgi.com>
Tested-by: John Estabrook <estabrook@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Dimitri Sivanich <sivanich@sgi.com>
Reviewed-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160801184050.250424783@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 66b2166..4e9fd13 100644
--- a/arch/x86/platform/uv/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
@@ -199,12 +199,14 @@ void uv_bios_init(void)
 		return;
 	}
 
+	/* Starting with UV4 the UV systab size is variable */
 	if (uv_systab->revision >= UV_SYSTAB_VERSION_UV4) {
+		int size = uv_systab->size;
+
 		iounmap(uv_systab);
-		uv_systab = ioremap(efi.uv_systab, uv_systab->size);
+		uv_systab = ioremap(efi.uv_systab, size);
 		if (!uv_systab) {
-			pr_err("UV: UVsystab: ioremap(%d) failed!\n",
-				uv_systab->size);
+			pr_err("UV: UVsystab: ioremap(%d) failed!\n", size);
 			return;
 		}
 	}
-- 
cgit v0.10.2


From 22ac2bca92f2d92c6495248d65ff648182df428d Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 1 Aug 2016 13:40:52 -0500
Subject: x86/platform/UV: Fix problem with UV4 BIOS providing incorrect PXM
 values

There are some circumstances where the UV4 BIOS cannot provide the
correct Proximity Node values to associate with specific Sockets and
Physical Nodes.  The decision was made to remove these values from BIOS
and for the kernel to get these values from the standard ACPI tables.

Tested-by: Frank Ramsay <framsay@sgi.com>
Tested-by: John Estabrook <estabrook@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Dimitri Sivanich <sivanich@sgi.com>
Reviewed-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160801184050.414210079@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index c852590..e652a7c 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -79,7 +79,7 @@ struct uv_gam_range_entry {
 	u16	nasid;		/* HNasid */
 	u16	sockid;		/* Socket ID, high bits of APIC ID */
 	u16	pnode;		/* Index to MMR and GRU spaces */
-	u32	pxm;		/* ACPI proximity domain number */
+	u32	unused2;
 	u32	limit;		/* PA bits 56:26 (UV_GAM_RANGE_SHFT) */
 };
 
@@ -88,7 +88,8 @@ struct uv_gam_range_entry {
 #define	UV_SYSTAB_VERSION_UV4		0x400	/* UV4 BIOS base version */
 #define	UV_SYSTAB_VERSION_UV4_1		0x401	/* + gpa_shift */
 #define	UV_SYSTAB_VERSION_UV4_2		0x402	/* + TYPE_NVRAM/WINDOW/MBOX */
-#define	UV_SYSTAB_VERSION_UV4_LATEST	UV_SYSTAB_VERSION_UV4_2
+#define	UV_SYSTAB_VERSION_UV4_3		0x403	/* - GAM Range PXM Value */
+#define	UV_SYSTAB_VERSION_UV4_LATEST	UV_SYSTAB_VERSION_UV4_3
 
 #define	UV_SYSTAB_TYPE_UNUSED		0	/* End of table (offset == 0) */
 #define	UV_SYSTAB_TYPE_GAM_PARAMS	1	/* GAM PARAM conversions */
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index d918733..6aa0545 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -1156,19 +1156,18 @@ static void __init decode_gam_rng_tbl(unsigned long ptr)
 	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
 		if (!index) {
 			pr_info("UV: GAM Range Table...\n");
-			pr_info("UV:  # %20s %14s %5s %4s %5s %3s %2s %3s\n",
+			pr_info("UV:  # %20s %14s %5s %4s %5s %3s %2s\n",
 				"Range", "", "Size", "Type", "NASID",
-				"SID", "PN", "PXM");
+				"SID", "PN");
 		}
 		pr_info(
-		"UV: %2d: 0x%014lx-0x%014lx %5luG %3d   %04x  %02x %02x %3d\n",
+		"UV: %2d: 0x%014lx-0x%014lx %5luG %3d   %04x  %02x %02x\n",
 			index++,
 			(unsigned long)lgre << UV_GAM_RANGE_SHFT,
 			(unsigned long)gre->limit << UV_GAM_RANGE_SHFT,
 			((unsigned long)(gre->limit - lgre)) >>
 				(30 - UV_GAM_RANGE_SHFT), /* 64M -> 1G */
-			gre->type, gre->nasid, gre->sockid,
-			gre->pnode, gre->pxm);
+			gre->type, gre->nasid, gre->sockid, gre->pnode);
 
 		lgre = gre->limit;
 		if (sock_min > gre->sockid)
@@ -1287,7 +1286,7 @@ static void __init build_socket_tables(void)
 		_pnode_to_socket[i] = SOCK_EMPTY;
 
 	/* fill in pnode/node/addr conversion list values */
-	pr_info("UV: GAM Building socket/pnode/pxm conversion tables\n");
+	pr_info("UV: GAM Building socket/pnode conversion tables\n");
 	for (; gre->type != UV_GAM_RANGE_TYPE_UNUSED; gre++) {
 		if (gre->type == UV_GAM_RANGE_TYPE_HOLE)
 			continue;
@@ -1295,20 +1294,18 @@ static void __init build_socket_tables(void)
 		if (_socket_to_pnode[i] != SOCK_EMPTY)
 			continue;	/* duplicate */
 		_socket_to_pnode[i] = gre->pnode;
-		_socket_to_node[i] = gre->pxm;
 
 		i = gre->pnode - minpnode;
 		_pnode_to_socket[i] = gre->sockid;
 
 		pr_info(
-		"UV: sid:%02x type:%d nasid:%04x pn:%02x pxm:%2d pn2s:%2x\n",
+		"UV: sid:%02x type:%d nasid:%04x pn:%02x pn2s:%2x\n",
 			gre->sockid, gre->type, gre->nasid,
 			_socket_to_pnode[gre->sockid - minsock],
-			_socket_to_node[gre->sockid - minsock],
 			_pnode_to_socket[gre->pnode - minpnode]);
 	}
 
-	/* check socket -> node values */
+	/* Set socket -> node values */
 	lnid = -1;
 	for_each_present_cpu(cpu) {
 		int nid = cpu_to_node(cpu);
@@ -1319,14 +1316,9 @@ static void __init build_socket_tables(void)
 		lnid = nid;
 		apicid = per_cpu(x86_cpu_to_apicid, cpu);
 		sockid = apicid >> uv_cpuid.socketid_shift;
-		i = sockid - minsock;
-
-		if (nid != _socket_to_node[i]) {
-			pr_warn(
-			"UV: %02x: type:%d socket:%02x PXM:%02x != node:%2d\n",
-				i, sockid, gre->type, _socket_to_node[i], nid);
-			_socket_to_node[i] = nid;
-		}
+		_socket_to_node[sockid - minsock] = nid;
+		pr_info("UV: sid:%02x: apicid:%04x node:%2d\n",
+			sockid, apicid, nid);
 	}
 
 	/* Setup physical blade to pnode translation from GAM Range Table */
-- 
cgit v0.10.2


From 5a52e8f822374bebc702bb2688ed8b5515bbb55b Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Mon, 1 Aug 2016 13:40:53 -0500
Subject: x86/platform/UV: Fix kernel panic running RHEL kdump kernel on UV
 systems

The latest UV kernel support panics when RHEL7 kexec's the kdump kernel
to make a dumpfile.  This patch fixes the problem by turning off all UV
support if NUMA is off.

Tested-by: Frank Ramsay <framsay@sgi.com>
Tested-by: John Estabrook <estabrook@sgi.com>
Signed-off-by: Mike Travis <travis@sgi.com>
Reviewed-by: Dimitri Sivanich <sivanich@sgi.com>
Reviewed-by: Nathan Zimmer <nzimmer@sgi.com>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Banman <abanman@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russ Anderson <rja@sgi.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160801184050.577755634@asylum.americas.sgi.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 6aa0545..cb0673c 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -223,6 +223,11 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 	if (strncmp(oem_id, "SGI", 3) != 0)
 		return 0;
 
+	if (numa_off) {
+		pr_err("UV: NUMA is off, disabling UV support\n");
+		return 0;
+	}
+
 	/* Setup early hub type field in uv_hub_info for Node 0 */
 	uv_cpu_info->p_uv_hub_info = &uv_hub_info_node0;
 
-- 
cgit v0.10.2


From 5e44258d168b2bdee51d9e2e1f1f4726ff9775cd Mon Sep 17 00:00:00 2001
From: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Date: Sun, 31 Jul 2016 23:24:50 -0400
Subject: x86/build: Reduce the W=1 warnings noise when compiling x86 syscall
 tables

Building an X86_64 kernel with W=1 throws a total of 9,948 lines of warnings of
this form for both 32-bit and 64-bit syscall tables. Given that the entire rest
of the build for my config only generates 8,375 lines of output, this is a big
reduction in the warnings generated.

The warnings follow this pattern:

  ./arch/x86/include/generated/asm/syscalls_32.h:885:21: warning: initialized field overwritten [-Woverride-init]
   __SYSCALL_I386(379, compat_sys_pwritev2, )
                     ^
  arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386'
   #define __SYSCALL_I386(nr, sym, qual) [nr] = sym,
                                              ^~~
  ./arch/x86/include/generated/asm/syscalls_32.h:885:21: note: (near initialization for 'ia32_sys_call_table[379]')
   __SYSCALL_I386(379, compat_sys_pwritev2, )
                     ^
  arch/x86/entry/syscall_32.c:13:46: note: in definition of macro '__SYSCALL_I386'
   #define __SYSCALL_I386(nr, sym, qual) [nr] = sym,

Since we intentionally build the syscall tables this way, ignore that one
warning in the two files.

Signed-off-by: Valdis Kletnieks <valdis.kletnieks@vt.edu>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/7464.1470021890@turing-police.cc.vt.edu
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/entry/Makefile b/arch/x86/entry/Makefile
index fe91c25..77f28ce 100644
--- a/arch/x86/entry/Makefile
+++ b/arch/x86/entry/Makefile
@@ -5,6 +5,8 @@
 OBJECT_FILES_NON_STANDARD_entry_$(BITS).o   := y
 OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y
 
+CFLAGS_syscall_64.o		+= -Wno-override-init
+CFLAGS_syscall_32.o		+= -Wno-override-init
 obj-y				:= entry_$(BITS).o thunk_$(BITS).o syscall_$(BITS).o
 obj-y				+= common.o
 
-- 
cgit v0.10.2


From b79daf85899215d5ede3641806db2e2a77b776b4 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 27 Jul 2016 16:20:40 -0700
Subject: x86/mm/pkeys: Fix compact mode by removing protection keys' XSAVE
 buffer manipulation

The Memory Protection Keys "rights register" (PKRU) is
XSAVE-managed, and is saved/restored along with the FPU state.

When kernel code accesses FPU regsisters, it does a delicate
dance with preempt.  Otherwise, the context switching code can
get confused as to whether the most up-to-date state is in the
registers themselves or in the XSAVE buffer.

But, PKRU is not a normal FPU register.  Using it does not
generate the normal device-not-available (#NM) exceptions which
means we can not manage it lazily, and the kernel completley
disallows using lazy mode when it is enabled.

The dance with preempt *only* occurs when managing the FPU
lazily.  Since we never manage PKRU lazily, we do not have to do
the dance with preempt; we can access it directly.  Doing it
this way saves a ton of complicated code (and is faster too).

Further, the XSAVES reenabling failed to patch a bit of code
in fpu__xfeature_set_state() the checked for compacted buffers.
That check caused fpu__xfeature_set_state() to silently refuse to
work when the kernel is using compacted XSAVE buffers.  This
broke execute-only and future pkey_mprotect() support when using
compact XSAVE buffers.

But, removing fpu__xfeature_set_state() gets rid of this issue,
in addition to the nice cleanup and speedup.

This fixes the same thing as a fix that Sai posted:

  https://lkml.org/lkml/2016/7/25/637

The fix that he posted is a much more obviously correct, but I
think we should just do this instead.

Reported-by: Sai Praneeth Prakhya <sai.praneeth.prakhya@intel.com>
Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Fenghua Yu <fenghua.yu@intel.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Ravi Shankar <ravi.v.shankar@intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yu-Cheng Yu <yu-cheng.yu@intel.com>
Link: http://lkml.kernel.org/r/20160727232040.7D060DAD@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index 680049a..01567aa 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -866,105 +866,17 @@ const void *get_xsave_field_ptr(int xsave_state)
 	return get_xsave_addr(&fpu->state.xsave, xsave_state);
 }
 
-
-/*
- * Set xfeatures (aka XSTATE_BV) bit for a feature that we want
- * to take out of its "init state".  This will ensure that an
- * XRSTOR actually restores the state.
- */
-static void fpu__xfeature_set_non_init(struct xregs_state *xsave,
-		int xstate_feature_mask)
-{
-	xsave->header.xfeatures |= xstate_feature_mask;
-}
-
-/*
- * This function is safe to call whether the FPU is in use or not.
- *
- * Note that this only works on the current task.
- *
- * Inputs:
- *	@xsave_state: state which is defined in xsave.h (e.g. XFEATURE_MASK_FP,
- *	XFEATURE_MASK_SSE, etc...)
- *	@xsave_state_ptr: a pointer to a copy of the state that you would
- *	like written in to the current task's FPU xsave state.  This pointer
- *	must not be located in the current tasks's xsave area.
- * Output:
- *	address of the state in the xsave area or NULL if the state
- *	is not present or is in its 'init state'.
- */
-static void fpu__xfeature_set_state(int xstate_feature_mask,
-		void *xstate_feature_src, size_t len)
-{
-	struct xregs_state *xsave = &current->thread.fpu.state.xsave;
-	struct fpu *fpu = &current->thread.fpu;
-	void *dst;
-
-	if (!boot_cpu_has(X86_FEATURE_XSAVE)) {
-		WARN_ONCE(1, "%s() attempted with no xsave support", __func__);
-		return;
-	}
-
-	/*
-	 * Tell the FPU code that we need the FPU state to be in
-	 * 'fpu' (not in the registers), and that we need it to
-	 * be stable while we write to it.
-	 */
-	fpu__current_fpstate_write_begin();
-
-	/*
-	 * This method *WILL* *NOT* work for compact-format
-	 * buffers.  If the 'xstate_feature_mask' is unset in
-	 * xcomp_bv then we may need to move other feature state
-	 * "up" in the buffer.
-	 */
-	if (xsave->header.xcomp_bv & xstate_feature_mask) {
-		WARN_ON_ONCE(1);
-		goto out;
-	}
-
-	/* find the location in the xsave buffer of the desired state */
-	dst = __raw_xsave_addr(&fpu->state.xsave, xstate_feature_mask);
-
-	/*
-	 * Make sure that the pointer being passed in did not
-	 * come from the xsave buffer itself.
-	 */
-	WARN_ONCE(xstate_feature_src == dst, "set from xsave buffer itself");
-
-	/* put the caller-provided data in the location */
-	memcpy(dst, xstate_feature_src, len);
-
-	/*
-	 * Mark the xfeature so that the CPU knows there is state
-	 * in the buffer now.
-	 */
-	fpu__xfeature_set_non_init(xsave, xstate_feature_mask);
-out:
-	/*
-	 * We are done writing to the 'fpu'.  Reenable preeption
-	 * and (possibly) move the fpstate back in to the fpregs.
-	 */
-	fpu__current_fpstate_write_end();
-}
-
 #define NR_VALID_PKRU_BITS (CONFIG_NR_PROTECTION_KEYS * 2)
 #define PKRU_VALID_MASK (NR_VALID_PKRU_BITS - 1)
 
 /*
- * This will go out and modify the XSAVE buffer so that PKRU is
- * set to a particular state for access to 'pkey'.
- *
- * PKRU state does affect kernel access to user memory.  We do
- * not modfiy PKRU *itself* here, only the XSAVE state that will
- * be restored in to PKRU when we return back to userspace.
+ * This will go out and modify PKRU register to set the access
+ * rights for @pkey to @init_val.
  */
 int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 		unsigned long init_val)
 {
-	struct xregs_state *xsave = &tsk->thread.fpu.state.xsave;
-	struct pkru_state *old_pkru_state;
-	struct pkru_state new_pkru_state;
+	u32 old_pkru;
 	int pkey_shift = (pkey * PKRU_BITS_PER_PKEY);
 	u32 new_pkru_bits = 0;
 
@@ -974,6 +886,15 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 	 */
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
 		return -EINVAL;
+	/*
+	 * For most XSAVE components, this would be an arduous task:
+	 * brining fpstate up to date with fpregs, updating fpstate,
+	 * then re-populating fpregs.  But, for components that are
+	 * never lazily managed, we can just access the fpregs
+	 * directly.  PKRU is never managed lazily, so we can just
+	 * manipulate it directly.  Make sure it stays that way.
+	 */
+	WARN_ON_ONCE(!use_eager_fpu());
 
 	/* Set the bits we need in PKRU:  */
 	if (init_val & PKEY_DISABLE_ACCESS)
@@ -984,37 +905,12 @@ int arch_set_user_pkey_access(struct task_struct *tsk, int pkey,
 	/* Shift the bits in to the correct place in PKRU for pkey: */
 	new_pkru_bits <<= pkey_shift;
 
-	/* Locate old copy of the state in the xsave buffer: */
-	old_pkru_state = get_xsave_addr(xsave, XFEATURE_MASK_PKRU);
-
-	/*
-	 * When state is not in the buffer, it is in the init
-	 * state, set it manually.  Otherwise, copy out the old
-	 * state.
-	 */
-	if (!old_pkru_state)
-		new_pkru_state.pkru = 0;
-	else
-		new_pkru_state.pkru = old_pkru_state->pkru;
-
-	/* Mask off any old bits in place: */
-	new_pkru_state.pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
-
-	/* Set the newly-requested bits: */
-	new_pkru_state.pkru |= new_pkru_bits;
-
-	/*
-	 * We could theoretically live without zeroing pkru.pad.
-	 * The current XSAVE feature state definition says that
-	 * only bytes 0->3 are used.  But we do not want to
-	 * chance leaking kernel stack out to userspace in case a
-	 * memcpy() of the whole xsave buffer was done.
-	 *
-	 * They're in the same cacheline anyway.
-	 */
-	new_pkru_state.pad = 0;
+	/* Get old PKRU and mask off any old bits in place: */
+	old_pkru = read_pkru();
+	old_pkru &= ~((PKRU_AD_BIT|PKRU_WD_BIT) << pkey_shift);
 
-	fpu__xfeature_set_state(XFEATURE_MASK_PKRU, &new_pkru_state, sizeof(new_pkru_state));
+	/* Write old part along with new part: */
+	write_pkru(old_pkru | new_pkru_bits);
 
 	return 0;
 }
-- 
cgit v0.10.2


From 62d16b5a3fca4d186e13215e0d7d2f6d36191796 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Sat, 6 Aug 2016 12:20:39 +0200
Subject: x86/mm/kaslr: Fix -Wformat-security warning

debug_putstr() is used to output strings without using printf-like
formatting but debug_putstr(v) is defined as early_printk(v) in
arch/x86/lib/kaslr.c.

This makes clang reports the following warning when building
with -Wformat-security:

    arch/x86/lib/kaslr.c:57:15: warning: format string is not a string
    literal (potentially insecure) [-Wformat-security]
            debug_putstr(purpose);
                         ^~~~~~~

Fix this by using "%s" in early_printk().

Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Acked-by: Kees Cook <keescook@chromium.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/20160806102039.27221-1-nicolas.iooss_linux@m4x.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/lib/kaslr.c b/arch/x86/lib/kaslr.c
index f7dfeda..121f59c 100644
--- a/arch/x86/lib/kaslr.c
+++ b/arch/x86/lib/kaslr.c
@@ -19,7 +19,7 @@
 #include <asm/cpufeature.h>
 #include <asm/setup.h>
 
-#define debug_putstr(v) early_printk(v)
+#define debug_putstr(v) early_printk("%s", v)
 #define has_cpuflag(f) boot_cpu_has(f)
 #define get_boot_seed() kaslr_offset()
 #endif
-- 
cgit v0.10.2


From ace7fab7a6cdd363a615ec537f2aa94dbc761ee2 Mon Sep 17 00:00:00 2001
From: Dave Hansen <dave.hansen@linux.intel.com>
Date: Wed, 10 Aug 2016 10:23:25 -0700
Subject: x86/mm: Fix swap entry comment and macro

A recent patch changed the format of a swap PTE.

The comment explaining the format of the swap PTE is wrong about
the bits used for the swap type field.  Amusingly, the ASCII art
and the patch description are correct, but the comment itself
is wrong.

As I was looking at this, I also noticed that the
SWP_OFFSET_FIRST_BIT has an off-by-one error.  This does not
really hurt anything.  It just wasted a bit of space in the PTE,
giving us 2^59 bytes of addressable space in our swapfiles
instead of 2^60.  But, it doesn't match with the comments, and it
wastes a bit of space, so fix it.

Signed-off-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Dave Hansen <dave@sr71.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Luis R. Rodriguez <mcgrof@suse.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Toshi Kani <toshi.kani@hp.com>
Fixes: 00839ee3b299 ("x86/mm: Move swap offset/type up in PTE to work around erratum")
Link: http://lkml.kernel.org/r/20160810172325.E56AD7DA@viggo.jf.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 7e8ec7a..1cc82ec 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -145,7 +145,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
  *
  * |     ...            | 11| 10|  9|8|7|6|5| 4| 3|2|1|0| <- bit number
  * |     ...            |SW3|SW2|SW1|G|L|D|A|CD|WT|U|W|P| <- bit names
- * | OFFSET (14->63) | TYPE (10-13) |0|X|X|X| X| X|X|X|0| <- swp entry
+ * | OFFSET (14->63) | TYPE (9-13)  |0|X|X|X| X| X|X|X|0| <- swp entry
  *
  * G (8) is aliased and used as a PROT_NONE indicator for
  * !present ptes.  We need to start storing swap entries above
@@ -156,7 +156,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define SWP_TYPE_FIRST_BIT (_PAGE_BIT_PROTNONE + 1)
 #define SWP_TYPE_BITS 5
 /* Place the offset above the type: */
-#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS + 1)
+#define SWP_OFFSET_FIRST_BIT (SWP_TYPE_FIRST_BIT + SWP_TYPE_BITS)
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
-- 
cgit v0.10.2


From 82ba4faca1bffad429f15c90c980ffd010366c25 Mon Sep 17 00:00:00 2001
From: Aaron Lu <aaron.lu@intel.com>
Date: Thu, 11 Aug 2016 15:44:30 +0800
Subject: x86/irq: Do not substract irq_tlb_count from irq_call_count

Since commit:

  52aec3308db8 ("x86/tlb: replace INVALIDATE_TLB_VECTOR by CALL_FUNCTION_VECTOR")

the TLB remote shootdown is done through call function vector. That
commit didn't take care of irq_tlb_count, which a later commit:

  fd0f5869724f ("x86: Distinguish TLB shootdown interrupts from other functions call interrupts")

... tried to fix.

The fix assumes every increase of irq_tlb_count has a corresponding
increase of irq_call_count. So the irq_call_count is always bigger than
irq_tlb_count and we could substract irq_tlb_count from irq_call_count.

Unfortunately this is not true for the smp_call_function_single() case.
The IPI is only sent if the target CPU's call_single_queue is empty when
adding a csd into it in generic_exec_single. That means if two threads
are both adding flush tlb csds to the same CPU's call_single_queue, only
one IPI is sent. In other words, the irq_call_count is incremented by 1
but irq_tlb_count is incremented by 2. Over time, irq_tlb_count will be
bigger than irq_call_count and the substract will produce a very large
irq_call_count value due to overflow.

Considering that:

  1) it's not worth to send more IPIs for the sake of accurate counting of
     irq_call_count in generic_exec_single();

  2) it's not easy to tell if the call function interrupt is for TLB
     shootdown in __smp_call_function_single_interrupt().

Not to exclude TLB shootdown from call function count seems to be the
simplest fix and this patch just does that.

This bug was found by LKP's cyclic performance regression tracking recently
with the vm-scalability test suite. I have bisected to commit:

  3dec0ba0be6a ("mm/rmap: share the i_mmap_rwsem")

This commit didn't do anything wrong but revealed the irq_call_count
problem. IIUC, the commit makes rwc->remap_one in rmap_walk_file
concurrent with multiple threads.  When remap_one is try_to_unmap_one(),
then multiple threads could queue flush TLB to the same CPU but only
one IPI will be sent.

Since the commit was added in Linux v3.19, the counting problem only
shows up from v3.19 onwards.

Signed-off-by: Aaron Lu <aaron.lu@intel.com>
Cc: Alex Shi <alex.shi@linaro.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Huang Ying <ying.huang@intel.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Tomoki Sekiyama <tomoki.sekiyama.qu@hitachi.com>
Link: http://lkml.kernel.org/r/20160811074430.GA18163@aaronlu.sh.intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 7178043..59405a2 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -22,10 +22,6 @@ typedef struct {
 #ifdef CONFIG_SMP
 	unsigned int irq_resched_count;
 	unsigned int irq_call_count;
-	/*
-	 * irq_tlb_count is double-counted in irq_call_count, so it must be
-	 * subtracted from irq_call_count when displaying irq_call_count
-	 */
 	unsigned int irq_tlb_count;
 #endif
 #ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 61521dc..9f669fd 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -102,8 +102,7 @@ int arch_show_interrupts(struct seq_file *p, int prec)
 	seq_puts(p, "  Rescheduling interrupts\n");
 	seq_printf(p, "%*s: ", prec, "CAL");
 	for_each_online_cpu(j)
-		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count -
-					irq_stats(j)->irq_tlb_count);
+		seq_printf(p, "%10u ", irq_stats(j)->irq_call_count);
 	seq_puts(p, "  Function call interrupts\n");
 	seq_printf(p, "%*s: ", prec, "TLB");
 	for_each_online_cpu(j)
-- 
cgit v0.10.2


From 007b756053386af079ba963a8f5817ac651c7c59 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:13 -0700
Subject: x86/boot: Run reserve_bios_regions() after we initialize the memory
 map

reserve_bios_regions() is a quirk that reserves memory that we might
otherwise think is available.  There's no need to run it so early,
and running it before we have the memory map initialized with its
non-quirky inputs makes it hard to make reserve_bios_regions() more
intelligent.

Move it right after we populate the memblock state.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/59f58618911005c799c6c9979ce6ae4881d907c2.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 2dda0bc..f16c55b 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -25,8 +25,6 @@ static void __init i386_default_early_setup(void)
 	/* Initialize 32bit specific setup functions */
 	x86_init.resources.reserve_resources = i386_reserve_resources;
 	x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
-
-	reserve_bios_regions();
 }
 
 asmlinkage __visible void __init i386_start_kernel(void)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 99d48e7..54a2372 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -183,7 +183,6 @@ void __init x86_64_start_reservations(char *real_mode_data)
 		copy_bootdata(__va(real_mode_data));
 
 	x86_early_init_platform_quirks();
-	reserve_bios_regions();
 
 	switch (boot_params.hdr.hardware_subarch) {
 	case X86_SUBARCH_INTEL_MID:
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 95cf31c..bf780e0 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1101,6 +1101,8 @@ void __init setup_arch(char **cmdline_p)
 		efi_find_mirror();
 	}
 
+	reserve_bios_regions();
+
 	/*
 	 * The EFI specification says that boot service code won't be called
 	 * after ExitBootServices(). This is, in fact, a lie.
-- 
cgit v0.10.2


From 18bc7bd523e0fc5be8d76bf84bde733a97a8c375 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:14 -0700
Subject: x86/boot: Synchronize trampoline_cr4_features and mmu_cr4_features
 directly

The initialization process for trampoline_cr4_features and
mmu_cr4_features was confusing.  The intent is for mmu_cr4_features
and *trampoline_cr4_features to stay in sync, but
trampoline_cr4_features is NULL until setup_real_mode() runs.  The
old code synchronized *trampoline_cr4_features *twice*, once in
setup_real_mode() and once in setup_arch().  It also initialized
mmu_cr4_features in setup_real_mode(), which causes the actual value
of mmu_cr4_features to potentially depend on when setup_real_mode()
is called.

With this patch, mmu_cr4_features is initialized directly in
setup_arch(), and *trampoline_cr4_features is synchronized to
mmu_cr4_features when the trampoline is set up.

After this patch, it should be safe to defer setup_real_mode().

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/d48a263f9912389b957dd495a7127b009259ffe0.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index bf780e0..95c8c9c 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1131,6 +1131,16 @@ void __init setup_arch(char **cmdline_p)
 
 	early_trap_pf_init();
 
+	/*
+	 * Update mmu_cr4_features (and, indirectly, trampoline_cr4_features)
+	 * with the current CR4 value.  This may not be necessary, but
+	 * auditing all the early-boot CR4 manipulation would be needed to
+	 * rule it out.
+	 */
+	if (boot_cpu_data.cpuid_level >= 0)
+		/* A CPU has %cr4 if and only if it has CPUID. */
+		mmu_cr4_features = __read_cr4();
+
 	setup_real_mode();
 
 	memblock_set_current_limit(get_max_mapped());
@@ -1180,13 +1190,6 @@ void __init setup_arch(char **cmdline_p)
 
 	kasan_init();
 
-	if (boot_cpu_data.cpuid_level >= 0) {
-		/* A CPU has %cr4 if and only if it has CPUID */
-		mmu_cr4_features = __read_cr4();
-		if (trampoline_cr4_features)
-			*trampoline_cr4_features = mmu_cr4_features;
-	}
-
 #ifdef CONFIG_X86_32
 	/* sync back kernel address range */
 	clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 705e3ff..c5bdc4e 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -4,6 +4,7 @@
 #include <asm/cacheflush.h>
 #include <asm/pgtable.h>
 #include <asm/realmode.h>
+#include <asm/tlbflush.h>
 
 struct real_mode_header *real_mode_header;
 u32 *trampoline_cr4_features;
@@ -84,7 +85,7 @@ void __init setup_real_mode(void)
 
 	trampoline_header->start = (u64) secondary_startup_64;
 	trampoline_cr4_features = &trampoline_header->cr4;
-	*trampoline_cr4_features = __read_cr4();
+	*trampoline_cr4_features = mmu_cr4_features;
 
 	trampoline_pgd = (u64 *) __va(real_mode_header->trampoline_pgd);
 	trampoline_pgd[0] = trampoline_pgd_entry.pgd;
-- 
cgit v0.10.2


From d0de0f685db7faf2ae4597a39a59996dd84e18c7 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:15 -0700
Subject: x86/boot: Defer setup_real_mode() to early_initcall time

There's no need to run setup_real_mode() as early as we run it.
Defer it to the same early_initcall that sets up the page
permissions for the real mode code.

This should be a code size reduction.  More importantly, it give us
a longer window in which we can allocate the real mode trampoline.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/fd62f0da4f79357695e9bf3e365623736b05f119.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 9c6b890..8d67777 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -59,6 +59,5 @@ extern unsigned char secondary_startup_64[];
 #endif
 
 void reserve_real_mode(void);
-void setup_real_mode(void);
 
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 95c8c9c..0fa60f5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -1141,8 +1141,6 @@ void __init setup_arch(char **cmdline_p)
 		/* A CPU has %cr4 if and only if it has CPUID. */
 		mmu_cr4_features = __read_cr4();
 
-	setup_real_mode();
-
 	memblock_set_current_limit(get_max_mapped());
 
 	/*
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index c5bdc4e..747b71e 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -30,7 +30,7 @@ void __init reserve_real_mode(void)
 	       base, (unsigned long long)mem, size);
 }
 
-void __init setup_real_mode(void)
+static void __init setup_real_mode(void)
 {
 	u16 real_mode_seg;
 	const u32 *rel;
@@ -101,7 +101,7 @@ void __init setup_real_mode(void)
  * need to mark it executable at do_pre_smp_initcalls() at least,
  * thus run it as a early_initcall().
  */
-static int __init set_real_mode_permissions(void)
+static void __init set_real_mode_permissions(void)
 {
 	unsigned char *base = (unsigned char *) real_mode_header;
 	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
@@ -120,7 +120,16 @@ static int __init set_real_mode_permissions(void)
 	set_memory_nx((unsigned long) base, size >> PAGE_SHIFT);
 	set_memory_ro((unsigned long) base, ro_size >> PAGE_SHIFT);
 	set_memory_x((unsigned long) text_start, text_size >> PAGE_SHIFT);
+}
+
+static int __init init_real_mode(void)
+{
+	if (!real_mode_header)
+		panic("Real mode trampoline was not allocated");
+
+	setup_real_mode();
+	set_real_mode_permissions();
 
 	return 0;
 }
-early_initcall(set_real_mode_permissions);
+early_initcall(init_real_mode);
-- 
cgit v0.10.2


From 5ff3e2c3c3eebe13967d81ad1f23b9468fefea81 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:16 -0700
Subject: x86/boot: Rework reserve_real_mode() to allow multiple tries

If reserve_real_mode() fails, panicing immediately means we're
doomed.  Make it safe to try more than once to allocate the
trampoline:

 - Degrade a failure from panic() to pr_info().  (If we make it to
   setup_real_mode() without reserving the trampoline, we'll panic
   them.)

 - Factor out helpers so that platform code can supply a specific
   address to try.

 - Warn if reserve_real_mode() is called after we're done with the
   memblock allocator.  If that were to happen, we would behave
   unpredictably.

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/876e383038f3e9971aa72fd20a4f5da05f9d193d.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/include/asm/realmode.h b/arch/x86/include/asm/realmode.h
index 8d67777..b2988c0 100644
--- a/arch/x86/include/asm/realmode.h
+++ b/arch/x86/include/asm/realmode.h
@@ -58,6 +58,15 @@ extern unsigned char boot_gdt[];
 extern unsigned char secondary_startup_64[];
 #endif
 
+static inline size_t real_mode_size_needed(void)
+{
+	if (real_mode_header)
+		return 0;	/* already allocated. */
+
+	return ALIGN(real_mode_blob_end - real_mode_blob, PAGE_SIZE);
+}
+
+void set_real_mode_mem(phys_addr_t mem, size_t size);
 void reserve_real_mode(void);
 
 #endif /* _ARCH_X86_REALMODE_H */
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index 747b71e..5db706f1 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -1,4 +1,5 @@
 #include <linux/io.h>
+#include <linux/slab.h>
 #include <linux/memblock.h>
 
 #include <asm/cacheflush.h>
@@ -12,22 +13,34 @@ u32 *trampoline_cr4_features;
 /* Hold the pgd entry used on booting additional CPUs */
 pgd_t trampoline_pgd_entry;
 
+void __init set_real_mode_mem(phys_addr_t mem, size_t size)
+{
+	void *base = __va(mem);
+
+	real_mode_header = (struct real_mode_header *) base;
+	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
+	       base, (unsigned long long)mem, size);
+}
+
 void __init reserve_real_mode(void)
 {
 	phys_addr_t mem;
-	unsigned char *base;
-	size_t size = PAGE_ALIGN(real_mode_blob_end - real_mode_blob);
+	size_t size = real_mode_size_needed();
+
+	if (!size)
+		return;
+
+	WARN_ON(slab_is_available());
 
 	/* Has to be under 1M so we can execute real-mode AP code. */
 	mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
-	if (!mem)
-		panic("Cannot allocate trampoline\n");
+	if (!mem) {
+		pr_info("No sub-1M memory is available for the trampoline\n");
+		return;
+	}
 
-	base = __va(mem);
 	memblock_reserve(mem, size);
-	real_mode_header = (struct real_mode_header *) base;
-	printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
-	       base, (unsigned long long)mem, size);
+	set_real_mode_mem(mem, size);
 }
 
 static void __init setup_real_mode(void)
-- 
cgit v0.10.2


From 5bc653b7318217c54244a14f248f1f07abe0a865 Mon Sep 17 00:00:00 2001
From: Andy Lutomirski <luto@kernel.org>
Date: Wed, 10 Aug 2016 02:29:17 -0700
Subject: x86/efi: Allocate a trampoline if needed in efi_free_boot_services()

On my Dell XPS 13 9350 with firmware 1.4.4 and SGX on, if I boot
Fedora 24's grub2-efi off a hard disk, my first 1MB of RAM looks
like:

 efi: mem00: [Runtime Data       |RUN|  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000000000-0x0000000000000fff] (0MB)
 efi: mem01: [Boot Data          |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000001000-0x0000000000027fff] (0MB)
 efi: mem02: [Loader Data        |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000028000-0x0000000000029fff] (0MB)
 efi: mem03: [Reserved           |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x000000000002a000-0x000000000002bfff] (0MB)
 efi: mem04: [Runtime Data       |RUN|  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x000000000002c000-0x000000000002cfff] (0MB)
 efi: mem05: [Loader Data        |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x000000000002d000-0x000000000002dfff] (0MB)
 efi: mem06: [Conventional Memory|   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x000000000002e000-0x0000000000057fff] (0MB)
 efi: mem07: [Reserved           |   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000058000-0x0000000000058fff] (0MB)
 efi: mem08: [Conventional Memory|   |  |  |  |  |  |  |   |WB|WT|WC|UC] range=[0x0000000000059000-0x000000000009ffff] (0MB)

My EBDA is at 0x2c000, which blocks off everything from 0x2c000 and
up, and my trampoline is 0x6000 bytes (6 pages), so it doesn't fit
in the loader data range at 0x28000.

Without this patch, it panics due to a failure to allocate the
trampoline.  With this patch, it works:

 [  +0.001744] Base memory trampoline at [ffff880000001000] 1000 size 24576

Signed-off-by: Andy Lutomirski <luto@kernel.org>
Reviewed-by: Matt Fleming <matt@codeblueprint.co.uk>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Brian Gerst <brgerst@gmail.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mario Limonciello <mario_limonciello@dell.com>
Cc: Matt Fleming <mfleming@suse.de>
Cc: Matthew Garrett <mjg59@srcf.ucam.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/998c77b3bf709f3dfed85cb30701ed1a5d8a438b.1470821230.git.luto@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c
index 4480c06..89d1146 100644
--- a/arch/x86/platform/efi/quirks.c
+++ b/arch/x86/platform/efi/quirks.c
@@ -254,6 +254,7 @@ void __init efi_free_boot_services(void)
 	for_each_efi_memory_desc(md) {
 		unsigned long long start = md->phys_addr;
 		unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
+		size_t rm_size;
 
 		if (md->type != EFI_BOOT_SERVICES_CODE &&
 		    md->type != EFI_BOOT_SERVICES_DATA)
@@ -263,6 +264,26 @@ void __init efi_free_boot_services(void)
 		if (md->attribute & EFI_MEMORY_RUNTIME)
 			continue;
 
+		/*
+		 * Nasty quirk: if all sub-1MB memory is used for boot
+		 * services, we can get here without having allocated the
+		 * real mode trampoline.  It's too late to hand boot services
+		 * memory back to the memblock allocator, so instead
+		 * try to manually allocate the trampoline if needed.
+		 *
+		 * I've seen this on a Dell XPS 13 9350 with firmware
+		 * 1.4.4 with SGX enabled booting Linux via Fedora 24's
+		 * grub2-efi on a hard disk.  (And no, I don't know why
+		 * this happened, but Linux should still try to boot rather
+		 * panicing early.)
+		 */
+		rm_size = real_mode_size_needed();
+		if (rm_size && (start + rm_size) < (1<<20) && size >= rm_size) {
+			set_real_mode_mem(start, rm_size);
+			start += rm_size;
+			size -= rm_size;
+		}
+
 		free_bootmem_late(start, size);
 	}
 
-- 
cgit v0.10.2


From d52c0569bab4edc888832df44dc7ac28517134f6 Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 11 Aug 2016 16:08:35 +0200
Subject: x86/apic/x2apic, smp/hotplug: Don't use before alloc in
 x2apic_cluster_probe()

I made a mistake while converting the driver to the hotplug state
machine and as a result x2apic_cluster_probe() was accessing
cpus_in_cluster before allocating it.

This patch fixes it by setting the cpumask after the allocation the
memory succeeded.

While at it, I marked two functions static which are only used within
this file.

Reported-by: Laura Abbott <labbott@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Fixes: 6b2c28471de5 ("x86/x2apic: Convert to CPU hotplug state machine")
Link: http://lkml.kernel.org/r/1470924515-9444-1-git-send-email-bigeasy@linutronix.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>

diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 6368fa6..54f35d9 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -155,7 +155,7 @@ static void init_x2apic_ldr(void)
 /*
  * At CPU state changes, update the x2apic cluster sibling info.
  */
-int x2apic_prepare_cpu(unsigned int cpu)
+static int x2apic_prepare_cpu(unsigned int cpu)
 {
 	if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL))
 		return -ENOMEM;
@@ -168,7 +168,7 @@ int x2apic_prepare_cpu(unsigned int cpu)
 	return 0;
 }
 
-int x2apic_dead_cpu(unsigned int this_cpu)
+static int x2apic_dead_cpu(unsigned int this_cpu)
 {
 	int cpu;
 
@@ -186,13 +186,18 @@ int x2apic_dead_cpu(unsigned int this_cpu)
 static int x2apic_cluster_probe(void)
 {
 	int cpu = smp_processor_id();
+	int ret;
 
 	if (!x2apic_mode)
 		return 0;
 
+	ret = cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
+				x2apic_prepare_cpu, x2apic_dead_cpu);
+	if (ret < 0) {
+		pr_err("Failed to register X2APIC_PREPARE\n");
+		return 0;
+	}
 	cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
-	cpuhp_setup_state(CPUHP_X2APIC_PREPARE, "X2APIC_PREPARE",
-			  x2apic_prepare_cpu, x2apic_dead_cpu);
 	return 1;
 }
 
-- 
cgit v0.10.2