From 338ea55579d1dbd83753b10db74da747b2f1998f Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 4 Jan 2015 11:49:56 +0100 Subject: x86/lib/copy_user_64.S: Remove FIX_ALIGNMENT define It is unconditionally enabled so remove it. No object file change. Signed-off-by: Borislav Petkov diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dee945d..1530ec2 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -8,9 +8,6 @@ #include #include - -#define FIX_ALIGNMENT 1 - #include #include #include @@ -45,7 +42,6 @@ .endm .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT /* check for bad alignment of destination */ movl %edi,%ecx andl $7,%ecx @@ -67,7 +63,6 @@ _ASM_EXTABLE(100b,103b) _ASM_EXTABLE(101b,103b) -#endif .endm /* Standard copy_to_user with segment limit checking */ -- cgit v0.10.2 From db477a3386dee183130916d6bbf21f5828b0b2e2 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 30 Dec 2014 20:27:09 +0100 Subject: x86/alternatives: Cleanup DPRINTK macro Make it pass __func__ implicitly. Also, dump info about each replacing we're doing. Fixup comments and style while at it. Signed-off-by: Borislav Petkov diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 703130f..1e86e85 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -52,10 +52,10 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, ...) \ -do { \ - if (debug_alternative) \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +#define DPRINTK(fmt, args...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ } while (0) /* @@ -243,12 +243,13 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void *text_poke_early(void *addr, const void *opcode, size_t len); -/* Replace instructions with better alternatives for this CPU type. - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that asymmetric systems where - APs have less capabilities than the boot processor are not handled. - Tough. Make sure you disable such features by hand. */ - +/* + * Replace instructions with better alternatives for this CPU type. This runs + * before SMP is initialized to avoid SMP problems with self modifying code. + * This implies that asymmetric systems where APs have less capabilities than + * the boot processor are not handled. Tough. Make sure you disable such + * features by hand. + */ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { @@ -256,10 +257,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + DPRINTK("alt table %p -> %p", start, end); /* * The scan order should be from start to end. A later scanned - * alternative code can overwrite a previous scanned alternative code. + * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. 
* @@ -275,11 +276,19 @@ void __init_or_module apply_alternatives(struct alt_instr *start, if (!boot_cpu_has(a->cpuid)) continue; + DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)", + a->cpuid >> 5, + a->cpuid & 0x1f, + instr, a->instrlen, + replacement, a->replacementlen); + memcpy(insnbuf, replacement, a->replacementlen); /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += replacement - instr; + if (*insnbuf == 0xe8 && a->replacementlen == 5) { + *(s32 *)(insnbuf + 1) += replacement - instr; + DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1)); + } add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); @@ -371,8 +380,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", - __func__, smp->locks, smp->locks_end, + DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); -- cgit v0.10.2 From 4332195c5615bf748624094ce4ff6797e475024d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 27 Dec 2014 10:41:52 +0100 Subject: x86/alternatives: Add instruction padding Up until now we have always paid attention to make sure the length of the new instruction replacing the old one is at least less or equal to the length of the old instruction. If the new instruction is longer, at the time it replaces the old instruction it will overwrite the beginning of the next instruction in the kernel image and cause your pants to catch fire. So instead of having to pay attention, teach the alternatives framework to pad shorter old instructions with NOPs at buildtime - but only in the case when len(old instruction(s)) < len(new instruction(s)) and add nothing in the >= case. (In that case we do add_nops() when patching). This way the alternatives user shouldn't have to care about instruction sizes and simply use the macros. Add asm ALTERNATIVE* flavor macros too, while at it. Also, we need to save the pad length in a separate struct alt_instr member for NOP optimization and the way to do that reliably is to carry the pad length instead of trying to detect whether we're looking at single-byte NOPs or at pathological instruction offsets like e9 90 90 90 90, for example, which is a valid instruction. Thanks to Michael Matz for the great help with toolchain questions. Signed-off-by: Borislav Petkov diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 372231c..524bddc 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,12 +18,53 @@ .endm #endif -.macro altinstruction_entry orig alt feature orig_len alt_len +.macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . 
.word \feature .byte \orig_len .byte \alt_len + .byte \pad_len +.endm + +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 + .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 473bdbe..5aef6a9 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,8 +48,9 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction, <= instrlen */ -}; + u8 replacementlen; /* length of new instruction */ + u8 padlen; /* length of build-time padding */ +} __packed; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); @@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" +#define b_replacement(num) "664"#num +#define e_replacement(num) "665"#num -#define b_replacement(number) "663"#number -#define e_replacement(number) "664"#number +#define alt_end_marker "663" +#define alt_slen "662b-661b" +#define alt_pad_len alt_end_marker"b-662b" +#define alt_total_slen alt_end_marker"b-661b" +#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" -#define alt_slen "662b-661b" -#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" +#define __OLDINSTR(oldinstr, num) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" -#define ALTINSTR_ENTRY(feature, number) \ +#define OLDINSTR(oldinstr, num) \ + __OLDINSTR(oldinstr, num) \ + alt_end_marker ":\n" + +/* + * Pad the second replacement alternative with additional NOPs if it is + * additionally longer than the first replacement alternative. 
+ */ +#define OLDINSTR_2(oldinstr, num1, num2) \ + __OLDINSTR(oldinstr, num1) \ + ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \ + "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \ + alt_end_marker ":\n" + +#define ALTINSTR_ENTRY(feature, num) \ " .long 661b - .\n" /* label */ \ - " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte " alt_slen "\n" /* source len */ \ - " .byte " alt_rlen(number) "\n" /* replacement len */ + " .byte " alt_total_slen "\n" /* source len */ \ + " .byte " alt_rlen(num) "\n" /* replacement len */ \ + " .byte " alt_pad_len "\n" /* pad len */ -#define DISCARD_ENTRY(number) /* rlen <= slen */ \ - " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" - -#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ - b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" +#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ + b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - OLDINSTR(oldinstr) \ + OLDINSTR(oldinstr, 1) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ ".popsection" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ - OLDINSTR(oldinstr) \ + OLDINSTR_2(oldinstr, 1, 2) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - DISCARD_ENTRY(2) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ @@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative(oldinstr, newinstr, feature) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") +#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + /* * Alternative inline assembly with input. 
* diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a5485..9b1df43 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -418,6 +418,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* 1: do replace */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); @@ -432,6 +433,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); @@ -457,6 +459,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -491,23 +494,28 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) */ asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ + " .long 4f - .\n" /* repl offset */ " .word %P1\n" /* always replace */ - " .byte 2b - 1b\n" /* src len */ - " .byte 4f - 3f\n" /* repl len */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" - "3: .byte 0xe9\n .long %l[t_no] - 2b\n" - "4:\n" + "4: .byte 0xe9\n .long %l[t_no] - 2b\n" + "5:\n" ".previous\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* src len */ + " .byte 3b - 1b\n" /* src len */ " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" : : "i" (bit), "i" (X86_FEATURE_ALWAYS) : : t_dynamic, t_no); @@ -527,6 +535,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P2\n" /* always replace */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -541,6 +550,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P1\n" /* feature bit */ " .byte 4b - 3b\n" /* src len */ " .byte 6f - 5f\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8d3120f..c56cb4f 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -33,7 +33,7 @@ 662: __ASM_CLAC ; \ .popsection ; \ .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ + altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ; \ .popsection #define ASM_STAC \ @@ -42,7 +42,7 @@ 662: __ASM_STAC ; \ .popsection ; \ .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ + altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 
; \ .popsection #else /* CONFIG_X86_SMAP */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1e86e85..c99b0f1 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -270,7 +270,6 @@ void __init_or_module apply_alternatives(struct alt_instr *start, for (a = start; a < end; a++) { instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); if (!boot_cpu_has(a->cpuid)) @@ -290,8 +289,9 @@ void __init_or_module apply_alternatives(struct alt_instr *start, DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1)); } - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); + if (a->instrlen > a->replacementlen) + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); text_poke_early(instr, insnbuf, a->instrlen); } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 000d419..d23c9a1 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -819,7 +819,7 @@ ENTRY(simd_coprocessor_error) 661: pushl_cfi $do_general_protection 662: .section .altinstructions,"a" - altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f + altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f, 0 .previous .section .altinstr_replacement,"ax" 663: pushl $do_simd_coprocessor_error diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f2145cf..38e57fa 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -67,7 +67,7 @@ ENDPROC(clear_page) .previous .section .altinstructions,"a" altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ - .Lclear_page_end-clear_page, 2b-1b + .Lclear_page_end-clear_page, 2b-1b, 0 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ - .Lclear_page_end-clear_page,3b-2b + .Lclear_page_end-clear_page,3b-2b, 0 .previous diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 176cca6..f1ffdbb 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -106,5 +106,5 @@ ENDPROC(copy_page) .previous .section .altinstructions,"a" altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b + .Lcopy_page_end-copy_page, 2b-1b, 0 .previous diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index 1530ec2..a9aedd6 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -36,8 +36,8 @@ .previous .section .altinstructions,"a" - altinstruction_entry 0b,2b,\feature1,5,5 - altinstruction_entry 0b,3b,\feature2,5,5 + altinstruction_entry 0b,2b,\feature1,5,5,0 + altinstruction_entry 0b,3b,\feature2,5,5,0 .previous .endm diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 89b53c9..bbfdacc 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -202,7 +202,7 @@ ENDPROC(__memcpy) */ .section .altinstructions, "a" altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ - .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c + .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c,0 altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ - .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e + .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e,0 .previous diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 9c4b530..bbfa6b2 100644 --- a/arch/x86/lib/memmove_64.S +++ 
b/arch/x86/lib/memmove_64.S @@ -221,7 +221,7 @@ ENTRY(__memmove) altinstruction_entry .Lmemmove_begin_forward, \ .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ .Lmemmove_end_forward-.Lmemmove_begin_forward, \ - .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs + .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs,0 .previous ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6f44935..f6153c1 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -150,7 +150,7 @@ ENDPROC(__memset) */ .section .altinstructions,"a" altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-__memset,.Lmemset_e-.Lmemset_c + .Lfinal-__memset,.Lmemset_e-.Lmemset_c,0 altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e + .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e,0 .previous -- cgit v0.10.2 From 48c7a2509f9e237d8465399d9cdfe487d3212a23 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 5 Jan 2015 13:48:41 +0100 Subject: x86/alternatives: Make JMPs more robust Up until now we had to pay attention to relative JMPs in alternatives about how their relative offset gets computed so that the jump target is still correct. Or, as it is the case for near CALLs (opcode e8), we still have to go and readjust the offset at patching time. What is more, the static_cpu_has_safe() facility had to forcefully generate 5-byte JMPs since we couldn't rely on the compiler to generate properly sized ones so we had to force the longest ones. Worse than that, sometimes it would generate a replacement JMP which is longer than the original one, thus overwriting the beginning of the next instruction at patching time. So, in order to alleviate all that and make using JMPs more straight-forward we go and pad the original instruction in an alternative block with NOPs at build time, should the replacement(s) be longer. This way, alternatives users shouldn't pay special attention so that original and replacement instruction sizes are fine but the assembler would simply add padding where needed and not do anything otherwise. As a second aspect, we go and recompute JMPs at patching time so that we can try to make 5-byte JMPs into two-byte ones if possible. If not, we still have to recompute the offsets as the replacement JMP gets put far away in the .altinstr_replacement section leading to a wrong offset if copied verbatim. For example, on a locally generated kernel image old insn VA: 0xffffffff810014bd, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff810014bd: eb 21 jmp ffffffff810014e0 repl insn: size: 5 ffffffff81d0b23c: e9 b1 62 2f ff jmpq ffffffff810014f2 gets corrected to a 2-byte JMP: apply_alternatives: feat: 3*32+21, old: (ffffffff810014bd, len: 2), repl: (ffffffff81d0b23c, len: 5) alt_insn: e9 b1 62 2f ff recompute_jumps: next_rip: ffffffff81d0b241, tgt_rip: ffffffff810014f2, new_displ: 0x00000033, ret len: 2 converted to: eb 33 90 90 90 and a 5-byte JMP: old insn VA: 0xffffffff81001516, CPU feat: X86_FEATURE_ALWAYS, size: 2 __switch_to: ffffffff81001516: eb 30 jmp ffffffff81001548 repl insn: size: 5 ffffffff81d0b241: e9 10 63 2f ff jmpq ffffffff81001556 gets shortened into a two-byte one: apply_alternatives: feat: 3*32+21, old: (ffffffff81001516, len: 2), repl: (ffffffff81d0b241, len: 5) alt_insn: e9 10 63 2f ff recompute_jumps: next_rip: ffffffff81d0b246, tgt_rip: ffffffff81001556, new_displ: 0x0000003e, ret len: 2 converted to: eb 3e 90 90 90 ... and so on. 
This leads to a net win of around 40ish replacements * 3 bytes savings =~ 120 bytes of I$ on an AMD guest which means some savings of precious instruction cache bandwidth. The padding to the shorter 2-byte JMPs are single-byte NOPs which on smart microarchitectures means discarding NOPs at decode time and thus freeing up execution bandwidth. Signed-off-by: Borislav Petkov diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 9b1df43..e0571a2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -486,13 +486,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { #ifdef CC_HAVE_ASM_GOTO -/* - * We need to spell the jumps to the compiler because, depending on the offset, - * the replacement jump can be bigger than the original jump, and this we cannot - * have. Thus, we force the jump to the widest, 4-byte, signed relative - * offset even though the last would often fit in less bytes. - */ - asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" ".skip -(((5f-4f) - (2b-1b)) > 0) * " "((5f-4f) - (2b-1b)),0x90\n" @@ -506,7 +500,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .byte 3b - 2b\n" /* pad len */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" - "4: .byte 0xe9\n .long %l[t_no] - 2b\n" + "4: jmp %l[t_no]\n" "5:\n" ".previous\n" ".section .altinstructions,\"a\"\n" diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c99b0f1..715af37 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -58,6 +58,21 @@ do { \ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ } while (0) +#define DUMP_BYTES(buf, len, fmt, args...) \ +do { \ + if (unlikely(debug_alternative)) { \ + int j; \ + \ + if (!(len)) \ + break; \ + \ + printk(KERN_DEBUG fmt, ##args); \ + for (j = 0; j < (len) - 1; j++) \ + printk(KERN_CONT "%02hhx ", buf[j]); \ + printk(KERN_CONT "%02hhx\n", buf[j]); \ + } \ +} while (0) + /* * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes * that correspond to that nop. Getting from one nop to the next, we @@ -244,6 +259,71 @@ extern s32 __smp_locks[], __smp_locks_end[]; void *text_poke_early(void *addr, const void *opcode, size_t len); /* + * Are we looking at a near JMP with a 1 or 4-byte displacement. 
+ */ +static inline bool is_jmp(const u8 opcode) +{ + return opcode == 0xeb || opcode == 0xe9; +} + +static void __init_or_module +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +{ + u8 *next_rip, *tgt_rip; + s32 n_dspl, o_dspl; + int repl_len; + + if (a->replacementlen != 5) + return; + + o_dspl = *(s32 *)(insnbuf + 1); + + /* next_rip of the replacement JMP */ + next_rip = repl_insn + a->replacementlen; + /* target rip of the replacement JMP */ + tgt_rip = next_rip + o_dspl; + n_dspl = tgt_rip - orig_insn; + + DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + + if (tgt_rip - orig_insn >= 0) { + if (n_dspl - 2 <= 127) + goto two_byte_jmp; + else + goto five_byte_jmp; + /* negative offset */ + } else { + if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) + goto two_byte_jmp; + else + goto five_byte_jmp; + } + +two_byte_jmp: + n_dspl -= 2; + + insnbuf[0] = 0xeb; + insnbuf[1] = (s8)n_dspl; + add_nops(insnbuf + 2, 3); + + repl_len = 2; + goto done; + +five_byte_jmp: + n_dspl -= 5; + + insnbuf[0] = 0xe9; + *(s32 *)&insnbuf[1] = n_dspl; + + repl_len = 5; + +done: + + DPRINTK("final displ: 0x%08x, JMP 0x%lx", + n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); +} + +/* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. * This implies that asymmetric systems where APs have less capabilities than @@ -268,6 +348,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { + int insnbuf_sz = 0; + instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insnbuf)); @@ -281,24 +363,35 @@ void __init_or_module apply_alternatives(struct alt_instr *start, instr, a->instrlen, replacement, a->replacementlen); + DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); + memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; /* 0xe8 is a relative jump; fix the offset. 
*/ if (*insnbuf == 0xe8 && a->replacementlen == 5) { *(s32 *)(insnbuf + 1) += replacement - instr; - DPRINTK("Fix CALL offset: 0x%x", *(s32 *)(insnbuf + 1)); + DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", + *(s32 *)(insnbuf + 1), + (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); } - if (a->instrlen > a->replacementlen) + if (a->replacementlen && is_jmp(replacement[0])) + recompute_jump(a, instr, replacement, insnbuf); + + if (a->instrlen > a->replacementlen) { add_nops(insnbuf + a->replacementlen, a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); - text_poke_early(instr, insnbuf, a->instrlen); + text_poke_early(instr, insnbuf, insnbuf_sz); } } #ifdef CONFIG_SMP - static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { @@ -449,7 +542,7 @@ int alternatives_text_reserved(void *start, void *end) return 0; } -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT void __init_or_module apply_paravirt(struct paravirt_patch_site *start, diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index a9aedd6..dad718c 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -25,14 +25,13 @@ */ .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 0: - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ + jmp \orig 1: .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt1-1b /* offset */ /* or alternatively to alt1 */ -3: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt2-1b /* offset */ /* or alternatively to alt2 */ +2: + jmp \alt1 +3: + jmp \alt2 .previous .section .altinstructions,"a" -- cgit v0.10.2 From 4fd4b6e5537cec5b56db0b22546dd439ebb26830 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sat, 10 Jan 2015 20:34:07 +0100 Subject: x86/alternatives: Use optimized NOPs for padding Alternatives allow now for an empty old instruction. In this case we go and pad the space with NOPs at assembly time. However, there are the optimal, longer NOPs which should be used. Do that at patching time by adding alt_instr.padlen-sized NOPs at the old instruction address. Cc: Andy Lutomirski Signed-off-by: Borislav Petkov diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 715af37..af397cc 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -323,6 +323,14 @@ done: n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); } +static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) +{ + add_nops(instr + (a->instrlen - a->padlen), a->padlen); + + DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + instr, a->instrlen - a->padlen, a->padlen); +} + /* * Replace instructions with better alternatives for this CPU type. This runs * before SMP is initialized to avoid SMP problems with self modifying code. 
@@ -354,8 +362,12 @@ void __init_or_module apply_alternatives(struct alt_instr *start, replacement = (u8 *)&a->repl_offset + a->repl_offset; BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) + if (!boot_cpu_has(a->cpuid)) { + if (a->padlen > 1) + optimize_nops(a, instr); + continue; + } DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)", a->cpuid >> 5, -- cgit v0.10.2 From 090a3f615524c3f75d09fdb37f15ea1868d79f7e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 12 Jan 2015 18:19:40 +0100 Subject: x86/lib/copy_page_64.S: Use generic ALTERNATIVE macro ... instead of the semi-version with the spelled out sections. What is more, make the REP_GOOD version be the default copy_page() version as the majority of the relevant x86 CPUs do set X86_FEATURE_REP_GOOD. Thus, copy_page gets compiled to: ffffffff8130af80 : ffffffff8130af80: e9 0b 00 00 00 jmpq ffffffff8130af90 ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) ffffffff8130af8d: c3 retq ffffffff8130af8e: 66 90 xchg %ax,%ax ffffffff8130af90 : ... and after the alternatives have run, the JMP to the old, unrolled version gets NOPed out: ffffffff8130af80 : ffffffff8130af80: 66 66 90 xchg %ax,%ax ffffffff8130af83: 66 90 xchg %ax,%ax ffffffff8130af85: b9 00 02 00 00 mov $0x200,%ecx ffffffff8130af8a: f3 48 a5 rep movsq %ds:(%rsi),%es:(%rdi) ffffffff8130af8d: c3 retq On modern uarches, those NOPs are cheaper than the unconditional JMP previously. Signed-off-by: Borislav Petkov diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index f1ffdbb..8239dbc 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,23 +2,26 @@ #include #include +#include #include +/* + * Some CPUs run faster using the string copy instructions (sane microcode). + * It is also a lot simpler. Use this when possible. But, don't use streaming + * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the + * prefetch distance based on SMP/UP. + */ ALIGN -copy_page_rep: +ENTRY(copy_page) CFI_STARTPROC + ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_rep) - -/* - * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. - * Could vary the prefetch distance based on SMP/UP. -*/ +ENDPROC(copy_page) -ENTRY(copy_page) +ENTRY(copy_page_regs) CFI_STARTPROC subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 @@ -90,21 +93,5 @@ ENTRY(copy_page) addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret -.Lcopy_page_end: CFI_ENDPROC -ENDPROC(copy_page) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b, 0 - .previous +ENDPROC(copy_page_regs) -- cgit v0.10.2 From de2ff888843a22918ef15306922083739d23dad3 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 13 Jan 2015 01:38:17 +0100 Subject: x86/lib/copy_user_64.S: Convert to ALTERNATIVE_2 Use the asm macro and drop the locally grown version. 
Signed-off-by: Borislav Petkov diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dad718c..fa997df 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -16,30 +16,6 @@ #include #include -/* - * By placing feature2 after feature1 in altinstructions section, we logically - * implement: - * If CPU has feature2, jmp to alt2 is used - * else if CPU has feature1, jmp to alt1 is used - * else jmp to orig is used. - */ - .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 -0: - jmp \orig -1: - .section .altinstr_replacement,"ax" -2: - jmp \alt1 -3: - jmp \alt2 - .previous - - .section .altinstructions,"a" - altinstruction_entry 0b,2b,\feature1,5,5,0 - altinstruction_entry 0b,3b,\feature2,5,5,0 - .previous - .endm - .macro ALIGN_DESTINATION /* check for bad alignment of destination */ movl %edi,%ecx @@ -73,9 +49,11 @@ ENTRY(_copy_to_user) jc bad_to_user cmpq TI_addr_limit(%rax),%rcx ja bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_to_user) @@ -88,9 +66,11 @@ ENTRY(_copy_from_user) jc bad_from_user cmpq TI_addr_limit(%rax),%rcx ja bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_from_user) -- cgit v0.10.2 From 669f8a900198599d3c2e2e463bafe12d30d96507 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 15 Jan 2015 09:17:12 +0100 Subject: x86/smap: Use ALTERNATIVE macro ... and drop unfolded version. No need for ASM_NOP3 anymore either as the alternatives do the proper padding at build time and insert proper NOPs at boot time. There should be no apparent operational change from this patch. 
Signed-off-by: Borislav Petkov diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index c56cb4f..ba665eb 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -27,23 +27,11 @@ #ifdef CONFIG_X86_SMAP -#define ASM_CLAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_CLAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ; \ - .popsection - -#define ASM_STAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_STAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3, 0 ; \ - .popsection +#define ASM_CLAC \ + ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + +#define ASM_STAC \ + ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -61,20 +49,20 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ -- cgit v0.10.2 From 8e65f6e03a90927b8de16c15da976baa6c3fff69 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 12:35:55 +0100 Subject: x86/entry_32: Convert X86_INVD_BUG to ALTERNATIVE macro Booting a 486 kernel on an AMD guest with this patch applied, says: apply_alternatives: feat: 0*32+25, old: (c160a475, len: 5), repl: (c19557d4, len: 5) c160a475: alt_insn: 68 10 35 00 c1 c19557d4: rpl_insn: 68 80 39 00 c1 which is: old insn VA: 0xc160a475, CPU feat: X86_FEATURE_XMM, size: 5 simd_coprocessor_error: c160a475: 68 10 35 00 c1 push $0xc1003510 repl insn: 0xc19557d4, size: 5 c160a475: 68 80 39 00 c1 push $0xc1003980 Signed-off-by: Borislav Petkov diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index d23c9a1..b910577 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error) pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl_cfi $do_general_protection -662: -.section .altinstructions,"a" - altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f, 0 -.previous -.section .altinstr_replacement,"ax" -663: pushl $do_simd_coprocessor_error -664: -.previous + ALTERNATIVE "pushl_cfi $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM #else pushl_cfi $do_simd_coprocessor_error #endif -- cgit v0.10.2 From 6620ef28c812b4782b10adb676580c2847d2bec8 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 12:57:41 +0100 Subject: x86/lib/clear_page_64.S: Convert to ALTERNATIVE_2 macro Move clear_page() up so that we can get 2-byte forward JMPs when patching: apply_alternatives: feat: 3*32+16, old: (ffffffff8130adb0, len: 5), repl: 
(ffffffff81d0b859, len: 5) ffffffff8130adb0: alt_insn: 90 90 90 90 90 recompute_jump: new_displ: 0x0000003e ffffffff81d0b859: rpl_insn: eb 3e 66 66 90 even though the compiler generated 5-byte JMPs which we padded with 5 NOPs. Also, make the REP_GOOD version be the default as the majority of machines set REP_GOOD. This way we get to save ourselves the JMP: old insn VA: 0xffffffff813038b0, CPU feat: X86_FEATURE_REP_GOOD, size: 5, padlen: 0 clear_page: ffffffff813038b0 : ffffffff813038b0: e9 0b 00 00 00 jmpq ffffffff813038c0 repl insn: 0xffffffff81cf0e92, size: 0 old insn VA: 0xffffffff813038b0, CPU feat: X86_FEATURE_ERMS, size: 5, padlen: 0 clear_page: ffffffff813038b0 : ffffffff813038b0: e9 0b 00 00 00 jmpq ffffffff813038c0 repl insn: 0xffffffff81cf0e92, size: 5 ffffffff81cf0e92: e9 69 2a 61 ff jmpq ffffffff81303900 ffffffff813038b0 : ffffffff813038b0: e9 69 2a 61 ff jmpq ffffffff8091631e Signed-off-by: Borislav Petkov diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index 38e57fa..e67e579 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,31 +1,35 @@ #include #include +#include #include /* - * Zero a page. - * rdi page - */ -ENTRY(clear_page_c) + * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is + * recommended to use this when possible and we do use them by default. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original. + */ + +/* + * Zero a page. + * %rdi - page + */ +ENTRY(clear_page) CFI_STARTPROC + + ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp clear_page_c_e", X86_FEATURE_ERMS + movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page_c) +ENDPROC(clear_page) -ENTRY(clear_page_c_e) +ENTRY(clear_page_orig) CFI_STARTPROC - movl $4096,%ecx - xorl %eax,%eax - rep stosb - ret - CFI_ENDPROC -ENDPROC(clear_page_c_e) -ENTRY(clear_page) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -45,29 +49,13 @@ ENTRY(clear_page) nop ret CFI_ENDPROC -.Lclear_page_end: -ENDPROC(clear_page) - - /* - * Some CPUs support enhanced REP MOVSB/STOSB instructions. - * It is recommended to use this when possible. - * If enhanced REP MOVSB/STOSB is not available, try to use fast string. - * Otherwise, use original function. - * - */ +ENDPROC(clear_page_orig) -#include - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp */ - .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: .byte 0xeb /* jmp */ - .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ -3: - .previous - .section .altinstructions,"a" - altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ - .Lclear_page_end-clear_page, 2b-1b, 0 - altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ - .Lclear_page_end-clear_page,3b-2b, 0 - .previous +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) -- cgit v0.10.2 From c70e1b475f37f07ab7181ad28458666d59aae634 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 15:19:55 +0100 Subject: x86/asm: Use alternative_2() in rdtsc_barrier() ... now that we have it. Acked-by: Andy Lutomirski Cc: Richard Weinberger Signed-off-by: Borislav Petkov diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 2ab1eb3..959e45b 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -95,13 +95,11 @@ do { \ * Stop RDTSC speculation. 
This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined * code region. - * - * (Could use an alternative three way for this if there was one.) */ static __always_inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 2d7d9a1..8ffd214 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -64,8 +64,8 @@ */ static inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif -- cgit v0.10.2 From a930dc4543a2b213deb9fde12682716edff8a4a6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 18 Jan 2015 17:48:18 +0100 Subject: x86/asm: Cleanup prefetch primitives This is based on a patch originally by hpa. With the current improvements to the alternatives, we can simply use %P1 as a mem8 operand constraint and rely on the toolchain to generate the proper instruction sizes. For example, on 32-bit, where we use an empty old instruction we get: apply_alternatives: feat: 6*32+8, old: (c104648b, len: 4), repl: (c195566c, len: 4) c104648b: alt_insn: 90 90 90 90 c195566c: rpl_insn: 0f 0d 4b 5c ... apply_alternatives: feat: 6*32+8, old: (c18e09b4, len: 3), repl: (c1955948, len: 3) c18e09b4: alt_insn: 90 90 90 c1955948: rpl_insn: 0f 0d 08 ... apply_alternatives: feat: 6*32+8, old: (c1190cf9, len: 7), repl: (c1955a79, len: 7) c1190cf9: alt_insn: 90 90 90 90 90 90 90 c1955a79: rpl_insn: 0f 0d 0d a0 d4 85 c1 all with the proper padding done depending on the size of the replacement instruction the compiler generates. Signed-off-by: Borislav Petkov Cc: H. 
Peter Anvin diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index efc3b22..8118e94 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, + alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c935..7be2c9a 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -761,10 +761,10 @@ extern char ignore_fpu_irq; #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 -# define BASE_PREFETCH ASM_NOP4 +# define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 (%1)" +# define BASE_PREFETCH "prefetcht0 %P1" #endif /* @@ -775,10 +775,9 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchnta (%1)", + alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, - "r" (x)); + "m" (*(const char *)x)); } /* @@ -788,10 +787,9 @@ static inline void prefetch(const void *x) */ static inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); + alternative_input(BASE_PREFETCH, "prefetchw %P1", + X86_FEATURE_3DNOWPREFETCH, + "m" (*(const char *)x)); } static inline void spin_lock_prefetch(const void *x) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a220239..dd9e505 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* 3DNow or LM implies PREFETCHW */ + if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) + if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) + set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); } #ifdef CONFIG_X86_32 -- cgit v0.10.2 From 84d95ad4cb9015ea953bf14cea05ba371d4d42bb Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Feb 2015 08:57:00 +0100 Subject: x86/lib/memset_64.S: Convert to ALTERNATIVE_2 macro Make alternatives replace single JMPs instead of whole memset functions, thus decreasing the amount of instructions copied during patching time at boot. While at it, make it use the REP_GOOD version by default which means alternatives NOP out the JMP to the other versions, as REP_GOOD is set by default on the majority of relevant x86 processors. Signed-off-by: Borislav Petkov diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index f6153c1..93118fb 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -5,19 +5,30 @@ #include #include +.weak memset + /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is * simpler and shorter than the orignal function as well. - * + * * rdi destination - * rsi value (char) - * rdx count (bytes) - * + * rsi value (char) + * rdx count (bytes) + * * rax original destination - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c: + */ +ENTRY(memset) +ENTRY(__memset) + /* + * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended + * to use it when possible. 
If not available, use fast string instructions. + * + * Otherwise, use original memset function. + */ + ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memset_erms", X86_FEATURE_ERMS + movq %rdi,%r9 movq %rdx,%rcx andl $7,%edx @@ -31,8 +42,8 @@ rep stosb movq %r9,%rax ret -.Lmemset_e: - .previous +ENDPROC(memset) +ENDPROC(__memset) /* * ISO C memset - set a memory block to a byte value. This function uses @@ -45,21 +56,16 @@ * * rax original destination */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c_e: +ENTRY(memset_erms) movq %rdi,%r9 movb %sil,%al movq %rdx,%rcx rep stosb movq %r9,%rax ret -.Lmemset_e_e: - .previous - -.weak memset +ENDPROC(memset_erms) -ENTRY(memset) -ENTRY(__memset) +ENTRY(memset_orig) CFI_STARTPROC movq %rdi,%r10 @@ -134,23 +140,4 @@ ENTRY(__memset) jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC -ENDPROC(memset) -ENDPROC(__memset) - - /* Some CPUs support enhanced REP MOVSB/STOSB feature. - * It is recommended to use this when possible. - * - * If enhanced REP MOVSB/STOSB feature is not available, use fast string - * instructions. - * - * Otherwise, use original memset function. - * - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - */ - .section .altinstructions,"a" - altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-__memset,.Lmemset_e-.Lmemset_c,0 - altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e,0 - .previous +ENDPROC(memset_orig) -- cgit v0.10.2 From a77600cd03a44e54df99ea742e02f4899b82c8e1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Feb 2015 09:11:17 +0100 Subject: x86/lib/memmove_64.S: Convert memmove() to ALTERNATIVE macro Make it execute the ERMS version if support is present and we're in the forward memmove() part and remove the unfolded alternatives section definition. Signed-off-by: Borislav Petkov diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index bbfa6b2..0f8a0d0 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -5,7 +5,6 @@ * This assembly file is re-written from memmove_64.c file. * - Copyright 2011 Fenghua Yu */ -#define _STRING_C #include #include #include @@ -44,6 +43,8 @@ ENTRY(__memmove) jg 2f .Lmemmove_begin_forward: + ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS + /* * movsq instruction have many startup latency * so we handle small size by general register. @@ -207,21 +208,5 @@ ENTRY(__memmove) 13: retq CFI_ENDPROC - - .section .altinstr_replacement,"ax" -.Lmemmove_begin_forward_efs: - /* Forward moving data. */ - movq %rdx, %rcx - rep movsb - retq -.Lmemmove_end_forward_efs: - .previous - - .section .altinstructions,"a" - altinstruction_entry .Lmemmove_begin_forward, \ - .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ - .Lmemmove_end_forward-.Lmemmove_begin_forward, \ - .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs,0 - .previous ENDPROC(__memmove) ENDPROC(memmove) -- cgit v0.10.2 From e0bc8d179e39a31bb3a56974374e55374fbf29be Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 4 Feb 2015 15:36:49 +0100 Subject: x86/lib/memcpy_64.S: Convert memcpy to ALTERNATIVE_2 macro Make REP_GOOD variant the default after alternatives have run. 
Signed-off-by: Borislav Petkov diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index bbfdacc..b046664 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,12 +1,20 @@ /* Copyright 2002 Andi Kleen */ #include - #include #include #include /* + * We build a jump to memcpy_orig by default which gets NOPped out on + * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which + * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs + * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. + */ + +.weak memcpy + +/* * memcpy - Copy a memory block. * * Input: @@ -17,15 +25,11 @@ * Output: * rax original destination */ +ENTRY(__memcpy) +ENTRY(memcpy) + ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memcpy_erms", X86_FEATURE_ERMS -/* - * memcpy_c() - fast string ops (REP MOVSQ) based variant. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c: movq %rdi, %rax movq %rdx, %rcx shrq $3, %rcx @@ -34,29 +38,21 @@ movl %edx, %ecx rep movsb ret -.Lmemcpy_e: - .previous +ENDPROC(memcpy) +ENDPROC(__memcpy) /* - * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than - * memcpy_c. Use memcpy_c_e when possible. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: + * memcpy_erms() - enhanced fast string memcpy. This is faster and + * simpler than memcpy. Use memcpy_erms when possible. */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c_e: +ENTRY(memcpy_erms) movq %rdi, %rax movq %rdx, %rcx rep movsb ret -.Lmemcpy_e_e: - .previous - -.weak memcpy +ENDPROC(memcpy_erms) -ENTRY(__memcpy) -ENTRY(memcpy) +ENTRY(memcpy_orig) CFI_STARTPROC movq %rdi, %rax @@ -183,26 +179,4 @@ ENTRY(memcpy) .Lend: retq CFI_ENDPROC -ENDPROC(memcpy) -ENDPROC(__memcpy) - - /* - * Some CPUs are adding enhanced REP MOVSB/STOSB feature - * If the feature is supported, memcpy_c_e() is the first choice. - * If enhanced rep movsb copy is not available, use fast string copy - * memcpy_c() when possible. This is faster and code is simpler than - * original memcpy(). - * Otherwise, original memcpy() is used. - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - * - * Replace only beginning, memcpy is used to apply alternatives, - * so it is silly to overwrite itself with nops - reboot is the - * only outcome... - */ - .section .altinstructions, "a" - altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ - .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c,0 - altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ - .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e,0 - .previous +ENDPROC(memcpy_orig) -- cgit v0.10.2 From 0cf55934ecace74bb7d26c0e9679fb41675a8903 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Feb 2015 18:38:16 +0100 Subject: perf/bench: Fix mem* routines usage after alternatives change Adjust perf bench to the new changes in the alternatives code for memcpy/memset. 
Reviewed-by: Hitoshi Mitake Signed-off-by: Borislav Petkov diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h index d66ab79..8c0c1a2 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -1,12 +1,12 @@ -MEMCPY_FN(__memcpy, +MEMCPY_FN(memcpy_orig, "x86-64-unrolled", "unrolled memcpy() in arch/x86/lib/memcpy_64.S") -MEMCPY_FN(memcpy_c, +MEMCPY_FN(__memcpy, "x86-64-movsq", "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") -MEMCPY_FN(memcpy_c_e, +MEMCPY_FN(memcpy_erms, "x86-64-movsb", "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S index fcd9cf0..e4c2c30 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -1,8 +1,6 @@ #define memcpy MEMCPY /* don't hide glibc's memcpy() */ #define altinstr_replacement text #define globl p2align 4; .globl -#define Lmemcpy_c globl memcpy_c; memcpy_c -#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e #include "../../../arch/x86/lib/memcpy_64.S" /* * We need to provide note.GNU-stack section, saying that we want diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index a71dff9..f02d028 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -1,12 +1,12 @@ -MEMSET_FN(__memset, +MEMSET_FN(memset_orig, "x86-64-unrolled", "unrolled memset() in arch/x86/lib/memset_64.S") -MEMSET_FN(memset_c, +MEMSET_FN(__memset, "x86-64-stosq", "movsq-based memset() in arch/x86/lib/memset_64.S") -MEMSET_FN(memset_c_e, +MEMSET_FN(memset_erms, "x86-64-stosb", "movsb-based memset() in arch/x86/lib/memset_64.S") diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S index 9e5af89..de27878 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm.S +++ b/tools/perf/bench/mem-memset-x86-64-asm.S @@ -1,8 +1,6 @@ #define memset MEMSET /* don't hide glibc's memset() */ #define altinstr_replacement text #define globl p2align 4; .globl -#define Lmemset_c globl memset_c; memset_c -#define Lmemset_c_e globl memset_c_e; memset_c_e #include "../../../arch/x86/lib/memset_64.S" /* diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h index 6789d788..3a3a0f1 100644 --- a/tools/perf/util/include/asm/alternative-asm.h +++ b/tools/perf/util/include/asm/alternative-asm.h @@ -4,5 +4,6 @@ /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ #define altinstruction_entry # +#define ALTERNATIVE_2 # #endif -- cgit v0.10.2 From 515e23f0193f42dab3b3e89f2c54da16b8bc3481 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Feb 2015 18:51:37 +0100 Subject: perf/bench: Carve out mem routine benchmarking ... so that we can call it multiple times. See next patch. 
Reviewed-by: Hitoshi Mitake Signed-off-by: Borislav Petkov diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index 6c14afe..50991c4 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -135,84 +135,32 @@ struct bench_mem_info { const char *const *usage; }; -static int bench_mem_common(int argc, const char **argv, - const char *prefix __maybe_unused, - struct bench_mem_info *info) +static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen) { - int i; - size_t len; - double totallen; + const struct routine *r = &info->routines[r_idx]; double result_bps[2]; u64 result_cycle[2]; - argc = parse_options(argc, argv, options, - info->usage, 0); - - if (no_prefault && only_prefault) { - fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); - return 1; - } - - if (use_cycle) - init_cycle(); - - len = (size_t)perf_atoll((char *)length_str); - totallen = (double)len * iterations; - result_cycle[0] = result_cycle[1] = 0ULL; result_bps[0] = result_bps[1] = 0.0; - if ((s64)len <= 0) { - fprintf(stderr, "Invalid length:%s\n", length_str); - return 1; - } - - /* same to without specifying either of prefault and no-prefault */ - if (only_prefault && no_prefault) - only_prefault = no_prefault = false; - - for (i = 0; info->routines[i].name; i++) { - if (!strcmp(info->routines[i].name, routine)) - break; - } - if (!info->routines[i].name) { - printf("Unknown routine:%s\n", routine); - printf("Available routines...\n"); - for (i = 0; info->routines[i].name; i++) { - printf("\t%s ... %s\n", - info->routines[i].name, info->routines[i].desc); - } - return 1; - } - if (bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s Bytes ...\n\n", length_str); if (!only_prefault && !no_prefault) { /* show both of results */ if (use_cycle) { - result_cycle[0] = - info->do_cycle(&info->routines[i], len, false); - result_cycle[1] = - info->do_cycle(&info->routines[i], len, true); + result_cycle[0] = info->do_cycle(r, len, false); + result_cycle[1] = info->do_cycle(r, len, true); } else { - result_bps[0] = - info->do_gettimeofday(&info->routines[i], - len, false); - result_bps[1] = - info->do_gettimeofday(&info->routines[i], - len, true); + result_bps[0] = info->do_gettimeofday(r, len, false); + result_bps[1] = info->do_gettimeofday(r, len, true); } } else { - if (use_cycle) { - result_cycle[pf] = - info->do_cycle(&info->routines[i], - len, only_prefault); - } else { - result_bps[pf] = - info->do_gettimeofday(&info->routines[i], - len, only_prefault); - } + if (use_cycle) + result_cycle[pf] = info->do_cycle(r, len, only_prefault); + else + result_bps[pf] = info->do_gettimeofday(r, len, only_prefault); } switch (bench_format) { @@ -265,6 +213,54 @@ static int bench_mem_common(int argc, const char **argv, die("unknown format: %d\n", bench_format); break; } +} + +static int bench_mem_common(int argc, const char **argv, + const char *prefix __maybe_unused, + struct bench_mem_info *info) +{ + int i; + size_t len; + double totallen; + + argc = parse_options(argc, argv, options, + info->usage, 0); + + if (no_prefault && only_prefault) { + fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); + return 1; + } + + if (use_cycle) + init_cycle(); + + len = (size_t)perf_atoll((char *)length_str); + totallen = (double)len * iterations; + + if ((s64)len <= 0) { + fprintf(stderr, "Invalid length:%s\n", length_str); + return 1; + } + + /* same to without specifying either of prefault and no-prefault */ + if 
(only_prefault && no_prefault) + only_prefault = no_prefault = false; + + for (i = 0; info->routines[i].name; i++) { + if (!strcmp(info->routines[i].name, routine)) + break; + } + if (!info->routines[i].name) { + printf("Unknown routine:%s\n", routine); + printf("Available routines...\n"); + for (i = 0; info->routines[i].name; i++) { + printf("\t%s ... %s\n", + info->routines[i].name, info->routines[i].desc); + } + return 1; + } + + __bench_mem_routine(info, i, len, totallen); return 0; } -- cgit v0.10.2 From dfecb95cdfeaf7872d83a96bec3a606e9cd95c8d Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 26 Feb 2015 19:02:43 +0100 Subject: perf/bench: Add -r all so that you can run all mem* routines perf bench mem mem{set,cpy} -r all thus runs all available mem benchmarking routines. Reviewed-by: Hitoshi Mitake Signed-off-by: Borislav Petkov diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index 50991c4..eca3d0e 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -36,7 +36,7 @@ static const struct option options[] = { "Specify length of memory to copy. " "Available units: B, KB, MB, GB and TB (upper and lower)"), OPT_STRING('r', "routine", &routine, "default", - "Specify routine to copy"), + "Specify routine to copy, \"all\" runs all available routines"), OPT_INTEGER('i', "iterations", &iterations, "repeat memcpy() invocation this number of times"), OPT_BOOLEAN('c', "cycle", &use_cycle, @@ -144,6 +144,8 @@ static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t l result_cycle[0] = result_cycle[1] = 0ULL; result_bps[0] = result_bps[1] = 0.0; + printf("Routine %s (%s)\n", r->name, r->desc); + if (bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s Bytes ...\n\n", length_str); @@ -246,6 +248,12 @@ static int bench_mem_common(int argc, const char **argv, if (only_prefault && no_prefault) only_prefault = no_prefault = false; + if (!strncmp(routine, "all", 3)) { + for (i = 0; info->routines[i].name; i++) + __bench_mem_routine(info, i, len, totallen); + return 0; + } + for (i = 0; info->routines[i].name; i++) { if (!strcmp(info->routines[i].name, routine)) break; -- cgit v0.10.2
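As a usage sketch of the interface this series converges on (hypothetical code, not part of the series; it is modeled on the rdtsc_barrier() conversion above): once build-time padding is in place, the old-instruction slot of an alternative can simply be left empty. The assembler pads the site with NOPs sized to the longest replacement, and at boot apply_alternatives() either patches in the selected replacement or rewrites the padding with optimal NOPs. The function name below is invented and the feature flags are only an example; the call signature is the alternative_2() macro added in this series.

/*
 * Hypothetical sketch, not from the series. Assumes <asm/alternative.h>
 * and <asm/cpufeature.h> for alternative_2() and the feature bits.
 * The empty "" oldinstr is padded with NOPs at build time; at patch
 * time the site is either replaced or NOP-optimized.
 */
#include <asm/alternative.h>
#include <asm/cpufeature.h>

static __always_inline void example_speculation_fence(void)
{
	alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
			  "lfence", X86_FEATURE_LFENCE_RDTSC);
}

The point of the empty oldinstr is that callers no longer need to hand-pick an ASM_NOPx of the right width, which is exactly the bookkeeping the SMAP and rdtsc_barrier() conversions in this series drop.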