From 426932909093e4e7729777a0e2beed4b54911361 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 5 Jan 2012 16:12:25 +0000 Subject: x86-64: Slightly shorten copy_page() %r13 got saved and restored without ever getting touched, so there's no need to do so. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/4F05D9F9020000780006AA0D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 01c805b..6b34d04 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -20,14 +20,12 @@ ENDPROC(copy_page_c) ENTRY(copy_page) CFI_STARTPROC - subq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET 3*8 + subq $2*8,%rsp + CFI_ADJUST_CFA_OFFSET 2*8 movq %rbx,(%rsp) CFI_REL_OFFSET rbx, 0 movq %r12,1*8(%rsp) CFI_REL_OFFSET r12, 1*8 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13, 2*8 movl $(4096/64)-5,%ecx .p2align 4 @@ -91,10 +89,8 @@ ENTRY(copy_page) CFI_RESTORE rbx movq 1*8(%rsp),%r12 CFI_RESTORE r12 - movq 2*8(%rsp),%r13 - CFI_RESTORE r13 - addq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET -3*8 + addq $2*8,%rsp + CFI_ADJUST_CFA_OFFSET -2*8 ret .Lcopy_page_end: CFI_ENDPROC -- cgit v0.10.2 From 5d7244e7c984cecead412bde6395ce18618a4a37 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 5 Jan 2012 16:10:42 +0000 Subject: x86-64: Fix memset() to support sizes of 4Gb and above While currently there doesn't appear to be any reachable in-tree case where such large memory blocks may be passed to memset() (alloc_bootmem() being the primary non-reachable one, as it gets called with suitably large sizes in FLATMEM configurations), we have recently hit the problem a second time in our Xen kernels. Rather than working around it a second time, prevent others from falling into the same trap by fixing this long standing limitation. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/4F05D992020000780006AA09@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 79bd454..2dcb380 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -19,16 +19,15 @@ .section .altinstr_replacement, "ax", @progbits .Lmemset_c: movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx + movq %rdx,%rcx + andl $7,%edx + shrq $3,%rcx /* expand byte value */ movzbl %sil,%esi movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ + imulq %rsi,%rax rep stosq - movl %r8d,%ecx + movl %edx,%ecx rep stosb movq %r9,%rax ret @@ -50,7 +49,7 @@ .Lmemset_c_e: movq %rdi,%r9 movb %sil,%al - movl %edx,%ecx + movq %rdx,%rcx rep stosb movq %r9,%rax ret @@ -61,12 +60,11 @@ ENTRY(memset) ENTRY(__memset) CFI_STARTPROC movq %rdi,%r10 - movq %rdx,%r11 /* expand byte value */ movzbl %sil,%ecx movabs $0x0101010101010101,%rax - mul %rcx /* with rax, clobbers rdx */ + imulq %rcx,%rax /* align dst */ movl %edi,%r9d @@ -75,13 +73,13 @@ ENTRY(__memset) CFI_REMEMBER_STATE .Lafter_bad_alignment: - movl %r11d,%ecx - shrl $6,%ecx + movq %rdx,%rcx + shrq $6,%rcx jz .Lhandle_tail .p2align 4 .Lloop_64: - decl %ecx + decq %rcx movq %rax,(%rdi) movq %rax,8(%rdi) movq %rax,16(%rdi) @@ -97,7 +95,7 @@ ENTRY(__memset) to predict jump tables. 
*/ .p2align 4 .Lhandle_tail: - movl %r11d,%ecx + movl %edx,%ecx andl $63&(~7),%ecx jz .Lhandle_7 shrl $3,%ecx @@ -109,12 +107,11 @@ ENTRY(__memset) jnz .Lloop_8 .Lhandle_7: - movl %r11d,%ecx - andl $7,%ecx + andl $7,%edx jz .Lende .p2align 4 .Lloop_1: - decl %ecx + decl %edx movb %al,(%rdi) leaq 1(%rdi),%rdi jnz .Lloop_1 @@ -125,13 +122,13 @@ ENTRY(__memset) CFI_RESTORE_STATE .Lbad_alignment: - cmpq $7,%r11 + cmpq $7,%rdx jbe .Lhandle_7 movq %rax,(%rdi) /* unaligned store */ movq $8,%r8 subq %r9,%r8 addq %r8,%rdi - subq %r8,%r11 + subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC -- cgit v0.10.2 From 2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 26 Jan 2012 15:50:55 +0000 Subject: x86-64: Fix memcpy() to support sizes of 4Gb and above While currently there doesn't appear to be any reachable in-tree case where such large memory blocks may be passed to memcpy(), we already had hit the problem in our Xen kernels. Just like done recently for memset(), rather than working around it, prevent others from falling into the same trap by fixing this long standing limitation. Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F21846F020000780006F3FA@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index efbf2a0..1235b04 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -27,9 +27,8 @@ .section .altinstr_replacement, "ax", @progbits .Lmemcpy_c: movq %rdi, %rax - - movl %edx, %ecx - shrl $3, %ecx + movq %rdx, %rcx + shrq $3, %rcx andl $7, %edx rep movsq movl %edx, %ecx @@ -48,8 +47,7 @@ .section .altinstr_replacement, "ax", @progbits .Lmemcpy_c_e: movq %rdi, %rax - - movl %edx, %ecx + movq %rdx, %rcx rep movsb ret .Lmemcpy_e_e: @@ -60,10 +58,7 @@ ENTRY(memcpy) CFI_STARTPROC movq %rdi, %rax - /* - * Use 32bit CMP here to avoid long NOP padding. - */ - cmp $0x20, %edx + cmpq $0x20, %rdx jb .Lhandle_tail /* @@ -72,7 +67,7 @@ ENTRY(memcpy) */ cmp %dil, %sil jl .Lcopy_backward - subl $0x20, %edx + subq $0x20, %rdx .Lcopy_forward_loop: subq $0x20, %rdx @@ -91,7 +86,7 @@ ENTRY(memcpy) movq %r11, 3*8(%rdi) leaq 4*8(%rdi), %rdi jae .Lcopy_forward_loop - addq $0x20, %rdx + addl $0x20, %edx jmp .Lhandle_tail .Lcopy_backward: @@ -123,11 +118,11 @@ ENTRY(memcpy) /* * Calculate copy position to head. */ - addq $0x20, %rdx + addl $0x20, %edx subq %rdx, %rsi subq %rdx, %rdi .Lhandle_tail: - cmpq $16, %rdx + cmpl $16, %edx jb .Lless_16bytes /* * Move data from 16 bytes to 31 bytes. */ @@ -144,7 +139,7 @@ ENTRY(memcpy) retq .p2align 4 .Lless_16bytes: - cmpq $8, %rdx + cmpl $8, %edx jb .Lless_8bytes /* * Move data from 8 bytes to 15 bytes. */ @@ -156,7 +151,7 @@ ENTRY(memcpy) retq .p2align 4 .Lless_8bytes: - cmpq $4, %rdx + cmpl $4, %edx jb .Lless_3bytes /* -- cgit v0.10.2 From 9d8e22777e66f420e46490e9fc6f8cb7e0e2222b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 26 Jan 2012 15:55:32 +0000 Subject: x86-64: Handle byte-wise tail copying in memcpy() without a loop While hard to measure, reducing the number of possibly/likely mis-predicted branches can generally be expected to yield a slight performance benefit. Contrary to what might appear at first glance, this also doesn't grow the function size (the alignment gap to the next function just gets smaller).
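To illustrate the new tail handling: instead of a byte-copy loop, the 1..3 remaining bytes are moved with a pair of possibly overlapping loads and stores, so the loop branch disappears. A rough C sketch of the idea (illustrative only, with a made-up function name; the actual implementation is the assembly in the diff below):

static void copy_tail_1to3(unsigned char *dst, const unsigned char *src,
                           unsigned long len)
{
	unsigned char first, second, last;

	if (len == 0)
		return;
	first = src[0];
	if (len > 1) {
		/* For len == 2 the two stores overlap; for len == 3 they
		 * cover bytes 1 and 2. */
		second = src[1];
		last = src[len - 1];
		dst[1] = second;
		dst[len - 1] = last;
	}
	dst[0] = first;
}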
Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 1235b04..1c273be 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -164,18 +164,19 @@ ENTRY(memcpy) retq .p2align 4 .Lless_3bytes: - cmpl $0, %edx - je .Lend + subl $1, %edx + jb .Lend /* * Move data from 1 bytes to 3 bytes. */ -.Lloop_1: - movb (%rsi), %r8b - movb %r8b, (%rdi) - incq %rdi - incq %rsi - decl %edx - jnz .Lloop_1 + movzbl (%rsi), %ecx + jz .Lstore_1byte + movzbq 1(%rsi), %r8 + movzbq (%rsi, %rdx), %r9 + movb %r8b, 1(%rdi) + movb %r9b, (%rdi, %rdx) +.Lstore_1byte: + movb %cl, (%rdi) .Lend: retq -- cgit v0.10.2 From 7931d493051ea9b09e4fddee2dc40b2eb88d62b9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2012 15:06:26 +0000 Subject: x86/spinlocks: Eliminate TICKET_MASK The definition of it being questionable already (unnecessarily including a cast), and it being used in a single place that can be written shorter without it, remove this #define. Along the same lines, simplify __ticket_spin_is_locked()'s main expression, which was the more convoluted way because of needs that went away with the recent type changes by Jeremy. This is pure cleanup, no functional change intended. Signed-off-by: Jan Beulich Acked-by: Jeremy Fitzhardinge Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F2C06020200007800071066@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index a82c2bf..76bfa2c 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -88,14 +88,14 @@ static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return !!(tmp.tail ^ tmp.head); + return tmp.tail != tmp.head; } static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return ((tmp.tail - tmp.head) & TICKET_MASK) > 1; + return (__ticket_t)(tmp.tail - tmp.head) > 1; } #ifndef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 8ebd5df..ad0ad07 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -16,7 +16,6 @@ typedef u32 __ticketpair_t; #endif #define TICKET_SHIFT (sizeof(__ticket_t) * 8) -#define TICKET_MASK ((__ticket_t)((1 << TICKET_SHIFT) - 1)) typedef struct arch_spinlock { union { -- cgit v0.10.2 From e0891a9816316b5e05fd5b0453ffe9fd6a56f489 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Feb 2012 22:39:18 +0000 Subject: bitops: Adjust the comment on get_order() to describe the size==0 case Adjust the comment on get_order() to note that the result of passing a size of 0 results in an undefined value. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120220223917.16199.9416.stgit@warthog.procyon.org.uk Acked-by: Arnd Bergmann Signed-off-by: H. 
Peter Anvin diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h index 67e7245..76e9687 100644 --- a/include/asm-generic/getorder.h +++ b/include/asm-generic/getorder.h @@ -5,7 +5,28 @@ #include -/* Pure 2^n version of get_order */ +/** + * get_order - Determine the allocation order of a memory size + * @size: The size for which to get the order + * + * Determine the allocation order of a particular sized block of memory. This + * is on a logarithmic scale, where: + * + * 0 -> 2^0 * PAGE_SIZE and below + * 1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1 + * 2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1 + * 3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1 + * 4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1 + * ... + * + * The order returned is used to find the smallest allocation granule required + * to hold an object of the specified size. + * + * The result is undefined if the size is 0. + * + * This function may be used to initialise variables with compile time + * evaluations of constants. + */ static inline __attribute_const__ int get_order(unsigned long size) { int order; -- cgit v0.10.2 From d66acc39c7cee323733c8503b9de1821a56dff7e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Feb 2012 22:39:29 +0000 Subject: bitops: Optimise get_order() Optimise get_order() to use bit scanning instructions if such exist rather than a loop. Also, make it possible to use get_order() in static initialisations too by building it on top of ilog2() in the constant parameter case. This has been tested for i386 and x86_64 using the following userspace program, and for FRV by making appropriate substitutions for fls() and fls64(). It will abort if the case for get_order() deviates from the original except for the order of 0, for which get_order() produces an undefined result. This program tests both dynamic and static parameters. #include #include #ifdef __x86_64__ #define BITS_PER_LONG 64 #else #define BITS_PER_LONG 32 #endif #define PAGE_SHIFT 12 typedef unsigned long long __u64, u64; typedef unsigned int __u32, u32; #define noinline __attribute__((noinline)) static inline int fls(int x) { int bitpos = -1; asm("bsrl %1,%0" : "+r" (bitpos) : "rm" (x)); return bitpos + 1; } static __always_inline int fls64(__u64 x) { #if BITS_PER_LONG == 64 long bitpos = -1; asm("bsrq %1,%0" : "+r" (bitpos) : "rm" (x)); return bitpos + 1; #else __u32 h = x >> 32, l = x; int bitpos = -1; asm("bsrl %1,%0 \n" "subl %2,%0 \n" "bsrl %3,%0 \n" : "+r" (bitpos) : "rm" (l), "i"(32), "rm" (h)); return bitpos + 33; #endif } static inline __attribute__((const)) int __ilog2_u32(u32 n) { return fls(n) - 1; } static inline __attribute__((const)) int __ilog2_u64(u64 n) { return fls64(n) - 1; } extern __attribute__((const, noreturn)) int ____ilog2_NaN(void); #define ilog2(n) \ ( \ __builtin_constant_p(n) ? ( \ (n) < 1 ? ____ilog2_NaN() : \ (n) & (1ULL << 63) ? 63 : \ (n) & (1ULL << 62) ? 62 : \ (n) & (1ULL << 61) ? 61 : \ (n) & (1ULL << 60) ? 60 : \ (n) & (1ULL << 59) ? 59 : \ (n) & (1ULL << 58) ? 58 : \ (n) & (1ULL << 57) ? 57 : \ (n) & (1ULL << 56) ? 56 : \ (n) & (1ULL << 55) ? 55 : \ (n) & (1ULL << 54) ? 54 : \ (n) & (1ULL << 53) ? 53 : \ (n) & (1ULL << 52) ? 52 : \ (n) & (1ULL << 51) ? 51 : \ (n) & (1ULL << 50) ? 50 : \ (n) & (1ULL << 49) ? 49 : \ (n) & (1ULL << 48) ? 48 : \ (n) & (1ULL << 47) ? 47 : \ (n) & (1ULL << 46) ? 46 : \ (n) & (1ULL << 45) ? 45 : \ (n) & (1ULL << 44) ? 44 : \ (n) & (1ULL << 43) ? 43 : \ (n) & (1ULL << 42) ? 42 : \ (n) & (1ULL << 41) ? 41 : \ (n) & (1ULL << 40) ? 40 : \ (n) & (1ULL << 39) ? 
39 : \ (n) & (1ULL << 38) ? 38 : \ (n) & (1ULL << 37) ? 37 : \ (n) & (1ULL << 36) ? 36 : \ (n) & (1ULL << 35) ? 35 : \ (n) & (1ULL << 34) ? 34 : \ (n) & (1ULL << 33) ? 33 : \ (n) & (1ULL << 32) ? 32 : \ (n) & (1ULL << 31) ? 31 : \ (n) & (1ULL << 30) ? 30 : \ (n) & (1ULL << 29) ? 29 : \ (n) & (1ULL << 28) ? 28 : \ (n) & (1ULL << 27) ? 27 : \ (n) & (1ULL << 26) ? 26 : \ (n) & (1ULL << 25) ? 25 : \ (n) & (1ULL << 24) ? 24 : \ (n) & (1ULL << 23) ? 23 : \ (n) & (1ULL << 22) ? 22 : \ (n) & (1ULL << 21) ? 21 : \ (n) & (1ULL << 20) ? 20 : \ (n) & (1ULL << 19) ? 19 : \ (n) & (1ULL << 18) ? 18 : \ (n) & (1ULL << 17) ? 17 : \ (n) & (1ULL << 16) ? 16 : \ (n) & (1ULL << 15) ? 15 : \ (n) & (1ULL << 14) ? 14 : \ (n) & (1ULL << 13) ? 13 : \ (n) & (1ULL << 12) ? 12 : \ (n) & (1ULL << 11) ? 11 : \ (n) & (1ULL << 10) ? 10 : \ (n) & (1ULL << 9) ? 9 : \ (n) & (1ULL << 8) ? 8 : \ (n) & (1ULL << 7) ? 7 : \ (n) & (1ULL << 6) ? 6 : \ (n) & (1ULL << 5) ? 5 : \ (n) & (1ULL << 4) ? 4 : \ (n) & (1ULL << 3) ? 3 : \ (n) & (1ULL << 2) ? 2 : \ (n) & (1ULL << 1) ? 1 : \ (n) & (1ULL << 0) ? 0 : \ ____ilog2_NaN() \ ) : \ (sizeof(n) <= 4) ? \ __ilog2_u32(n) : \ __ilog2_u64(n) \ ) static noinline __attribute__((const)) int old_get_order(unsigned long size) { int order; size = (size - 1) >> (PAGE_SHIFT - 1); order = -1; do { size >>= 1; order++; } while (size); return order; } static noinline __attribute__((const)) int __get_order(unsigned long size) { int order; size--; size >>= PAGE_SHIFT; #if BITS_PER_LONG == 32 order = fls(size); #else order = fls64(size); #endif return order; } #define get_order(n) \ ( \ __builtin_constant_p(n) ? ( \ (n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ ((n < (1UL << PAGE_SHIFT)) ? 0 : \ ilog2((n) - 1) - PAGE_SHIFT + 1) \ ) : \ __get_order(n) \ ) #define order(N) \ { (1UL << N) - 1, get_order((1UL << N) - 1) }, \ { (1UL << N), get_order((1UL << N)) }, \ { (1UL << N) + 1, get_order((1UL << N) + 1) } struct order { unsigned long n, order; }; static const struct order order_table[] = { order(0), order(1), order(2), order(3), order(4), order(5), order(6), order(7), order(8), order(9), order(10), order(11), order(12), order(13), order(14), order(15), order(16), order(17), order(18), order(19), order(20), order(21), order(22), order(23), order(24), order(25), order(26), order(27), order(28), order(29), order(30), order(31), #if BITS_PER_LONG == 64 order(32), order(33), order(34), order(35), #endif { 0x2929 } }; void check(int loop, unsigned long n) { unsigned long old, new; printf("[%2d]: %09lx | ", loop, n); old = old_get_order(n); new = get_order(n); printf("%3ld, %3ld\n", old, new); if (n != 0 && old != new) abort(); } int main(int argc, char **argv) { const struct order *p; unsigned long n; int loop; for (loop = 0; loop <= BITS_PER_LONG - 1; loop++) { n = 1UL << loop; check(loop, n - 1); check(loop, n); check(loop, n + 1); } for (p = order_table; p->n != 0x2929; p++) { unsigned long old, new; old = old_get_order(p->n); new = p->order; printf("%09lx\t%3ld, %3ld\n", p->n, old, new); if (p->n != 0 && old != new) abort(); } return 0; } Disassembling the x86_64 version of the above code shows: 0000000000400510 : 400510: 48 83 ef 01 sub $0x1,%rdi 400514: b8 ff ff ff ff mov $0xffffffff,%eax 400519: 48 c1 ef 0b shr $0xb,%rdi 40051d: 0f 1f 00 nopl (%rax) 400520: 83 c0 01 add $0x1,%eax 400523: 48 d1 ef shr %rdi 400526: 75 f8 jne 400520 400528: f3 c3 repz retq 40052a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 0000000000400530 <__get_order>: 400530: 48 83 ef 01 sub $0x1,%rdi 400534: 48 c7 c0 ff ff ff ff mov 
$0xffffffffffffffff,%rax 40053b: 48 c1 ef 0c shr $0xc,%rdi 40053f: 48 0f bd c7 bsr %rdi,%rax 400543: 83 c0 01 add $0x1,%eax 400546: c3 retq 400547: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) 40054e: 00 00 As can be seen, the new __get_order() function is simpler than the old_get_order() function. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120220223928.16199.29548.stgit@warthog.procyon.org.uk Acked-by: Arnd Bergmann Signed-off-by: H. Peter Anvin diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h index 76e9687..e0fb4bf 100644 --- a/include/asm-generic/getorder.h +++ b/include/asm-generic/getorder.h @@ -4,6 +4,25 @@ #ifndef __ASSEMBLY__ #include +#include + +/* + * Runtime evaluation of get_order() + */ +static inline __attribute_const__ +int __get_order(unsigned long size) +{ + int order; + + size--; + size >>= PAGE_SHIFT; +#if BITS_PER_LONG == 32 + order = fls(size); +#else + order = fls64(size); +#endif + return order; +} /** * get_order - Determine the allocation order of a memory size @@ -27,18 +46,15 @@ * This function may be used to initialise variables with compile time * evaluations of constants. */ -static inline __attribute_const__ int get_order(unsigned long size) -{ - int order; - - size = (size - 1) >> (PAGE_SHIFT - 1); - order = -1; - do { - size >>= 1; - order++; - } while (size); - return order; -} +#define get_order(n) \ +( \ + __builtin_constant_p(n) ? ( \ + (n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ + ((n < (1UL << PAGE_SHIFT)) ? 0 : \ + ilog2((n) - 1) - PAGE_SHIFT + 1) \ + ) : \ + __get_order(n) \ +) #endif /* __ASSEMBLY__ */ -- cgit v0.10.2 From b893485db994b17402524d3d700b950294cb6c97 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 24 Feb 2012 13:58:15 +0100 Subject: bitops: Add missing parentheses to new get_order macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new get_order macro introduced in commit d66acc39c7cee323733c8503b9de1821a56dff7e does not use parentheses around all uses of the parameter n. This causes new compile warnings, for example in amd_iommu_init.c: drivers/iommu/amd_iommu_init.c:561:6: warning: suggest parentheses around comparison in operand of ‘&’ [-Wparentheses] drivers/iommu/amd_iommu_init.c:561:6: warning: suggest parentheses around comparison in operand of ‘&’ [-Wparentheses] Fix those warnings by adding the missing parentheses. Reported-by: Ingo Molnar Cc: David Howells Acked-by: Arnd Bergmann Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1330088295-28732-1-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h index e0fb4bf..65e4468 100644 --- a/include/asm-generic/getorder.h +++ b/include/asm-generic/getorder.h @@ -49,8 +49,8 @@ int __get_order(unsigned long size) #define get_order(n) \ ( \ __builtin_constant_p(n) ? ( \ - (n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ - ((n < (1UL << PAGE_SHIFT)) ? 0 : \ + ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ + (((n) < (1UL << PAGE_SHIFT)) ? 0 : \ ilog2((n) - 1) - PAGE_SHIFT + 1) \ ) : \ __get_order(n) \ -- cgit v0.10.2 From 626109130267713cac020515504ec341e47c96f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 14:54:37 +0000 Subject: x86-64: Fix CFI annotations for NMI nesting code The saving and restoring of %rdx wasn't annotated at all, and the jumping over sections where state gets partly restored wasn't handled either.
Further, by folding the pushing of the previous frame in repeat_nmi into that which so far was immediately preceding restart_nmi (after moving the restore of %rdx ahead of that, since it doesn't get used anymore when pushing prior frames), annotations of the replicated frame creations can be made consistent too. v2: Fully fold repeat_nmi into the normal code flow (adding a single redundant instruction to the "normal" code path), thus retaining the special protection of all instructions between repeat_nmi and end_repeat_nmi. Link: http://lkml.kernel.org/r/4F478B630200007800074A31@nat28.tlf.novell.com Signed-off-by: Jan Beulich Signed-off-by: Steven Rostedt diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1333d98..e0eca00 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1530,6 +1530,7 @@ ENTRY(nmi) /* Use %rdx as out temp variable throughout */ pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 /* * If %cs was not the kernel segment, then the NMI triggered in user @@ -1554,6 +1555,7 @@ ENTRY(nmi) */ lea 6*8(%rsp), %rdx test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + CFI_REMEMBER_STATE nested_nmi: /* @@ -1585,10 +1587,12 @@ nested_nmi: nested_nmi_out: popq_cfi %rdx + CFI_RESTORE rdx /* No need to check faults here */ INTERRUPT_RETURN + CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1624,6 +1628,10 @@ first_nmi: * NMI may zero out. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ + /* Do not pop rdx, nested NMIs will corrupt it */ + movq (%rsp), %rdx + CFI_RESTORE rdx + /* Set the NMI executing variable on the stack. */ pushq_cfi $1 @@ -1631,14 +1639,31 @@ first_nmi: .rept 5 pushq_cfi 6*8(%rsp) .endr + CFI_DEF_CFA_OFFSET SS+8-RIP + + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. + */ +repeat_nmi: + /* + * Update the stack variable to say we are still in NMI (the update + * is benign for the non-repeat case, where 1 was pushed just above + * to this very stack slot). + */ + movq $1, 5*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ .rept 5 pushq_cfi 4*8(%rsp) .endr - - /* Do not pop rdx, nested NMIs will corrupt it */ - movq 11*8(%rsp), %rdx + CFI_DEF_CFA_OFFSET SS+8-RIP +end_repeat_nmi: /* * Everything below this point can be preempted by a nested @@ -1646,7 +1671,6 @@ first_nmi: * caused by an exception and nested NMI will start here, and * can still be preempted by another NMI. */ -restart_nmi: pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1675,26 +1699,6 @@ nmi_restore: CFI_ENDPROC END(nmi) - /* - * If an NMI hit an iret because of an exception or breakpoint, - * it can lose its NMI context, and a nested NMI may come in. - * In that case, the nested NMI will change the preempted NMI's - * stack to jump to here when it does the final iret. 
- */ -repeat_nmi: - INTR_FRAME - /* Update the stack variable to say we are still in NMI */ - movq $1, 5*8(%rsp) - - /* copy the saved stack back to copy stack */ - .rept 5 - pushq_cfi 4*8(%rsp) - .endr - - jmp restart_nmi - CFI_ENDPROC -end_repeat_nmi: - ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax -- cgit v0.10.2 From 69466466ce889cd2cbc8cda9ff1c6083f48cc7f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 11:55:01 +0000 Subject: x86-64: Improve insn scheduling in SAVE_ARGS_IRQ In one case, use an address register that was computed earlier (and with a simpler instruction), thus reducing the risk of a stall. In the second case, eliminate a branch by using a conditional move (as is already done in call_softirq and xen_do_hypervisor_callback). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F4788A50200007800074A26@nat28.tlf.novell.com Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb..211b2e1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -319,7 +319,7 @@ ENDPROC(native_usergs_sysret64) movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS(%rdi) + testl $3, CS-RBP(%rsi) je 1f SWAPGS /* @@ -329,11 +329,10 @@ ENDPROC(native_usergs_sysret64) * moving irq_enter into assembly, which would be too much work) */ 1: incl PER_CPU_VAR(irq_count) - jne 2f - mov PER_CPU_VAR(irq_stack_ptr),%rsp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi -2: /* Store previous stack value */ + /* Store previous stack value */ pushq %rsi CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 0x77 /* DW_OP_breg7 */, 0, \ -- cgit v0.10.2 From 79fb4ad63e8266ffac1f69bbb45a6f86570493e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Feb 2012 15:55:13 -0500 Subject: x86: Fix the NMI nesting comments Some of the comments for the nesting NMI algorithm were stale and had some references to some prototypes that were first tried. I also updated the comments to be a little easier to understand the flow of the code. It definitely needs the documentation. Signed-off-by: Steven Rostedt diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e0eca00..2de3e45 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1624,11 +1624,12 @@ first_nmi: * | pt_regs | * +-------------------------+ * - * The saved RIP is used to fix up the copied RIP that a nested - * NMI may zero out. The original stack frame and the temp storage + * The saved stack frame is used to fix up the copied stack frame + * that a nested NMI may change to make the interrupted NMI iret jump + * to the repeat_nmi. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ - /* Do not pop rdx, nested NMIs will corrupt it */ + /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ movq (%rsp), %rdx CFI_RESTORE rdx @@ -1641,6 +1642,8 @@ first_nmi: .endr CFI_DEF_CFA_OFFSET SS+8-RIP + /* Everything up to here is safe from nested NMIs */ + /* * If there was a nested NMI, the first NMI's iret will return * here. But NMIs are still enabled and we can take another @@ -1667,9 +1670,8 @@ end_repeat_nmi: /* * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception. Repeated NMIs - * caused by an exception and nested NMI will start here, and - * can still be preempted by another NMI. 
+ * NMI if the first NMI took an exception and reset our iret stack + * so that we repeat another NMI. */ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp -- cgit v0.10.2 From 901b04450a0ff44d579158b8b0492ce7e66cd442 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 3 Mar 2012 19:27:27 +0800 Subject: x86/numa: Improve internode cache alignment Currently cache alignment among nodes in the kernel is still 128 bytes on x86 NUMA machines - we got that X86_INTERNODE_CACHE_SHIFT default from old P4 processors. But now most modern x86 CPUs use the same size: 64 bytes from L1 to last level L3, so let's remove the incorrect setting, and directly use the L1 cache size to do SMP cache line alignment. This patch saves some memory space on kernel data, and it also improves the cache locality of kernel data. The System.map is quite different with/without this change: before patch after patch ... 000000000000b000 d tlb_vector_| 000000000000b000 d tlb_vector 000000000000b080 d cpu_loops_p| 000000000000b040 d cpu_loops_ ... Signed-off-by: Alex Shi Cc: asit.k.mallick@intel.com Link: http://lkml.kernel.org/r/1330774047-18597-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 3c57033..6443c6f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -303,7 +303,6 @@ config X86_GENERIC config X86_INTERNODE_CACHE_SHIFT int default "12" if X86_VSMP - default "7" if NUMA default X86_L1_CACHE_SHIFT config X86_CMPXCHG -- cgit v0.10.2 From 0d2bf4899d04fcc7f3a280b0bc74c084badb4e04 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:18:02 +0000 Subject: x86: Tighten dependencies of CPU_SUP_*_32 Building in support for either of these CPUs is pointless when e.g. M686 was selected (since such a kernel would use cmov instructions, which aren't available on these older CPUs). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F58875A02000078000770E0@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6443c6f..706e12e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -440,7 +440,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on !64BIT + depends on M386 || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -494,7 +494,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on !64BIT + depends on M386 || M486 || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for UMC processors -- cgit v0.10.2 From c7e23289a6aa95048a78b252b462f24ca6cf7f96 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:23:14 +0000 Subject: x86/32: Print control and debug registers for kernel context While for a user mode register dump it may be reasonable to skip those (albeit x86-64 doesn't do so), for kernel mode dumps these should be printed to make sure all information possibly necessary for analysis is available.
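The effect of the change below is that show_registers() asks __show_regs() for the extended dump exactly when the interrupted context was kernel mode. A simplified, self-contained C sketch of that decision (illustrative names and selector values, not the actual kernel code, which also has to account for vm86 mode):

#include <stdio.h>

struct fake_regs {
	unsigned long cs;	/* saved code segment selector */
};

/* The low two bits of CS are the privilege level; 3 means user mode. */
static int is_user_mode(const struct fake_regs *regs)
{
	return (regs->cs & 3) == 3;
}

static void show_regs_sketch(const struct fake_regs *regs, int all)
{
	printf("general purpose and segment registers ...\n");
	if (!all)
		return;
	/* Only kernel-mode dumps go on to print CR0/CR2/CR3/CR4 and DR0-DR7. */
	printf("control and debug registers ...\n");
}

int main(void)
{
	struct fake_regs kernel_ctx = { .cs = 0x60 };	/* illustrative kernel CS */
	struct fake_regs user_ctx   = { .cs = 0x73 };	/* illustrative user CS */

	/* Mirrors __show_regs(regs, !user_mode_vm(regs)) from the patch. */
	show_regs_sketch(&kernel_ctx, !is_user_mode(&kernel_ctx));
	show_regs_sketch(&user_ctx, !is_user_mode(&user_ctx));
	return 0;
}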
Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F58889202000078000770E7@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c99f9ed..88ec912 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs) int i; print_modules(); - __show_regs(regs, 0); + __show_regs(regs, !user_mode_vm(regs)); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), -- cgit v0.10.2 From a240ada241dafe290e7532d1ddeb98fdf1419068 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:24:57 +0000 Subject: x86: Include probe_roms.h in probe_roms.c ... to ensure that declarations and definitions are in sync. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F5888F902000078000770F1@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e8..0bc72e2 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include -- cgit v0.10.2
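The benefit of having probe_roms.c include its own header is that the compiler can compare each definition against the declaration it is supposed to match. A hypothetical illustration of the idea (file and function names made up, not from the kernel tree):

/* probe_demo.h */
void probe_demo(unsigned long base);

/* probe_demo.c */
#include "probe_demo.h"

/*
 * Because the declaration above is visible here, any drift in the definition,
 * such as an extra parameter or a different return type, is reported as
 * "conflicting types for 'probe_demo'" instead of being silently accepted.
 */
void probe_demo(unsigned long base)
{
	(void)base;
}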