From 426932909093e4e7729777a0e2beed4b54911361 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 5 Jan 2012 16:12:25 +0000 Subject: x86-64: Slightly shorten copy_page() %r13 got saved and restored without ever getting touched, so there's no need to do so. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/4F05D9F9020000780006AA0D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 01c805b..6b34d04 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -20,14 +20,12 @@ ENDPROC(copy_page_c) ENTRY(copy_page) CFI_STARTPROC - subq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET 3*8 + subq $2*8,%rsp + CFI_ADJUST_CFA_OFFSET 2*8 movq %rbx,(%rsp) CFI_REL_OFFSET rbx, 0 movq %r12,1*8(%rsp) CFI_REL_OFFSET r12, 1*8 - movq %r13,2*8(%rsp) - CFI_REL_OFFSET r13, 2*8 movl $(4096/64)-5,%ecx .p2align 4 @@ -91,10 +89,8 @@ ENTRY(copy_page) CFI_RESTORE rbx movq 1*8(%rsp),%r12 CFI_RESTORE r12 - movq 2*8(%rsp),%r13 - CFI_RESTORE r13 - addq $3*8,%rsp - CFI_ADJUST_CFA_OFFSET -3*8 + addq $2*8,%rsp + CFI_ADJUST_CFA_OFFSET -2*8 ret .Lcopy_page_end: CFI_ENDPROC -- cgit v0.10.2 From 5d7244e7c984cecead412bde6395ce18618a4a37 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 5 Jan 2012 16:10:42 +0000 Subject: x86-64: Fix memset() to support sizes of 4Gb and above While currently there doesn't appear to be any reachable in-tree case where such large memory blocks may be passed to memset() (alloc_bootmem() being the primary non-reachable one, as it gets called with suitably large sizes in FLATMEM configurations), we have recently hit the problem a second time in our Xen kernels. Rather than working around it a second time, prevent others from falling into the same trap by fixing this long standing limitation. Signed-off-by: Jan Beulich Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/4F05D992020000780006AA09@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 79bd454..2dcb380 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -19,16 +19,15 @@ .section .altinstr_replacement, "ax", @progbits .Lmemset_c: movq %rdi,%r9 - movl %edx,%r8d - andl $7,%r8d - movl %edx,%ecx - shrl $3,%ecx + movq %rdx,%rcx + andl $7,%edx + shrq $3,%rcx /* expand byte value */ movzbl %sil,%esi movabs $0x0101010101010101,%rax - mulq %rsi /* with rax, clobbers rdx */ + imulq %rsi,%rax rep stosq - movl %r8d,%ecx + movl %edx,%ecx rep stosb movq %r9,%rax ret @@ -50,7 +49,7 @@ .Lmemset_c_e: movq %rdi,%r9 movb %sil,%al - movl %edx,%ecx + movq %rdx,%rcx rep stosb movq %r9,%rax ret @@ -61,12 +60,11 @@ ENTRY(memset) ENTRY(__memset) CFI_STARTPROC movq %rdi,%r10 - movq %rdx,%r11 /* expand byte value */ movzbl %sil,%ecx movabs $0x0101010101010101,%rax - mul %rcx /* with rax, clobbers rdx */ + imulq %rcx,%rax /* align dst */ movl %edi,%r9d @@ -75,13 +73,13 @@ ENTRY(__memset) CFI_REMEMBER_STATE .Lafter_bad_alignment: - movl %r11d,%ecx - shrl $6,%ecx + movq %rdx,%rcx + shrq $6,%rcx jz .Lhandle_tail .p2align 4 .Lloop_64: - decl %ecx + decq %rcx movq %rax,(%rdi) movq %rax,8(%rdi) movq %rax,16(%rdi) @@ -97,7 +95,7 @@ ENTRY(__memset) to predict jump tables. 
*/ .p2align 4 .Lhandle_tail: - movl %r11d,%ecx + movl %edx,%ecx andl $63&(~7),%ecx jz .Lhandle_7 shrl $3,%ecx @@ -109,12 +107,11 @@ ENTRY(__memset) jnz .Lloop_8 .Lhandle_7: - movl %r11d,%ecx - andl $7,%ecx + andl $7,%edx jz .Lende .p2align 4 .Lloop_1: - decl %ecx + decl %edx movb %al,(%rdi) leaq 1(%rdi),%rdi jnz .Lloop_1 @@ -125,13 +122,13 @@ ENTRY(__memset) CFI_RESTORE_STATE .Lbad_alignment: - cmpq $7,%r11 + cmpq $7,%rdx jbe .Lhandle_7 movq %rax,(%rdi) /* unaligned store */ movq $8,%r8 subq %r9,%r8 addq %r8,%rdi - subq %r8,%r11 + subq %r8,%rdx jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC -- cgit v0.10.2 From 2ab560911a427fdc73bfd3a7d2944d8ee0ca6db8 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 26 Jan 2012 15:50:55 +0000 Subject: x86-64: Fix memcpy() to support sizes of 4Gb and above While currently there doesn't appear to be any reachable in-tree case where such large memory blocks may be passed to memcpy(), we already had hit the problem in our Xen kernels. Just like done recently for memset(), rather than working around it, prevent others from falling into the same trap by fixing this long standing limitation. Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F21846F020000780006F3FA@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index efbf2a0..1235b04 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -27,9 +27,8 @@ .section .altinstr_replacement, "ax", @progbits .Lmemcpy_c: movq %rdi, %rax - - movl %edx, %ecx - shrl $3, %ecx + movq %rdx, %rcx + shrq $3, %rcx andl $7, %edx rep movsq movl %edx, %ecx @@ -48,8 +47,7 @@ .section .altinstr_replacement, "ax", @progbits .Lmemcpy_c_e: movq %rdi, %rax - - movl %edx, %ecx + movq %rdx, %rcx rep movsb ret .Lmemcpy_e_e: @@ -60,10 +58,7 @@ ENTRY(memcpy) CFI_STARTPROC movq %rdi, %rax - /* - * Use 32bit CMP here to avoid long NOP padding. - */ - cmp $0x20, %edx + cmpq $0x20, %rdx jb .Lhandle_tail /* @@ -72,7 +67,7 @@ ENTRY(memcpy) */ cmp %dil, %sil jl .Lcopy_backward - subl $0x20, %edx + subq $0x20, %rdx .Lcopy_forward_loop: subq $0x20, %rdx @@ -91,7 +86,7 @@ ENTRY(memcpy) movq %r11, 3*8(%rdi) leaq 4*8(%rdi), %rdi jae .Lcopy_forward_loop - addq $0x20, %rdx + addl $0x20, %edx jmp .Lhandle_tail .Lcopy_backward: @@ -123,11 +118,11 @@ ENTRY(memcpy) /* * Calculate copy position to head. */ - addq $0x20, %rdx + addl $0x20, %edx subq %rdx, %rsi subq %rdx, %rdi .Lhandle_tail: - cmpq $16, %rdx + cmpl $16, %edx jb .Lless_16bytes /* * Move data from 16 bytes to 31 bytes. */ @@ -144,7 +139,7 @@ ENTRY(memcpy) retq .p2align 4 .Lless_16bytes: - cmpq $8, %rdx + cmpl $8, %edx jb .Lless_8bytes /* * Move data from 8 bytes to 15 bytes. */ @@ -156,7 +151,7 @@ ENTRY(memcpy) retq .p2align 4 .Lless_8bytes: - cmpq $4, %rdx + cmpl $4, %edx jb .Lless_3bytes /* -- cgit v0.10.2 From 9d8e22777e66f420e46490e9fc6f8cb7e0e2222b Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 26 Jan 2012 15:55:32 +0000 Subject: x86-64: Handle byte-wise tail copying in memcpy() without a loop While hard to measure, reducing the number of possibly/likely mis-predicted branches can generally be expected to yield a slight performance benefit. Contrary to what might appear at first glance, this also doesn't grow the function size (the alignment gap to the next function just gets smaller).
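To illustrate the new tail handling: instead of a byte-copy loop, the 1..3 remaining bytes are moved with a pair of possibly overlapping loads and stores, so the loop branch disappears. A rough C sketch of the idea (illustrative only, with a made-up function name; the actual implementation is the assembly in the diff below):

static void copy_tail_1to3(unsigned char *dst, const unsigned char *src,
                           unsigned long len)
{
	unsigned char first, second, last;

	if (len == 0)
		return;
	first = src[0];
	if (len > 1) {
		/* For len == 2 the two stores overlap; for len == 3 they
		 * cover bytes 1 and 2. */
		second = src[1];
		last = src[len - 1];
		dst[1] = second;
		dst[len - 1] = last;
	}
	dst[0] = first;
}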
Signed-off-by: Jan Beulich Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4F218584020000780006F422@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 1235b04..1c273be 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -164,18 +164,19 @@ ENTRY(memcpy) retq .p2align 4 .Lless_3bytes: - cmpl $0, %edx - je .Lend + subl $1, %edx + jb .Lend /* * Move data from 1 bytes to 3 bytes. */ -.Lloop_1: - movb (%rsi), %r8b - movb %r8b, (%rdi) - incq %rdi - incq %rsi - decl %edx - jnz .Lloop_1 + movzbl (%rsi), %ecx + jz .Lstore_1byte + movzbq 1(%rsi), %r8 + movzbq (%rsi, %rdx), %r9 + movb %r8b, 1(%rdi) + movb %r9b, (%rdi, %rdx) +.Lstore_1byte: + movb %cl, (%rdi) .Lend: retq -- cgit v0.10.2 From 7931d493051ea9b09e4fddee2dc40b2eb88d62b9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 3 Feb 2012 15:06:26 +0000 Subject: x86/spinlocks: Eliminate TICKET_MASK The definition of it being questionable already (unnecessarily including a cast), and it being used in a single place that can be written shorter without it, remove this #define. Along the same lines, simplify __ticket_spin_is_locked()'s main expression, which was the more convoluted way because of needs that went away with the recent type changes by Jeremy. This is pure cleanup, no functional change intended. Signed-off-by: Jan Beulich Acked-by: Jeremy Fitzhardinge Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/4F2C06020200007800071066@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index a82c2bf..76bfa2c 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -88,14 +88,14 @@ static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return !!(tmp.tail ^ tmp.head); + return tmp.tail != tmp.head; } static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return ((tmp.tail - tmp.head) & TICKET_MASK) > 1; + return (__ticket_t)(tmp.tail - tmp.head) > 1; } #ifndef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 8ebd5df..ad0ad07 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -16,7 +16,6 @@ typedef u32 __ticketpair_t; #endif #define TICKET_SHIFT (sizeof(__ticket_t) * 8) -#define TICKET_MASK ((__ticket_t)((1 << TICKET_SHIFT) - 1)) typedef struct arch_spinlock { union { -- cgit v0.10.2 From e0891a9816316b5e05fd5b0453ffe9fd6a56f489 Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Feb 2012 22:39:18 +0000 Subject: bitops: Adjust the comment on get_order() to describe the size==0 case Adjust the comment on get_order() to note that the result of passing a size of 0 results in an undefined value. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120220223917.16199.9416.stgit@warthog.procyon.org.uk Acked-by: Arnd Bergmann Signed-off-by: H. 
Peter Anvin diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h index 67e7245..76e9687 100644 --- a/include/asm-generic/getorder.h +++ b/include/asm-generic/getorder.h @@ -5,7 +5,28 @@ #include -/* Pure 2^n version of get_order */ +/** + * get_order - Determine the allocation order of a memory size + * @size: The size for which to get the order + * + * Determine the allocation order of a particular sized block of memory. This + * is on a logarithmic scale, where: + * + * 0 -> 2^0 * PAGE_SIZE and below + * 1 -> 2^1 * PAGE_SIZE to 2^0 * PAGE_SIZE + 1 + * 2 -> 2^2 * PAGE_SIZE to 2^1 * PAGE_SIZE + 1 + * 3 -> 2^3 * PAGE_SIZE to 2^2 * PAGE_SIZE + 1 + * 4 -> 2^4 * PAGE_SIZE to 2^3 * PAGE_SIZE + 1 + * ... + * + * The order returned is used to find the smallest allocation granule required + * to hold an object of the specified size. + * + * The result is undefined if the size is 0. + * + * This function may be used to initialise variables with compile time + * evaluations of constants. + */ static inline __attribute_const__ int get_order(unsigned long size) { int order; -- cgit v0.10.2 From d66acc39c7cee323733c8503b9de1821a56dff7e Mon Sep 17 00:00:00 2001 From: David Howells Date: Mon, 20 Feb 2012 22:39:29 +0000 Subject: bitops: Optimise get_order() Optimise get_order() to use bit scanning instructions if such exist rather than a loop. Also, make it possible to use get_order() in static initialisations too by building it on top of ilog2() in the constant parameter case. This has been tested for i386 and x86_64 using the following userspace program, and for FRV by making appropriate substitutions for fls() and fls64(). It will abort if the case for get_order() deviates from the original except for the order of 0, for which get_order() produces an undefined result. This program tests both dynamic and static parameters. #include #include #ifdef __x86_64__ #define BITS_PER_LONG 64 #else #define BITS_PER_LONG 32 #endif #define PAGE_SHIFT 12 typedef unsigned long long __u64, u64; typedef unsigned int __u32, u32; #define noinline __attribute__((noinline)) static inline int fls(int x) { int bitpos = -1; asm("bsrl %1,%0" : "+r" (bitpos) : "rm" (x)); return bitpos + 1; } static __always_inline int fls64(__u64 x) { #if BITS_PER_LONG == 64 long bitpos = -1; asm("bsrq %1,%0" : "+r" (bitpos) : "rm" (x)); return bitpos + 1; #else __u32 h = x >> 32, l = x; int bitpos = -1; asm("bsrl %1,%0 \n" "subl %2,%0 \n" "bsrl %3,%0 \n" : "+r" (bitpos) : "rm" (l), "i"(32), "rm" (h)); return bitpos + 33; #endif } static inline __attribute__((const)) int __ilog2_u32(u32 n) { return fls(n) - 1; } static inline __attribute__((const)) int __ilog2_u64(u64 n) { return fls64(n) - 1; } extern __attribute__((const, noreturn)) int ____ilog2_NaN(void); #define ilog2(n) \ ( \ __builtin_constant_p(n) ? ( \ (n) < 1 ? ____ilog2_NaN() : \ (n) & (1ULL << 63) ? 63 : \ (n) & (1ULL << 62) ? 62 : \ (n) & (1ULL << 61) ? 61 : \ (n) & (1ULL << 60) ? 60 : \ (n) & (1ULL << 59) ? 59 : \ (n) & (1ULL << 58) ? 58 : \ (n) & (1ULL << 57) ? 57 : \ (n) & (1ULL << 56) ? 56 : \ (n) & (1ULL << 55) ? 55 : \ (n) & (1ULL << 54) ? 54 : \ (n) & (1ULL << 53) ? 53 : \ (n) & (1ULL << 52) ? 52 : \ (n) & (1ULL << 51) ? 51 : \ (n) & (1ULL << 50) ? 50 : \ (n) & (1ULL << 49) ? 49 : \ (n) & (1ULL << 48) ? 48 : \ (n) & (1ULL << 47) ? 47 : \ (n) & (1ULL << 46) ? 46 : \ (n) & (1ULL << 45) ? 45 : \ (n) & (1ULL << 44) ? 44 : \ (n) & (1ULL << 43) ? 43 : \ (n) & (1ULL << 42) ? 42 : \ (n) & (1ULL << 41) ? 41 : \ (n) & (1ULL << 40) ? 40 : \ (n) & (1ULL << 39) ? 
39 : \ (n) & (1ULL << 38) ? 38 : \ (n) & (1ULL << 37) ? 37 : \ (n) & (1ULL << 36) ? 36 : \ (n) & (1ULL << 35) ? 35 : \ (n) & (1ULL << 34) ? 34 : \ (n) & (1ULL << 33) ? 33 : \ (n) & (1ULL << 32) ? 32 : \ (n) & (1ULL << 31) ? 31 : \ (n) & (1ULL << 30) ? 30 : \ (n) & (1ULL << 29) ? 29 : \ (n) & (1ULL << 28) ? 28 : \ (n) & (1ULL << 27) ? 27 : \ (n) & (1ULL << 26) ? 26 : \ (n) & (1ULL << 25) ? 25 : \ (n) & (1ULL << 24) ? 24 : \ (n) & (1ULL << 23) ? 23 : \ (n) & (1ULL << 22) ? 22 : \ (n) & (1ULL << 21) ? 21 : \ (n) & (1ULL << 20) ? 20 : \ (n) & (1ULL << 19) ? 19 : \ (n) & (1ULL << 18) ? 18 : \ (n) & (1ULL << 17) ? 17 : \ (n) & (1ULL << 16) ? 16 : \ (n) & (1ULL << 15) ? 15 : \ (n) & (1ULL << 14) ? 14 : \ (n) & (1ULL << 13) ? 13 : \ (n) & (1ULL << 12) ? 12 : \ (n) & (1ULL << 11) ? 11 : \ (n) & (1ULL << 10) ? 10 : \ (n) & (1ULL << 9) ? 9 : \ (n) & (1ULL << 8) ? 8 : \ (n) & (1ULL << 7) ? 7 : \ (n) & (1ULL << 6) ? 6 : \ (n) & (1ULL << 5) ? 5 : \ (n) & (1ULL << 4) ? 4 : \ (n) & (1ULL << 3) ? 3 : \ (n) & (1ULL << 2) ? 2 : \ (n) & (1ULL << 1) ? 1 : \ (n) & (1ULL << 0) ? 0 : \ ____ilog2_NaN() \ ) : \ (sizeof(n) <= 4) ? \ __ilog2_u32(n) : \ __ilog2_u64(n) \ ) static noinline __attribute__((const)) int old_get_order(unsigned long size) { int order; size = (size - 1) >> (PAGE_SHIFT - 1); order = -1; do { size >>= 1; order++; } while (size); return order; } static noinline __attribute__((const)) int __get_order(unsigned long size) { int order; size--; size >>= PAGE_SHIFT; #if BITS_PER_LONG == 32 order = fls(size); #else order = fls64(size); #endif return order; } #define get_order(n) \ ( \ __builtin_constant_p(n) ? ( \ (n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ ((n < (1UL << PAGE_SHIFT)) ? 0 : \ ilog2((n) - 1) - PAGE_SHIFT + 1) \ ) : \ __get_order(n) \ ) #define order(N) \ { (1UL << N) - 1, get_order((1UL << N) - 1) }, \ { (1UL << N), get_order((1UL << N)) }, \ { (1UL << N) + 1, get_order((1UL << N) + 1) } struct order { unsigned long n, order; }; static const struct order order_table[] = { order(0), order(1), order(2), order(3), order(4), order(5), order(6), order(7), order(8), order(9), order(10), order(11), order(12), order(13), order(14), order(15), order(16), order(17), order(18), order(19), order(20), order(21), order(22), order(23), order(24), order(25), order(26), order(27), order(28), order(29), order(30), order(31), #if BITS_PER_LONG == 64 order(32), order(33), order(34), order(35), #endif { 0x2929 } }; void check(int loop, unsigned long n) { unsigned long old, new; printf("[%2d]: %09lx | ", loop, n); old = old_get_order(n); new = get_order(n); printf("%3ld, %3ld\n", old, new); if (n != 0 && old != new) abort(); } int main(int argc, char **argv) { const struct order *p; unsigned long n; int loop; for (loop = 0; loop <= BITS_PER_LONG - 1; loop++) { n = 1UL << loop; check(loop, n - 1); check(loop, n); check(loop, n + 1); } for (p = order_table; p->n != 0x2929; p++) { unsigned long old, new; old = old_get_order(p->n); new = p->order; printf("%09lx\t%3ld, %3ld\n", p->n, old, new); if (p->n != 0 && old != new) abort(); } return 0; } Disassembling the x86_64 version of the above code shows: 0000000000400510 : 400510: 48 83 ef 01 sub $0x1,%rdi 400514: b8 ff ff ff ff mov $0xffffffff,%eax 400519: 48 c1 ef 0b shr $0xb,%rdi 40051d: 0f 1f 00 nopl (%rax) 400520: 83 c0 01 add $0x1,%eax 400523: 48 d1 ef shr %rdi 400526: 75 f8 jne 400520 400528: f3 c3 repz retq 40052a: 66 0f 1f 44 00 00 nopw 0x0(%rax,%rax,1) 0000000000400530 <__get_order>: 400530: 48 83 ef 01 sub $0x1,%rdi 400534: 48 c7 c0 ff ff ff ff mov 
$0xffffffffffffffff,%rax 40053b: 48 c1 ef 0c shr $0xc,%rdi 40053f: 48 0f bd c7 bsr %rdi,%rax 400543: 83 c0 01 add $0x1,%eax 400546: c3 retq 400547: 66 0f 1f 84 00 00 00 nopw 0x0(%rax,%rax,1) 40054e: 00 00 As can be seen, the new __get_order() function is simpler than the old_get_order() function. Signed-off-by: David Howells Link: http://lkml.kernel.org/r/20120220223928.16199.29548.stgit@warthog.procyon.org.uk Acked-by: Arnd Bergmann Signed-off-by: H. Peter Anvin diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h index 76e9687..e0fb4bf 100644 --- a/include/asm-generic/getorder.h +++ b/include/asm-generic/getorder.h @@ -4,6 +4,25 @@ #ifndef __ASSEMBLY__ #include +#include + +/* + * Runtime evaluation of get_order() + */ +static inline __attribute_const__ +int __get_order(unsigned long size) +{ + int order; + + size--; + size >>= PAGE_SHIFT; +#if BITS_PER_LONG == 32 + order = fls(size); +#else + order = fls64(size); +#endif + return order; +} /** * get_order - Determine the allocation order of a memory size @@ -27,18 +46,15 @@ * This function may be used to initialise variables with compile time * evaluations of constants. */ -static inline __attribute_const__ int get_order(unsigned long size) -{ - int order; - - size = (size - 1) >> (PAGE_SHIFT - 1); - order = -1; - do { - size >>= 1; - order++; - } while (size); - return order; -} +#define get_order(n) \ +( \ + __builtin_constant_p(n) ? ( \ + (n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ + ((n < (1UL << PAGE_SHIFT)) ? 0 : \ + ilog2((n) - 1) - PAGE_SHIFT + 1) \ + ) : \ + __get_order(n) \ +) #endif /* __ASSEMBLY__ */ -- cgit v0.10.2 From b893485db994b17402524d3d700b950294cb6c97 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 24 Feb 2012 13:58:15 +0100 Subject: bitops: Add missing parentheses to new get_order macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The new get_order macro introduced in commit d66acc39c7cee323733c8503b9de1821a56dff7e does not use parentheses around all uses of the parameter n. This causes new compile warnings, for example in amd_iommu_init.c: drivers/iommu/amd_iommu_init.c:561:6: warning: suggest parentheses around comparison in operand of ‘&’ [-Wparentheses] drivers/iommu/amd_iommu_init.c:561:6: warning: suggest parentheses around comparison in operand of ‘&’ [-Wparentheses] Fix those warnings by adding the missing parentheses. Reported-by: Ingo Molnar Cc: David Howells Acked-by: Arnd Bergmann Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1330088295-28732-1-git-send-email-joerg.roedel@amd.com Signed-off-by: H. Peter Anvin diff --git a/include/asm-generic/getorder.h b/include/asm-generic/getorder.h index e0fb4bf..65e4468 100644 --- a/include/asm-generic/getorder.h +++ b/include/asm-generic/getorder.h @@ -49,8 +49,8 @@ int __get_order(unsigned long size) #define get_order(n) \ ( \ __builtin_constant_p(n) ? ( \ - (n == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ - ((n < (1UL << PAGE_SHIFT)) ? 0 : \ + ((n) == 0UL) ? BITS_PER_LONG - PAGE_SHIFT : \ + (((n) < (1UL << PAGE_SHIFT)) ? 0 : \ ilog2((n) - 1) - PAGE_SHIFT + 1) \ ) : \ __get_order(n) \ -- cgit v0.10.2 From 626109130267713cac020515504ec341e47c96f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 14:54:37 +0000 Subject: x86-64: Fix CFI annotations for NMI nesting code The saving and restoring of %rdx wasn't annotated at all, and the jumping over sections where state gets partly restored wasn't handled either.
Further, by folding the pushing of the previous frame in repeat_nmi into that which so far was immediately preceding restart_nmi (after moving the restore of %rdx ahead of that, since it doesn't get used anymore when pushing prior frames), annotations of the replicated frame creations can be made consistent too. v2: Fully fold repeat_nmi into the normal code flow (adding a single redundant instruction to the "normal" code path), thus retaining the special protection of all instructions between repeat_nmi and end_repeat_nmi. Link: http://lkml.kernel.org/r/4F478B630200007800074A31@nat28.tlf.novell.com Signed-off-by: Jan Beulich Signed-off-by: Steven Rostedt diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1333d98..e0eca00 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1530,6 +1530,7 @@ ENTRY(nmi) /* Use %rdx as out temp variable throughout */ pushq_cfi %rdx + CFI_REL_OFFSET rdx, 0 /* * If %cs was not the kernel segment, then the NMI triggered in user @@ -1554,6 +1555,7 @@ ENTRY(nmi) */ lea 6*8(%rsp), %rdx test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + CFI_REMEMBER_STATE nested_nmi: /* @@ -1585,10 +1587,12 @@ nested_nmi: nested_nmi_out: popq_cfi %rdx + CFI_RESTORE rdx /* No need to check faults here */ INTERRUPT_RETURN + CFI_RESTORE_STATE first_nmi: /* * Because nested NMIs will use the pushed location that we @@ -1624,6 +1628,10 @@ first_nmi: * NMI may zero out. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ + /* Do not pop rdx, nested NMIs will corrupt it */ + movq (%rsp), %rdx + CFI_RESTORE rdx + /* Set the NMI executing variable on the stack. */ pushq_cfi $1 @@ -1631,14 +1639,31 @@ first_nmi: .rept 5 pushq_cfi 6*8(%rsp) .endr + CFI_DEF_CFA_OFFSET SS+8-RIP + + /* + * If there was a nested NMI, the first NMI's iret will return + * here. But NMIs are still enabled and we can take another + * nested NMI. The nested NMI checks the interrupted RIP to see + * if it is between repeat_nmi and end_repeat_nmi, and if so + * it will just return, as we are about to repeat an NMI anyway. + * This makes it safe to copy to the stack frame that a nested + * NMI will update. + */ +repeat_nmi: + /* + * Update the stack variable to say we are still in NMI (the update + * is benign for the non-repeat case, where 1 was pushed just above + * to this very stack slot). + */ + movq $1, 5*8(%rsp) /* Make another copy, this one may be modified by nested NMIs */ .rept 5 pushq_cfi 4*8(%rsp) .endr - - /* Do not pop rdx, nested NMIs will corrupt it */ - movq 11*8(%rsp), %rdx + CFI_DEF_CFA_OFFSET SS+8-RIP +end_repeat_nmi: /* * Everything below this point can be preempted by a nested @@ -1646,7 +1671,6 @@ first_nmi: * caused by an exception and nested NMI will start here, and * can still be preempted by another NMI. */ -restart_nmi: pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 @@ -1675,26 +1699,6 @@ nmi_restore: CFI_ENDPROC END(nmi) - /* - * If an NMI hit an iret because of an exception or breakpoint, - * it can lose its NMI context, and a nested NMI may come in. - * In that case, the nested NMI will change the preempted NMI's - * stack to jump to here when it does the final iret. 
- */ -repeat_nmi: - INTR_FRAME - /* Update the stack variable to say we are still in NMI */ - movq $1, 5*8(%rsp) - - /* copy the saved stack back to copy stack */ - .rept 5 - pushq_cfi 4*8(%rsp) - .endr - - jmp restart_nmi - CFI_ENDPROC -end_repeat_nmi: - ENTRY(ignore_sysret) CFI_STARTPROC mov $-ENOSYS,%eax -- cgit v0.10.2 From 69466466ce889cd2cbc8cda9ff1c6083f48cc7f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Fri, 24 Feb 2012 11:55:01 +0000 Subject: x86-64: Improve insn scheduling in SAVE_ARGS_IRQ In one case, use an address register that was computed earlier (and with a simpler instruction), thus reducing the risk of a stall. In the second case, eliminate a branch by using a conditional move (as is already done in call_softirq and xen_do_hypervisor_callback). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F4788A50200007800074A26@nat28.tlf.novell.com Reviewed-by: Andi Kleen Signed-off-by: H. Peter Anvin diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a20e1cb..211b2e1 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -319,7 +319,7 @@ ENDPROC(native_usergs_sysret64) movq %rsp, %rsi leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS(%rdi) + testl $3, CS-RBP(%rsi) je 1f SWAPGS /* @@ -329,11 +329,10 @@ ENDPROC(native_usergs_sysret64) * moving irq_enter into assembly, which would be too much work) */ 1: incl PER_CPU_VAR(irq_count) - jne 2f - mov PER_CPU_VAR(irq_stack_ptr),%rsp + cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi -2: /* Store previous stack value */ + /* Store previous stack value */ pushq %rsi CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ 0x77 /* DW_OP_breg7 */, 0, \ -- cgit v0.10.2 From 79fb4ad63e8266ffac1f69bbb45a6f86570493e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Feb 2012 15:55:13 -0500 Subject: x86: Fix the NMI nesting comments Some of the comments for the nesting NMI algorithm were stale and had some references to some prototypes that were first tried. I also updated the comments to be a little easier to understand the flow of the code. It definitely needs the documentation. Signed-off-by: Steven Rostedt diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index e0eca00..2de3e45 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1624,11 +1624,12 @@ first_nmi: * | pt_regs | * +-------------------------+ * - * The saved RIP is used to fix up the copied RIP that a nested - * NMI may zero out. The original stack frame and the temp storage + * The saved stack frame is used to fix up the copied stack frame + * that a nested NMI may change to make the interrupted NMI iret jump + * to the repeat_nmi. The original stack frame and the temp storage * is also used by nested NMIs and can not be trusted on exit. */ - /* Do not pop rdx, nested NMIs will corrupt it */ + /* Do not pop rdx, nested NMIs will corrupt that part of the stack */ movq (%rsp), %rdx CFI_RESTORE rdx @@ -1641,6 +1642,8 @@ first_nmi: .endr CFI_DEF_CFA_OFFSET SS+8-RIP + /* Everything up to here is safe from nested NMIs */ + /* * If there was a nested NMI, the first NMI's iret will return * here. But NMIs are still enabled and we can take another @@ -1667,9 +1670,8 @@ end_repeat_nmi: /* * Everything below this point can be preempted by a nested - * NMI if the first NMI took an exception. Repeated NMIs - * caused by an exception and nested NMI will start here, and - * can still be preempted by another NMI. 
+ * NMI if the first NMI took an exception and reset our iret stack + * so that we repeat another NMI. */ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ subq $ORIG_RAX-R15, %rsp -- cgit v0.10.2 From 901b04450a0ff44d579158b8b0492ce7e66cd442 Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Sat, 3 Mar 2012 19:27:27 +0800 Subject: x86/numa: Improve internode cache alignment Currently cache alignment among nodes in the kernel is still 128 bytes on x86 NUMA machines - we got that X86_INTERNODE_CACHE_SHIFT default from old P4 processors. But now most modern x86 CPUs use the same size: 64 bytes from L1 to last level L3, so let's remove the incorrect setting, and directly use the L1 cache size to do SMP cache line alignment. This patch saves some memory space on kernel data, and it also improves the cache locality of kernel data. The System.map is quite different with/without this change: before patch after patch ... 000000000000b000 d tlb_vector_| 000000000000b000 d tlb_vector 000000000000b080 d cpu_loops_p| 000000000000b040 d cpu_loops_ ... Signed-off-by: Alex Shi Cc: asit.k.mallick@intel.com Link: http://lkml.kernel.org/r/1330774047-18597-1-git-send-email-alex.shi@intel.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 3c57033..6443c6f 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -303,7 +303,6 @@ config X86_GENERIC config X86_INTERNODE_CACHE_SHIFT int default "12" if X86_VSMP - default "7" if NUMA default X86_L1_CACHE_SHIFT config X86_CMPXCHG -- cgit v0.10.2 From 0d2bf4899d04fcc7f3a280b0bc74c084badb4e04 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:18:02 +0000 Subject: x86: Tighten dependencies of CPU_SUP_*_32 Building in support for either of these CPUs is pointless when e.g. M686 was selected (since such a kernel would use cmov instructions, which aren't available on these older CPUs). Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F58875A02000078000770E0@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 6443c6f..706e12e 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -440,7 +440,7 @@ config CPU_SUP_INTEL config CPU_SUP_CYRIX_32 default y bool "Support Cyrix processors" if PROCESSOR_SELECT - depends on !64BIT + depends on M386 || M486 || M586 || M586TSC || M586MMX || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for Cyrix processors @@ -494,7 +494,7 @@ config CPU_SUP_TRANSMETA_32 config CPU_SUP_UMC_32 default y bool "Support UMC processors" if PROCESSOR_SELECT - depends on !64BIT + depends on M386 || M486 || (EXPERT && !64BIT) ---help--- This enables detection, tunings and quirks for UMC processors -- cgit v0.10.2 From c7e23289a6aa95048a78b252b462f24ca6cf7f96 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:23:14 +0000 Subject: x86/32: Print control and debug registers for kernel context While for a user mode register dump it may be reasonable to skip those (albeit x86-64 doesn't do so), for kernel mode dumps these should be printed to make sure all information possibly necessary for analysis is available.
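The effect of the change below is that show_registers() asks __show_regs() for the extended dump exactly when the interrupted context was kernel mode. A simplified, self-contained C sketch of that decision (illustrative names and selector values, not the actual kernel code, which also has to account for vm86 mode):

#include <stdio.h>

struct fake_regs {
	unsigned long cs;	/* saved code segment selector */
};

/* The low two bits of CS are the privilege level; 3 means user mode. */
static int is_user_mode(const struct fake_regs *regs)
{
	return (regs->cs & 3) == 3;
}

static void show_regs_sketch(const struct fake_regs *regs, int all)
{
	printf("general purpose and segment registers ...\n");
	if (!all)
		return;
	/* Only kernel-mode dumps go on to print CR0/CR2/CR3/CR4 and DR0-DR7. */
	printf("control and debug registers ...\n");
}

int main(void)
{
	struct fake_regs kernel_ctx = { .cs = 0x60 };	/* illustrative kernel CS */
	struct fake_regs user_ctx   = { .cs = 0x73 };	/* illustrative user CS */

	/* Mirrors __show_regs(regs, !user_mode_vm(regs)) from the patch. */
	show_regs_sketch(&kernel_ctx, !is_user_mode(&kernel_ctx));
	show_regs_sketch(&user_ctx, !is_user_mode(&user_ctx));
	return 0;
}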
Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F58889202000078000770E7@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index c99f9ed..88ec912 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -87,7 +87,7 @@ void show_registers(struct pt_regs *regs) int i; print_modules(); - __show_regs(regs, 0); + __show_regs(regs, !user_mode_vm(regs)); printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)\n", TASK_COMM_LEN, current->comm, task_pid_nr(current), -- cgit v0.10.2 From a240ada241dafe290e7532d1ddeb98fdf1419068 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 8 Mar 2012 09:24:57 +0000 Subject: x86: Include probe_roms.h in probe_roms.c ... to ensure that declarations and definitions are in sync. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/4F5888F902000078000770F1@nat28.tlf.novell.com Signed-off-by: Ingo Molnar diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 34e06e8..0bc72e2 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -12,6 +12,7 @@ #include #include +#include #include #include #include -- cgit v0.10.2
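The benefit of having probe_roms.c include its own header is that the compiler can compare each definition against the declaration it is supposed to match. A hypothetical illustration of the idea (file and function names made up, not from the kernel tree):

/* probe_demo.h */
void probe_demo(unsigned long base);

/* probe_demo.c */
#include "probe_demo.h"

/*
 * Because the declaration above is visible here, any drift in the definition,
 * such as an extra parameter or a different return type, is reported as
 * "conflicting types for 'probe_demo'" instead of being silently accepted.
 */
void probe_demo(unsigned long base)
{
	(void)base;
}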