diff options
Diffstat (limited to 'arch/x86')
110 files changed, 2418 insertions, 1557 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index efb4294..5731eb7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -26,6 +26,8 @@ config X86 select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_MEMBLOCK + select HAVE_MEMBLOCK_NODE_MAP + select ARCH_DISCARD_MEMBLOCK select ARCH_WANT_OPTIONAL_GPIOLIB select ARCH_WANT_FRAME_POINTERS select HAVE_DMA_ATTRS @@ -204,9 +206,6 @@ config ZONE_DMA32 bool default X86_64 -config ARCH_POPULATES_NODE_MAP - def_bool y - config AUDIT_ARCH bool default X86_64 @@ -343,6 +342,7 @@ config X86_EXTENDED_PLATFORM If you enable this option then you'll be able to select support for the following (non-PC) 64 bit x86 platforms: + Numascale NumaChip ScaleMP vSMP SGI Ultraviolet @@ -351,6 +351,18 @@ config X86_EXTENDED_PLATFORM endif # This is an alphabetically sorted list of 64 bit extended platforms # Please maintain the alphabetic order if and when there are additions +config X86_NUMACHIP + bool "Numascale NumaChip" + depends on X86_64 + depends on X86_EXTENDED_PLATFORM + depends on NUMA + depends on SMP + depends on X86_X2APIC + depends on !EDAC_AMD64 + ---help--- + Adds support for Numascale NumaChip large-SMP systems. Needed to + enable more than ~168 cores. + If you don't have one of these, you should say N here. config X86_VSMP bool "ScaleMP vSMP" diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a6253ec..3e27456 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -134,7 +134,7 @@ ENTRY(ia32_sysenter_target) CFI_REL_OFFSET rsp,0 pushfq_cfi /*CFI_REL_OFFSET rflags,0*/ - movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d + movl TI_sysenter_return+THREAD_INFO(%rsp,3*8-KERNEL_STACK_OFFSET),%r10d CFI_REGISTER rip,r10 pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ @@ -150,9 +150,8 @@ ENTRY(ia32_sysenter_target) .section __ex_table,"a" .quad 1b,ia32_badarg .previous - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,TI_status(%r10) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax @@ -162,13 +161,12 @@ sysenter_do_call: sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags(%r10) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,TI_status(%r10) + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) /* clear IF, that popfq doesn't enable interrupts early */ andl $~0x200,EFLAGS-R11(%rsp) movl RIP-R11(%rsp),%edx /* User %eip */ @@ -205,7 +203,7 @@ sysexit_from_sys_call: .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz ia32_ret_from_sys_call TRACE_IRQS_ON sti @@ -215,12 +213,11 @@ sysexit_from_sys_call: movzbl %al,%edi /* zero-extend that into %edi */ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ call audit_syscall_exit - GET_THREAD_INFO(%r10) movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi cli TRACE_IRQS_OFF - testl %edi,TI_flags(%r10) + testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz \exit CLEAR_RREGS -ARGOFFSET jmp int_with_check @@ -238,7 +235,7 @@ sysexit_audit: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz sysenter_auditsys #endif SAVE_REST @@ -309,9 +306,8 @@ ENTRY(ia32_cstar_target) .section __ex_table,"a" .quad 1b,ia32_badarg .previous - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,TI_status(%r10) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax @@ -321,13 +317,12 @@ cstar_do_call: cstar_dispatch: call *ia32_sys_call_table(,%rax,8) movq %rax,RAX-ARGOFFSET(%rsp) - GET_THREAD_INFO(%r10) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags(%r10) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT,TI_status(%r10) + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) RESTORE_ARGS 0,-ARG_SKIP,0,0,0 movl RIP-ARGOFFSET(%rsp),%ecx CFI_REGISTER rip,rcx @@ -355,7 +350,7 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%r10) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz cstar_auditsys #endif xchgl %r9d,%ebp @@ -420,9 +415,8 @@ ENTRY(ia32_syscall) /* note the registers are not zero extended to the sf. this could be a problem. */ SAVE_ARGS 0,1,0 - GET_THREAD_INFO(%r10) - orl $TS_COMPAT,TI_status(%r10) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%r10) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys @@ -459,8 +453,8 @@ quiet_ni_syscall: CFI_ENDPROC .macro PTREGSCALL label, func, arg - .globl \label -\label: + ALIGN +GLOBAL(\label) leaq \func(%rip),%rax leaq -ARGOFFSET+8(%rsp),\arg /* 8 for return address */ jmp ia32_ptregs_common @@ -477,7 +471,8 @@ quiet_ni_syscall: PTREGSCALL stub32_vfork, sys_vfork, %rdi PTREGSCALL stub32_iopl, sys_iopl, %rsi -ENTRY(ia32_ptregs_common) + ALIGN +ia32_ptregs_common: popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 091508b..952bd01 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -4,10 +4,10 @@ #ifdef CONFIG_SMP .macro LOCK_PREFIX -1: lock +672: lock .section .smp_locks,"a" .balign 4 - .long 1b - . + .long 672b - . .previous .endm #else diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 1a6c09a..3ab9bdd 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -176,6 +176,7 @@ static inline u64 native_x2apic_icr_read(void) } extern int x2apic_phys; +extern int x2apic_preenabled; extern void check_x2apic(void); extern void enable_x2apic(void); extern void x2apic_icr_write(u32 low, u32 id); @@ -198,6 +199,9 @@ static inline void x2apic_force_phys(void) x2apic_phys = 1; } #else +static inline void disable_x2apic(void) +{ +} static inline void check_x2apic(void) { } @@ -212,6 +216,7 @@ static inline void x2apic_force_phys(void) { } +#define nox2apic 0 #define x2apic_preenabled 0 #define x2apic_supported() 0 #endif @@ -410,6 +415,7 @@ extern int wakeup_secondary_cpu_via_nmi(int apicid, unsigned long start_eip); #endif #ifdef CONFIG_X86_LOCAL_APIC + static inline u32 apic_read(u32 reg) { return apic->read(reg); diff --git a/arch/x86/include/asm/apic_flat_64.h b/arch/x86/include/asm/apic_flat_64.h new file mode 100644 index 0000000..a2d3127 --- /dev/null +++ b/arch/x86/include/asm/apic_flat_64.h @@ -0,0 +1,7 @@ +#ifndef _ASM_X86_APIC_FLAT_64_H +#define _ASM_X86_APIC_FLAT_64_H + +extern void flat_init_apic_ldr(void); + +#endif + diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 3925d80..134bba0 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -144,6 +144,7 @@ #define APIC_BASE (fix_to_virt(FIX_APIC_BASE)) #define APIC_BASE_MSR 0x800 +#define XAPIC_ENABLE (1UL << 11) #define X2APIC_ENABLE (1UL << 10) #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 1775d6e..b97596e 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -380,6 +380,8 @@ static inline unsigned long __fls(unsigned long word) return word; } +#undef ADDR + #ifdef __KERNEL__ /** * ffs - find first set bit in word @@ -395,10 +397,25 @@ static inline unsigned long __fls(unsigned long word) static inline int ffs(int x) { int r; -#ifdef CONFIG_X86_CMOV + +#ifdef CONFIG_X86_64 + /* + * AMD64 says BSFL won't clobber the dest reg if x==0; Intel64 says the + * dest reg is undefined if x==0, but their CPU architect says its + * value is written to set it to the same as before, except that the + * top 32 bits will be cleared. + * + * We cannot do this on 32 bits because at the very least some + * 486 CPUs did not behave this way. + */ + long tmp = -1; + asm("bsfl %1,%0" + : "=r" (r) + : "rm" (x), "0" (tmp)); +#elif defined(CONFIG_X86_CMOV) asm("bsfl %1,%0\n\t" "cmovzl %2,%0" - : "=r" (r) : "rm" (x), "r" (-1)); + : "=&r" (r) : "rm" (x), "r" (-1)); #else asm("bsfl %1,%0\n\t" "jnz 1f\n\t" @@ -422,7 +439,22 @@ static inline int ffs(int x) static inline int fls(int x) { int r; -#ifdef CONFIG_X86_CMOV + +#ifdef CONFIG_X86_64 + /* + * AMD64 says BSRL won't clobber the dest reg if x==0; Intel64 says the + * dest reg is undefined if x==0, but their CPU architect says its + * value is written to set it to the same as before, except that the + * top 32 bits will be cleared. + * + * We cannot do this on 32 bits because at the very least some + * 486 CPUs did not behave this way. + */ + long tmp = -1; + asm("bsrl %1,%0" + : "=r" (r) + : "rm" (x), "0" (tmp)); +#elif defined(CONFIG_X86_CMOV) asm("bsrl %1,%0\n\t" "cmovzl %2,%0" : "=&r" (r) : "rm" (x), "rm" (-1)); @@ -434,11 +466,35 @@ static inline int fls(int x) #endif return r + 1; } -#endif /* __KERNEL__ */ - -#undef ADDR -#ifdef __KERNEL__ +/** + * fls64 - find last set bit in a 64-bit word + * @x: the word to search + * + * This is defined in a similar way as the libc and compiler builtin + * ffsll, but returns the position of the most significant set bit. + * + * fls64(value) returns 0 if value is 0 or the position of the last + * set bit if value is nonzero. The last (most significant) bit is + * at position 64. + */ +#ifdef CONFIG_X86_64 +static __always_inline int fls64(__u64 x) +{ + long bitpos = -1; + /* + * AMD64 says BSRQ won't clobber the dest reg if x==0; Intel64 says the + * dest reg is undefined if x==0, but their CPU architect says its + * value is written to set it to the same as before. + */ + asm("bsrq %1,%0" + : "+r" (bitpos) + : "rm" (x)); + return bitpos + 1; +} +#else +#include <asm-generic/bitops/fls64.h> +#endif #include <asm-generic/bitops/find.h> @@ -450,12 +506,6 @@ static inline int fls(int x) #include <asm-generic/bitops/const_hweight.h> -#endif /* __KERNEL__ */ - -#include <asm-generic/bitops/fls64.h> - -#ifdef __KERNEL__ - #include <asm-generic/bitops/le.h> #include <asm-generic/bitops/ext2-atomic-setbit.h> diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 5d3acdf..0c9fa27 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -14,6 +14,8 @@ extern void __cmpxchg_wrong_size(void) __compiletime_error("Bad argument size for cmpxchg"); extern void __xadd_wrong_size(void) __compiletime_error("Bad argument size for xadd"); +extern void __add_wrong_size(void) + __compiletime_error("Bad argument size for add"); /* * Constants for operation sizes. On 32-bit, the 64-bit size it set to @@ -31,60 +33,47 @@ extern void __xadd_wrong_size(void) #define __X86_CASE_Q -1 /* sizeof will never return -1 */ #endif +/* + * An exchange-type operation, which takes a value and a pointer, and + * returns a the old value. + */ +#define __xchg_op(ptr, arg, op, lock) \ + ({ \ + __typeof__ (*(ptr)) __ret = (arg); \ + switch (sizeof(*(ptr))) { \ + case __X86_CASE_B: \ + asm volatile (lock #op "b %b0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_W: \ + asm volatile (lock #op "w %w0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_L: \ + asm volatile (lock #op "l %0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_Q: \ + asm volatile (lock #op "q %q0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + default: \ + __ ## op ## _wrong_size(); \ + } \ + __ret; \ + }) + /* * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. * Since this is generally used to protect other memory information, we * use "asm volatile" and "memory" clobbers to prevent gcc from moving * information around. */ -#define __xchg(x, ptr, size) \ -({ \ - __typeof(*(ptr)) __x = (x); \ - switch (size) { \ - case __X86_CASE_B: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile("xchgb %0,%1" \ - : "=q" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case __X86_CASE_W: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile("xchgw %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case __X86_CASE_L: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case __X86_CASE_Q: \ - { \ - volatile u64 *__ptr = (volatile u64 *)(ptr); \ - asm volatile("xchgq %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - default: \ - __xchg_wrong_size(); \ - } \ - __x; \ -}) - -#define xchg(ptr, v) \ - __xchg((v), (ptr), sizeof(*ptr)) +#define xchg(ptr, v) __xchg_op((ptr), (v), xchg, "") /* * Atomic compare and exchange. Compare OLD with MEM, if identical, @@ -165,46 +154,80 @@ extern void __xadd_wrong_size(void) __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) #endif -#define __xadd(ptr, inc, lock) \ +/* + * xadd() adds "inc" to "*ptr" and atomically returns the previous + * value of "*ptr". + * + * xadd() is locked when multiple CPUs are online + * xadd_sync() is always locked + * xadd_local() is never locked + */ +#define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) +#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) +#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") +#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") + +#define __add(ptr, inc, lock) \ ({ \ __typeof__ (*(ptr)) __ret = (inc); \ switch (sizeof(*(ptr))) { \ case __X86_CASE_B: \ - asm volatile (lock "xaddb %b0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ + asm volatile (lock "addb %b1, %0\n" \ + : "+m" (*(ptr)) : "ri" (inc) \ + : "memory", "cc"); \ break; \ case __X86_CASE_W: \ - asm volatile (lock "xaddw %w0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ + asm volatile (lock "addw %w1, %0\n" \ + : "+m" (*(ptr)) : "ri" (inc) \ + : "memory", "cc"); \ break; \ case __X86_CASE_L: \ - asm volatile (lock "xaddl %0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ + asm volatile (lock "addl %1, %0\n" \ + : "+m" (*(ptr)) : "ri" (inc) \ + : "memory", "cc"); \ break; \ case __X86_CASE_Q: \ - asm volatile (lock "xaddq %q0, %1\n" \ - : "+r" (__ret), "+m" (*(ptr)) \ - : : "memory", "cc"); \ + asm volatile (lock "addq %1, %0\n" \ + : "+m" (*(ptr)) : "ri" (inc) \ + : "memory", "cc"); \ break; \ default: \ - __xadd_wrong_size(); \ + __add_wrong_size(); \ } \ __ret; \ }) /* - * xadd() adds "inc" to "*ptr" and atomically returns the previous - * value of "*ptr". + * add_*() adds "inc" to "*ptr" * - * xadd() is locked when multiple CPUs are online - * xadd_sync() is always locked - * xadd_local() is never locked + * __add() takes a lock prefix + * add_smp() is locked when multiple CPUs are online + * add_sync() is always locked */ -#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) -#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") -#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") +#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX) +#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ") + +#define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \ +({ \ + bool __ret; \ + __typeof__(*(p1)) __old1 = (o1), __new1 = (n1); \ + __typeof__(*(p2)) __old2 = (o2), __new2 = (n2); \ + BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long)); \ + BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long)); \ + VM_BUG_ON((unsigned long)(p1) % (2 * sizeof(long))); \ + VM_BUG_ON((unsigned long)((p1) + 1) != (unsigned long)(p2)); \ + asm volatile(pfx "cmpxchg%c4b %2; sete %0" \ + : "=a" (__ret), "+d" (__old2), \ + "+m" (*(p1)), "+m" (*(p2)) \ + : "i" (2 * sizeof(long)), "a" (__old1), \ + "b" (__new1), "c" (__new2)); \ + __ret; \ +}) + +#define cmpxchg_double(p1, p2, o1, o2, n1, n2) \ + __cmpxchg_double(LOCK_PREFIX, p1, p2, o1, o2, n1, n2) + +#define cmpxchg_double_local(p1, p2, o1, o2, n1, n2) \ + __cmpxchg_double(, p1, p2, o1, o2, n1, n2) #endif /* ASM_X86_CMPXCHG_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index fbebb07..53f4b21 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -166,52 +166,6 @@ static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, #endif -#define cmpxchg8b(ptr, o1, o2, n1, n2) \ -({ \ - char __ret; \ - __typeof__(o2) __dummy; \ - __typeof__(*(ptr)) __old1 = (o1); \ - __typeof__(o2) __old2 = (o2); \ - __typeof__(*(ptr)) __new1 = (n1); \ - __typeof__(o2) __new2 = (n2); \ - asm volatile(LOCK_PREFIX "cmpxchg8b %2; setz %1" \ - : "=d"(__dummy), "=a" (__ret), "+m" (*ptr)\ - : "a" (__old1), "d"(__old2), \ - "b" (__new1), "c" (__new2) \ - : "memory"); \ - __ret; }) - - -#define cmpxchg8b_local(ptr, o1, o2, n1, n2) \ -({ \ - char __ret; \ - __typeof__(o2) __dummy; \ - __typeof__(*(ptr)) __old1 = (o1); \ - __typeof__(o2) __old2 = (o2); \ - __typeof__(*(ptr)) __new1 = (n1); \ - __typeof__(o2) __new2 = (n2); \ - asm volatile("cmpxchg8b %2; setz %1" \ - : "=d"(__dummy), "=a"(__ret), "+m" (*ptr)\ - : "a" (__old), "d"(__old2), \ - "b" (__new1), "c" (__new2), \ - : "memory"); \ - __ret; }) - - -#define cmpxchg_double(ptr, o1, o2, n1, n2) \ -({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ - VM_BUG_ON((unsigned long)(ptr) % 8); \ - cmpxchg8b((ptr), (o1), (o2), (n1), (n2)); \ -}) - -#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \ -({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 4); \ - VM_BUG_ON((unsigned long)(ptr) % 8); \ - cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \ -}) - #define system_has_cmpxchg_double() cpu_has_cx8 #endif /* _ASM_X86_CMPXCHG_32_H */ diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 285da02..614be87 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -20,49 +20,6 @@ static inline void set_64bit(volatile u64 *ptr, u64 val) cmpxchg_local((ptr), (o), (n)); \ }) -#define cmpxchg16b(ptr, o1, o2, n1, n2) \ -({ \ - char __ret; \ - __typeof__(o2) __junk; \ - __typeof__(*(ptr)) __old1 = (o1); \ - __typeof__(o2) __old2 = (o2); \ - __typeof__(*(ptr)) __new1 = (n1); \ - __typeof__(o2) __new2 = (n2); \ - asm volatile(LOCK_PREFIX "cmpxchg16b %2;setz %1" \ - : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ - : "b"(__new1), "c"(__new2), \ - "a"(__old1), "d"(__old2)); \ - __ret; }) - - -#define cmpxchg16b_local(ptr, o1, o2, n1, n2) \ -({ \ - char __ret; \ - __typeof__(o2) __junk; \ - __typeof__(*(ptr)) __old1 = (o1); \ - __typeof__(o2) __old2 = (o2); \ - __typeof__(*(ptr)) __new1 = (n1); \ - __typeof__(o2) __new2 = (n2); \ - asm volatile("cmpxchg16b %2;setz %1" \ - : "=d"(__junk), "=a"(__ret), "+m" (*ptr) \ - : "b"(__new1), "c"(__new2), \ - "a"(__old1), "d"(__old2)); \ - __ret; }) - -#define cmpxchg_double(ptr, o1, o2, n1, n2) \ -({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - VM_BUG_ON((unsigned long)(ptr) % 16); \ - cmpxchg16b((ptr), (o1), (o2), (n1), (n2)); \ -}) - -#define cmpxchg_double_local(ptr, o1, o2, n1, n2) \ -({ \ - BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ - VM_BUG_ON((unsigned long)(ptr) % 16); \ - cmpxchg16b_local((ptr), (o1), (o2), (n1), (n2)); \ -}) - #define system_has_cmpxchg_double() cpu_has_cx16 #endif /* _ASM_X86_CMPXCHG_64_H */ diff --git a/arch/x86/include/asm/div64.h b/arch/x86/include/asm/div64.h index 9a2d644..ced283a 100644 --- a/arch/x86/include/asm/div64.h +++ b/arch/x86/include/asm/div64.h @@ -4,6 +4,7 @@ #ifdef CONFIG_X86_32 #include <linux/types.h> +#include <linux/log2.h> /* * do_div() is NOT a C function. It wants to return @@ -21,15 +22,20 @@ ({ \ unsigned long __upper, __low, __high, __mod, __base; \ __base = (base); \ - asm("":"=a" (__low), "=d" (__high) : "A" (n)); \ - __upper = __high; \ - if (__high) { \ - __upper = __high % (__base); \ - __high = __high / (__base); \ + if (__builtin_constant_p(__base) && is_power_of_2(__base)) { \ + __mod = n & (__base - 1); \ + n >>= ilog2(__base); \ + } else { \ + asm("" : "=a" (__low), "=d" (__high) : "A" (n));\ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ + } \ + asm("divl %2" : "=a" (__low), "=d" (__mod) \ + : "rm" (__base), "0" (__low), "1" (__upper)); \ + asm("" : "=A" (n) : "a" (__low), "d" (__high)); \ } \ - asm("divl %2":"=a" (__low), "=d" (__mod) \ - : "rm" (__base), "0" (__low), "1" (__upper)); \ - asm("":"=A" (n) : "a" (__low), "d" (__high)); \ __mod; \ }) diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h index 908b969..3778256 100644 --- a/arch/x86/include/asm/e820.h +++ b/arch/x86/include/asm/e820.h @@ -117,7 +117,7 @@ static inline void early_memtest(unsigned long start, unsigned long end) extern unsigned long e820_end_of_ram_pfn(void); extern unsigned long e820_end_of_low_ram_pfn(void); -extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); +extern u64 early_reserve_e820(u64 sizet, u64 align); void memblock_x86_fill(void); void memblock_find_dma_reserve(void); diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 55e4de6..da0b3ca 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -11,6 +11,7 @@ typedef struct { #ifdef CONFIG_X86_LOCAL_APIC unsigned int apic_timer_irqs; /* arch dependent */ unsigned int irq_spurious_count; + unsigned int icr_read_retry_count; #endif unsigned int x86_platform_ipis; /* arch dependent */ unsigned int apic_perf_irqs; diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index c9e09ea..6919e93 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -218,7 +218,7 @@ static inline void fpu_fxsave(struct fpu *fpu) #ifdef CONFIG_SMP #define safe_address (__per_cpu_offset[0]) #else -#define safe_address (kstat_cpu(0).cpustat.user) +#define safe_address (__get_cpu_var(kernel_cpustat).cpustat[CPUTIME_USER]) #endif /* diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 88c765e..74df3f1 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -137,6 +137,13 @@ static inline int insn_is_avx(struct insn *insn) return (insn->vex_prefix.value != 0); } +/* Ensure this instruction is decoded completely */ +static inline int insn_complete(struct insn *insn) +{ + return insn->opcode.got && insn->modrm.got && insn->sib.got && + insn->displacement.got && insn->immediate.got; +} + static inline insn_byte_t insn_vex_m_bits(struct insn *insn) { if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */ diff --git a/arch/x86/include/asm/mach_timer.h b/arch/x86/include/asm/mach_timer.h index 8537285..88d0c3c 100644 --- a/arch/x86/include/asm/mach_timer.h +++ b/arch/x86/include/asm/mach_timer.h @@ -15,7 +15,7 @@ #define CALIBRATE_TIME_MSEC 30 /* 30 msecs */ #define CALIBRATE_LATCH \ - ((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) + ((PIT_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000) static inline void mach_prepare_counter(void) { diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 01fdf56..0e8e85b 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -81,8 +81,8 @@ static inline unsigned char current_lock_cmos_reg(void) #else #define lock_cmos_prefix(reg) do {} while (0) #define lock_cmos_suffix(reg) do {} while (0) -#define lock_cmos(reg) -#define unlock_cmos() +#define lock_cmos(reg) do { } while (0) +#define unlock_cmos() do { } while (0) #define do_i_have_lock_cmos() 0 #define current_lock_cmos_reg() 0 #endif diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 0e8ae57..6add827 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -50,10 +50,11 @@ #define MCJ_CTX_MASK 3 #define MCJ_CTX(flags) ((flags) & MCJ_CTX_MASK) #define MCJ_CTX_RANDOM 0 /* inject context: random */ -#define MCJ_CTX_PROCESS 1 /* inject context: process */ -#define MCJ_CTX_IRQ 2 /* inject context: IRQ */ -#define MCJ_NMI_BROADCAST 4 /* do NMI broadcasting */ -#define MCJ_EXCEPTION 8 /* raise as exception */ +#define MCJ_CTX_PROCESS 0x1 /* inject context: process */ +#define MCJ_CTX_IRQ 0x2 /* inject context: IRQ */ +#define MCJ_NMI_BROADCAST 0x4 /* do NMI broadcasting */ +#define MCJ_EXCEPTION 0x8 /* raise as exception */ +#define MCJ_IRQ_BRAODCAST 0x10 /* do IRQ broadcasting */ /* Fields are zero when not available */ struct mce { @@ -120,7 +121,8 @@ struct mce_log { #ifdef __KERNEL__ -extern struct atomic_notifier_head x86_mce_decoder_chain; +extern void mce_register_decode_chain(struct notifier_block *nb); +extern void mce_unregister_decode_chain(struct notifier_block *nb); #include <linux/percpu.h> #include <linux/init.h> diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h deleted file mode 100644 index 0cd3800..0000000 --- a/arch/x86/include/asm/memblock.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef _X86_MEMBLOCK_H -#define _X86_MEMBLOCK_H - -#define ARCH_DISCARD_MEMBLOCK - -u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align); - -void memblock_x86_reserve_range(u64 start, u64 end, char *name); -void memblock_x86_free_range(u64 start, u64 end); -struct range; -int __get_free_all_memory_range(struct range **range, int nodeid, - unsigned long start_pfn, unsigned long end_pfn); -int get_free_all_memory_range(struct range **rangep, int nodeid); - -void memblock_x86_register_active_regions(int nid, unsigned long start_pfn, - unsigned long last_pfn); -u64 memblock_x86_hole_size(u64 start, u64 end); -u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align); -u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit); -u64 memblock_x86_memory_in_range(u64 addr, u64 limit); -bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align); - -#endif diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index 2421507..4ebe157 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -48,6 +48,7 @@ static inline struct microcode_ops * __init init_intel_microcode(void) #ifdef CONFIG_MICROCODE_AMD extern struct microcode_ops * __init init_amd_microcode(void); +extern void __exit exit_amd_microcode(void); static inline void get_ucode_data(void *to, const u8 *from, size_t n) { @@ -59,6 +60,7 @@ static inline struct microcode_ops * __init init_amd_microcode(void) { return NULL; } +static inline void __exit exit_amd_microcode(void) {} #endif #endif /* _ASM_X86_MICROCODE_H */ diff --git a/arch/x86/include/asm/numachip/numachip_csr.h b/arch/x86/include/asm/numachip/numachip_csr.h new file mode 100644 index 0000000..660f843 --- /dev/null +++ b/arch/x86/include/asm/numachip/numachip_csr.h @@ -0,0 +1,167 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific Header file + * + * Copyright (C) 2011 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + */ + +#ifndef _ASM_X86_NUMACHIP_NUMACHIP_CSR_H +#define _ASM_X86_NUMACHIP_NUMACHIP_CSR_H + +#include <linux/numa.h> +#include <linux/percpu.h> +#include <linux/io.h> +#include <linux/swab.h> +#include <asm/types.h> +#include <asm/processor.h> + +#define CSR_NODE_SHIFT 16 +#define CSR_NODE_BITS(p) (((unsigned long)(p)) << CSR_NODE_SHIFT) +#define CSR_NODE_MASK 0x0fff /* 4K nodes */ + +/* 32K CSR space, b15 indicates geo/non-geo */ +#define CSR_OFFSET_MASK 0x7fffUL + +/* Global CSR space covers all 4K possible nodes with 64K CSR space per node */ +#define NUMACHIP_GCSR_BASE 0x3fff00000000ULL +#define NUMACHIP_GCSR_LIM 0x3fff0fffffffULL +#define NUMACHIP_GCSR_SIZE (NUMACHIP_GCSR_LIM - NUMACHIP_GCSR_BASE + 1) + +/* + * Local CSR space starts in global CSR space with "nodeid" = 0xfff0, however + * when using the direct mapping on x86_64, both start and size needs to be + * aligned with PMD_SIZE which is 2M + */ +#define NUMACHIP_LCSR_BASE 0x3ffffe000000ULL +#define NUMACHIP_LCSR_LIM 0x3fffffffffffULL +#define NUMACHIP_LCSR_SIZE (NUMACHIP_LCSR_LIM - NUMACHIP_LCSR_BASE + 1) + +static inline void *gcsr_address(int node, unsigned long offset) +{ + return __va(NUMACHIP_GCSR_BASE | (1UL << 15) | + CSR_NODE_BITS(node & CSR_NODE_MASK) | (offset & CSR_OFFSET_MASK)); +} + +static inline void *lcsr_address(unsigned long offset) +{ + return __va(NUMACHIP_LCSR_BASE | (1UL << 15) | + CSR_NODE_BITS(0xfff0) | (offset & CSR_OFFSET_MASK)); +} + +static inline unsigned int read_gcsr(int node, unsigned long offset) +{ + return swab32(readl(gcsr_address(node, offset))); +} + +static inline void write_gcsr(int node, unsigned long offset, unsigned int val) +{ + writel(swab32(val), gcsr_address(node, offset)); +} + +static inline unsigned int read_lcsr(unsigned long offset) +{ + return swab32(readl(lcsr_address(offset))); +} + +static inline void write_lcsr(unsigned long offset, unsigned int val) +{ + writel(swab32(val), lcsr_address(offset)); +} + +/* ========================================================================= */ +/* CSR_G0_STATE_CLEAR */ +/* ========================================================================= */ + +#define CSR_G0_STATE_CLEAR (0x000 + (0 << 12)) +union numachip_csr_g0_state_clear { + unsigned int v; + struct numachip_csr_g0_state_clear_s { + unsigned int _state:2; + unsigned int _rsvd_2_6:5; + unsigned int _lost:1; + unsigned int _rsvd_8_31:24; + } s; +}; + +/* ========================================================================= */ +/* CSR_G0_NODE_IDS */ +/* ========================================================================= */ + +#define CSR_G0_NODE_IDS (0x008 + (0 << 12)) +union numachip_csr_g0_node_ids { + unsigned int v; + struct numachip_csr_g0_node_ids_s { + unsigned int _initialid:16; + unsigned int _nodeid:12; + unsigned int _rsvd_28_31:4; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_EXT_IRQ_GEN */ +/* ========================================================================= */ + +#define CSR_G3_EXT_IRQ_GEN (0x030 + (3 << 12)) +union numachip_csr_g3_ext_irq_gen { + unsigned int v; + struct numachip_csr_g3_ext_irq_gen_s { + unsigned int _vector:8; + unsigned int _msgtype:3; + unsigned int _index:5; + unsigned int _destination_apic_id:16; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_EXT_IRQ_STATUS */ +/* ========================================================================= */ + +#define CSR_G3_EXT_IRQ_STATUS (0x034 + (3 << 12)) +union numachip_csr_g3_ext_irq_status { + unsigned int v; + struct numachip_csr_g3_ext_irq_status_s { + unsigned int _result:32; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_EXT_IRQ_DEST */ +/* ========================================================================= */ + +#define CSR_G3_EXT_IRQ_DEST (0x038 + (3 << 12)) +union numachip_csr_g3_ext_irq_dest { + unsigned int v; + struct numachip_csr_g3_ext_irq_dest_s { + unsigned int _irq:8; + unsigned int _rsvd_8_31:24; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_NC_ATT_MAP_SELECT */ +/* ========================================================================= */ + +#define CSR_G3_NC_ATT_MAP_SELECT (0x7fc + (3 << 12)) +union numachip_csr_g3_nc_att_map_select { + unsigned int v; + struct numachip_csr_g3_nc_att_map_select_s { + unsigned int _upper_address_bits:4; + unsigned int _select_ram:4; + unsigned int _rsvd_8_31:24; + } s; +}; + +/* ========================================================================= */ +/* CSR_G3_NC_ATT_MAP_SELECT_0-255 */ +/* ========================================================================= */ + +#define CSR_G3_NC_ATT_MAP_SELECT_0 (0x800 + (3 << 12)) + +#endif /* _ASM_X86_NUMACHIP_NUMACHIP_CSR_H */ + diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 3470c9d..529bf07e 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -451,23 +451,20 @@ do { \ #endif /* !CONFIG_M386 */ #ifdef CONFIG_X86_CMPXCHG64 -#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \ +#define percpu_cmpxchg8b_double(pcp1, pcp2, o1, o2, n1, n2) \ ({ \ - char __ret; \ - typeof(o1) __o1 = o1; \ - typeof(o1) __n1 = n1; \ - typeof(o2) __o2 = o2; \ - typeof(o2) __n2 = n2; \ - typeof(o2) __dummy = n2; \ + bool __ret; \ + typeof(pcp1) __o1 = (o1), __n1 = (n1); \ + typeof(pcp2) __o2 = (o2), __n2 = (n2); \ asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \ - : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \ - : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \ + : "=a" (__ret), "+m" (pcp1), "+m" (pcp2), "+d" (__o2) \ + : "b" (__n1), "c" (__n2), "a" (__o1)); \ __ret; \ }) -#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) -#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) -#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) +#define __this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define this_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double +#define irqsafe_cpu_cmpxchg_double_4 percpu_cmpxchg8b_double #endif /* CONFIG_X86_CMPXCHG64 */ /* @@ -508,31 +505,23 @@ do { \ * it in software. The address used in the cmpxchg16 instruction must be * aligned to a 16 byte boundary. */ -#ifdef CONFIG_SMP -#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3 -#else -#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2 -#endif -#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ +#define percpu_cmpxchg16b_double(pcp1, pcp2, o1, o2, n1, n2) \ ({ \ - char __ret; \ - typeof(o1) __o1 = o1; \ - typeof(o1) __n1 = n1; \ - typeof(o2) __o2 = o2; \ - typeof(o2) __n2 = n2; \ - typeof(o2) __dummy; \ - alternative_io(CMPXCHG16B_EMU_CALL, \ - "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ + bool __ret; \ + typeof(pcp1) __o1 = (o1), __n1 = (n1); \ + typeof(pcp2) __o2 = (o2), __n2 = (n2); \ + alternative_io("leaq %P1,%%rsi\n\tcall this_cpu_cmpxchg16b_emu\n\t", \ + "cmpxchg16b " __percpu_arg(1) "\n\tsetz %0\n\t", \ X86_FEATURE_CX16, \ - ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ - "S" (&pcp1), "b"(__n1), "c"(__n2), \ - "a"(__o1), "d"(__o2) : "memory"); \ + ASM_OUTPUT2("=a" (__ret), "+m" (pcp1), \ + "+m" (pcp2), "+d" (__o2)), \ + "b" (__n1), "c" (__n2), "a" (__o1) : "rsi"); \ __ret; \ }) -#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) -#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) -#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) +#define __this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define this_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double +#define irqsafe_cpu_cmpxchg_double_8 percpu_cmpxchg16b_double #endif diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index f61c62f..096c975 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -57,6 +57,7 @@ (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +#define ARCH_PERFMON_EVENTS_COUNT 7 /* * Intel "Architectural Performance Monitoring" CPUID @@ -72,6 +73,19 @@ union cpuid10_eax { unsigned int full; }; +union cpuid10_ebx { + struct { + unsigned int no_unhalted_core_cycles:1; + unsigned int no_instructions_retired:1; + unsigned int no_unhalted_reference_cycles:1; + unsigned int no_llc_reference:1; + unsigned int no_llc_misses:1; + unsigned int no_branch_instruction_retired:1; + unsigned int no_branch_misses_retired:1; + } split; + unsigned int full; +}; + union cpuid10_edx { struct { unsigned int num_counters_fixed:5; @@ -81,6 +95,15 @@ union cpuid10_edx { unsigned int full; }; +struct x86_pmu_capability { + int version; + int num_counters_gp; + int num_counters_fixed; + int bit_width_gp; + int bit_width_fixed; + unsigned int events_mask; + int events_mask_len; +}; /* * Fixed-purpose performance events: @@ -89,23 +112,24 @@ union cpuid10_edx { /* * All 3 fixed-mode PMCs are configured via this single MSR: */ -#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d /* * The counts are available in three separate MSRs: */ /* Instr_Retired.Any: */ -#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 -#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ -#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a -#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ -#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b -#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_REF_CYCLES (X86_PMC_IDX_FIXED + 2) +#define X86_PMC_MSK_FIXED_REF_CYCLES (1ULL << X86_PMC_IDX_FIXED_REF_CYCLES) /* * We model BTS tracing as another fixed-mode PMC. @@ -202,6 +226,7 @@ struct perf_guest_switch_msr { }; extern struct perf_guest_switch_msr *perf_guest_get_msrs(int *nr); +extern void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap); #else static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) { @@ -209,6 +234,11 @@ static inline perf_guest_switch_msr *perf_guest_get_msrs(int *nr) return NULL; } +static inline void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + memset(cap, 0, sizeof(*cap)); +} + static inline void perf_events_lapic_init(void) { } #endif diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 18601c8..49afb3f 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -703,7 +703,7 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, pte_update(mm, addr, ptep); } -#define flush_tlb_fix_spurious_fault(vma, address) +#define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h index 2dddb31..f8ab3ea 100644 --- a/arch/x86/include/asm/processor-flags.h +++ b/arch/x86/include/asm/processor-flags.h @@ -6,6 +6,7 @@ * EFLAGS bits */ #define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +#define X86_EFLAGS_BIT1 0x00000002 /* Bit 1 - always on */ #define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ #define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */ #define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b650435..aa9088c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -99,7 +99,6 @@ struct cpuinfo_x86 { u16 apicid; u16 initial_apicid; u16 x86_clflush_size; -#ifdef CONFIG_SMP /* number of cores as seen by the OS: */ u16 booted_cores; /* Physical processor id: */ @@ -110,7 +109,6 @@ struct cpuinfo_x86 { u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; -#endif u32 microcode; } __attribute__((__aligned__(SMP_CACHE_BYTES))); diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 972c260..a82c2bf 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -79,23 +79,10 @@ static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } -#if (NR_CPUS < 256) static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { - asm volatile(UNLOCK_LOCK_PREFIX "incb %0" - : "+m" (lock->head_tail) - : - : "memory", "cc"); + __add(&lock->tickets.head, 1, UNLOCK_LOCK_PREFIX); } -#else -static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) -{ - asm volatile(UNLOCK_LOCK_PREFIX "incw %0" - : "+m" (lock->head_tail) - : - : "memory", "cc"); -} -#endif static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a1fe5c1..185b719 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -40,7 +40,8 @@ struct thread_info { */ __u8 supervisor_stack[0]; #endif - int uaccess_err; + int sig_on_uaccess_error:1; + int uaccess_err:1; /* uaccess failed */ }; #define INIT_THREAD_INFO(tsk) \ @@ -231,6 +232,12 @@ static inline struct thread_info *current_thread_info(void) movq PER_CPU_VAR(kernel_stack),reg ; \ subq $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg +/* + * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in + * a certain register (to be used in assembler memory operands). + */ +#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) + #endif #endif /* !X86_32 */ diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index c006924..800f77c 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -130,10 +130,8 @@ extern void setup_node_to_cpumask_map(void); .balance_interval = 1, \ } -#ifdef CONFIG_X86_64 extern int __node_distance(int, int); #define node_distance(a, b) __node_distance(a, b) -#endif #else /* !CONFIG_NUMA */ diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 83e2efd..15d9915 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,6 +51,8 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); +extern int tsc_clocksource_reliable; + /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 36361bf..8be5f54 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -462,7 +462,7 @@ struct __large_struct { unsigned long buf[100]; }; barrier(); #define uaccess_catch(err) \ - (err) |= current_thread_info()->uaccess_err; \ + (err) |= (current_thread_info()->uaccess_err ? -EFAULT : 0); \ current_thread_info()->uaccess_err = prev_err; \ } while (0) diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 1971e65..1ac860a 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -7,6 +7,7 @@ struct mpc_bus; struct mpc_cpu; struct mpc_table; +struct cpuinfo_x86; /** * struct x86_init_mpparse - platform specific mpparse ops @@ -147,6 +148,7 @@ struct x86_init_ops { */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); + void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; /** @@ -186,5 +188,6 @@ extern struct x86_msi_ops x86_msi; extern void x86_init_noop(void); extern void x86_init_uint_noop(unsigned int unused); +extern void x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node); #endif diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4558f0d..ce664f3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -219,6 +219,8 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; + int apic_id; + u8 enabled; processor = (struct acpi_madt_local_x2apic *)header; @@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); + apic_id = processor->local_apic_id; + enabled = processor->lapic_flags & ACPI_MADT_ENABLED; #ifdef CONFIG_X86_X2APIC /* * We need to register disabled CPU as well to permit @@ -235,8 +239,10 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - acpi_register_lapic(processor->local_apic_id, /* APIC ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) + printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); + else + acpi_register_lapic(apic_id, enabled); #else printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); #endif diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4c39baa..013c181 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -123,16 +123,14 @@ int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; unsigned int mask; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return 0; pci_read_config_dword(link, 0x1d4, &mask); -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif return (mask >> (4 * cuid)) & 0xf; } @@ -141,7 +139,7 @@ int amd_set_subcaches(int cpu, int mask) static unsigned int reset, ban; struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); unsigned int reg; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) return -EINVAL; @@ -159,9 +157,7 @@ int amd_set_subcaches(int cpu, int mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3d2661c..6e76c19 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -88,13 +88,13 @@ static u32 __init allocate_aperture(void) */ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); - if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { + if (!addr || addr + aper_size > GART_MAX_ADDR) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); return 0; } - memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); + memblock_reserve(addr, aper_size); /* * Kmemleak should not scan this block as it may not be mapped via the * kernel direct mapping. diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 767fd04..0ae0323 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here +obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f98d84c..2eec05b 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -146,16 +146,26 @@ __setup("apicpmtimer", setup_apicpmtimer); int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* x2apic enabled before OS handover */ -static int x2apic_preenabled; +int x2apic_preenabled; +static int x2apic_disabled; +static int nox2apic; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { - pr_warning("Bios already enabled x2apic, " - "can't enforce nox2apic"); - return 0; - } + int apicid = native_apic_msr_read(APIC_ID); + + if (apicid >= 255) { + pr_warning("Apicid: %08x, cannot enforce nox2apic\n", + apicid); + return 0; + } + + pr_warning("x2apic already enabled. will disable it\n"); + } else + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + + nox2apic = 1; - setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } early_param("nox2apic", setup_nox2apic); @@ -250,6 +260,7 @@ u32 native_safe_apic_wait_icr_idle(void) send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; if (!send_status) break; + inc_irq_stat(icr_read_retry_count); udelay(100); } while (timeout++ < 1000); @@ -876,8 +887,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. */ - exit_idle(); irq_enter(); + exit_idle(); local_apic_timer_interrupt(); irq_exit(); @@ -1431,6 +1442,45 @@ void __init bsp_end_local_APIC_setup(void) } #ifdef CONFIG_X86_X2APIC +/* + * Need to disable xapic and x2apic at the same time and then enable xapic mode + */ +static inline void __disable_x2apic(u64 msr) +{ + wrmsrl(MSR_IA32_APICBASE, + msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); +} + +static __init void disable_x2apic(void) +{ + u64 msr; + + if (!cpu_has_x2apic) + return; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (msr & X2APIC_ENABLE) { + u32 x2apic_id = read_apic_id(); + + if (x2apic_id >= 255) + panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + + pr_info("Disabling x2apic\n"); + __disable_x2apic(msr); + + if (nox2apic) { + clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + } + + x2apic_disabled = 1; + x2apic_mode = 0; + + register_lapic_address(mp_lapic_addr); + } +} + void check_x2apic(void) { if (x2apic_enabled()) { @@ -1441,15 +1491,20 @@ void check_x2apic(void) void enable_x2apic(void) { - int msr, msr2; + u64 msr; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (x2apic_disabled) { + __disable_x2apic(msr); + return; + } if (!x2apic_mode) return; - rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { printk_once(KERN_INFO "Enabling x2apic\n"); - wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); + wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); } } #endif /* CONFIG_X86_X2APIC */ @@ -1486,25 +1541,34 @@ void __init enable_IR_x2apic(void) ret = save_ioapic_entries(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto out; + return; } local_irq_save(flags); legacy_pic->mask_all(); mask_ioapic_entries(); + if (x2apic_preenabled && nox2apic) + disable_x2apic(); + if (dmar_table_init_ret) ret = -1; else ret = enable_IR(); + if (!x2apic_supported()) + goto skip_x2apic; + if (ret < 0) { /* IR is required if there is APIC ID > 255 even when running * under KVM */ if (max_physical_apicid > 255 || - !hypervisor_x2apic_available()) - goto nox2apic; + !hypervisor_x2apic_available()) { + if (x2apic_preenabled) + disable_x2apic(); + goto skip_x2apic; + } /* * without IR all CPUs can be addressed by IOAPIC/MSI * only in physical mode @@ -1512,8 +1576,10 @@ void __init enable_IR_x2apic(void) x2apic_force_phys(); } - if (ret == IRQ_REMAP_XAPIC_MODE) - goto nox2apic; + if (ret == IRQ_REMAP_XAPIC_MODE) { + pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); + goto skip_x2apic; + } x2apic_enabled = 1; @@ -1523,22 +1589,11 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -nox2apic: +skip_x2apic: if (ret < 0) /* IR enabling failed */ restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); - -out: - if (x2apic_enabled || !x2apic_supported()) - return; - - if (x2apic_preenabled) - panic("x2apic: enabled by BIOS but kernel init failed."); - else if (ret == IRQ_REMAP_XAPIC_MODE) - pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); - else if (ret < 0) - pr_info("x2apic not enabled, IRQ remapping init failed\n"); } #ifdef CONFIG_X86_64 @@ -1809,8 +1864,8 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; - exit_idle(); irq_enter(); + exit_idle(); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1846,8 +1901,8 @@ void smp_error_interrupt(struct pt_regs *regs) "Illegal register address", /* APIC Error Bit 7 */ }; - exit_idle(); irq_enter(); + exit_idle(); /* First tickle the hardware, only then report what went on. -- REW */ v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f7a41e4..8c3cdde 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ -static void flat_init_apic_ldr(void) +void flat_init_apic_ldr(void) { unsigned long val; unsigned long num, id; @@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } +static int flat_probe(void) +{ + return 1; +} + static struct apic apic_flat = { .name = "flat", - .probe = NULL, + .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c new file mode 100644 index 0000000..09d3d8c --- /dev/null +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -0,0 +1,294 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific APIC Code + * + * Copyright (C) 2011 Numascale AS. All rights reserved. + * + * Send feedback to <support@numascale.com> + * + */ + +#include <linux/errno.h> +#include <linux/threads.h> +#include <linux/cpumask.h> +#include <linux/string.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/ctype.h> +#include <linux/init.h> +#include <linux/hardirq.h> +#include <linux/delay.h> + +#include <asm/numachip/numachip_csr.h> +#include <asm/smp.h> +#include <asm/apic.h> +#include <asm/ipi.h> +#include <asm/apic_flat_64.h> + +static int numachip_system __read_mostly; + +static struct apic apic_numachip __read_mostly; + +static unsigned int get_apic_id(unsigned long x) +{ + unsigned long value; + unsigned int id; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = ((id & 0xffU) << 24); + return x; +} + +static unsigned int read_xapic_id(void) +{ + return get_apic_id(apic_read(APIC_ID)); +} + +static int numachip_apic_id_registered(void) +{ + return physid_isset(read_xapic_id(), phys_cpu_present_map); +} + +static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static const struct cpumask *numachip_target_cpus(void) +{ + return cpu_online_mask; +} + +static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + union numachip_csr_g3_ext_irq_gen int_gen; + + int_gen.s._destination_apic_id = phys_apicid; + int_gen.s._vector = 0; + int_gen.s._msgtype = APIC_DM_INIT >> 8; + int_gen.s._index = 0; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + + int_gen.s._msgtype = APIC_DM_STARTUP >> 8; + int_gen.s._vector = start_rip >> 12; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + + atomic_set(&init_deasserted, 1); + return 0; +} + +static void numachip_send_IPI_one(int cpu, int vector) +{ + union numachip_csr_g3_ext_irq_gen int_gen; + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + + int_gen.s._destination_apic_id = apicid; + int_gen.s._vector = vector; + int_gen.s._msgtype = (vector == NMI_VECTOR ? APIC_DM_NMI : APIC_DM_FIXED) >> 8; + int_gen.s._index = 0; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +} + +static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + numachip_send_IPI_one(cpu, vector); +} + +static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_all(int vector) +{ + numachip_send_IPI_mask(cpu_online_mask, vector); +} + +static void numachip_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} + +static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_first(cpumask); + if (likely((unsigned)cpu < nr_cpu_ids)) + return per_cpu(x86_cpu_to_apicid, cpu); + + return BAD_APICID; +} + +static unsigned int +numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + return per_cpu(x86_cpu_to_apicid, cpu); +} + +static int __init numachip_probe(void) +{ + return apic == &apic_numachip; +} + +static void __init map_csrs(void) +{ + printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", + NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + + printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", + NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); + init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); +} + +static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + c->phys_proc_id = node; + per_cpu(cpu_llc_id, smp_processor_id()) = node; +} + +static int __init numachip_system_init(void) +{ + unsigned int val; + + if (!numachip_system) + return 0; + + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + + map_csrs(); + + val = read_lcsr(CSR_G0_NODE_IDS); + printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); + + return 0; +} +early_initcall(numachip_system_init); + +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strncmp(oem_id, "NUMASC", 6)) { + numachip_system = 1; + return 1; + } + + return 0; +} + +static struct apic apic_numachip __refconst = { + + .name = "NumaConnect system", + .probe = numachip_probe, + .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = numachip_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = numachip_vector_allocation_domain, + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + .multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = numachip_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = 0xffU << 24, + + .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and, + + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = NULL, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; +apic_driver(apic_numachip); + diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d939d7..fb07275 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) unsigned vector, me; ack_APIC_irq(); - exit_idle(); irq_enter(); + exit_idle(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -2948,6 +2948,10 @@ static inline void __init check_timer(void) } local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + if (x2apic_preenabled) + apic_printk(APIC_QUIET, KERN_INFO + "Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 452932d..5da1269 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size); void __init setup_bios_corruption_check(void) { - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + phys_addr_t start, end; + u64 i; if (memory_corruption_check == -1) { memory_corruption_check = @@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { + start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + if (start >= end) + continue; - if (addr == MEMBLOCK_ERROR) - break; - - if (addr >= corruption_check_size) - break; - - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - - memblock_x86_reserve_range(addr, addr + size, "SCAN RAM"); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; + memblock_reserve(start, end - start); + scan_areas[num_scan_areas].addr = start; + scan_areas[num_scan_areas].size = end - start; /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); + memset(__va(start), 0, end - start); - addr += size; + if (++num_scan_areas >= MAX_SCAN_AREAS) + break; } if (num_scan_areas) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0bab2b1..f4773f4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) valid_k7: ; -#endif } static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) @@ -353,6 +351,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) if (node == NUMA_NO_NODE) node = per_cpu(cpu_llc_id, cpu); + /* + * If core numbers are inconsistent, it's likely a multi-fabric platform, + * so invoke platform-specific handler + */ + if (c->phys_proc_id != node) + x86_cpuinit.fixup_cpu_id(c, node); + if (!node_online(node)) { /* * Two possibilities here: diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e58d978..159103c 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ - if (c->x86_model >= 6 && c->x86_model <= 9) { + if (c->x86_model >= 6 && c->x86_model <= 13) { rdmsr(MSR_VIA_FCR, lo, hi); lo |= (1<<1 | 1<<7); wrmsr(MSR_VIA_FCR, lo, hi); diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b1..850f296 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_early_init) this_cpu->c_early_init(c); -#ifdef CONFIG_SMP c->cpu_index = 0; -#endif filter_cpuid_features(c, false); setup_smep(c); @@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->apicid = c->initial_apicid; # endif #endif - -#ifdef CONFIG_X86_HT c->phys_proc_id = c->initial_apicid; -#endif } setup_smep(c); @@ -1141,6 +1136,15 @@ static void dbg_restore_debug_regs(void) #endif /* ! CONFIG_KGDB */ /* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} + +/* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1b22dcc..8bacc78 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,5 +1,4 @@ #ifndef ARCH_X86_CPU_H - #define ARCH_X86_CPU_H struct cpu_model_info { @@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); -extern void get_cpu_cap(struct cpuinfo_x86 *c); - -#endif +#endif /* ARCH_X86_CPU_H */ diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5231312..3e6ff6c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void) static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: SMP operation may be unreliable" "with B stepping processors.\n"); } -#endif } static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 319882e..fc4beb3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/string.h> #include <linux/fs.h> +#include <linux/preempt.h> #include <linux/smp.h> #include <linux/notifier.h> #include <linux/kdebug.h> @@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) return NMI_HANDLED; } +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} + /* Inject mce on current CPU */ static int raise_local(void) { @@ -139,9 +152,10 @@ static void raise_mce(struct mce *m) return; #ifdef CONFIG_X86_LOCAL_APIC - if (m->inject_flags & MCJ_NMI_BROADCAST) { + if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; + get_online_cpus(); cpumask_copy(mce_inject_cpumask, cpu_online_mask); cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -151,13 +165,25 @@ static void raise_mce(struct mce *m) MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpumask_empty(mce_inject_cpumask)) - apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BRAODCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR - "Timeout waiting for mce inject NMI %lx\n", + "Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d..cbe82b5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -/* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. - */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); -EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { static DEFINE_PER_CPU(struct work_struct, mce_work); +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -119,9 +118,7 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } @@ -190,6 +187,57 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +static void drain_mcelog_buffer(void) +{ + unsigned int next, i, prev = 0; + + next = rcu_dereference_check_mce(mcelog.next); + + do { + struct mce *m; + + /* drain what was logged during boot */ + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + unsigned retries = 1; + + m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2*retries)) + retries++; + + cpu_relax(); + + if (!m->finished && retries >= 4) { + pr_err("MCE: skipping error being logged currently!\n"); + break; + } + } + smp_rmb(); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + } + + memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); +} + + +void mce_register_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + drain_mcelog_buffer(); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + static void print_mce(struct mce *m) { int ret = 0; diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f547421..1d76872 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -64,11 +64,9 @@ struct threshold_bank { }; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); -#ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 }; -#endif static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -202,10 +200,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP if (shared_bank[bank] && c->cpu_core_id) break; -#endif + offset = setup_APIC_mce(offset, (high & MASK_LVTOFF_HI) >> 20); @@ -531,7 +528,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); -#ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -558,7 +554,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c..39c6089 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -323,17 +323,6 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ -/* - * Set up the most two significant bit to notify mce log that this thermal - * event type. - * This is a temp solution. May be changed in the future with mce log - * infrasture. - */ -#define CORE_THROTTLED (0) -#define CORE_POWER_LIMIT ((__u64)1 << 62) -#define PACKAGE_THROTTLED ((__u64)2 << 62) -#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) - static void notify_thresholds(__u64 msr_val) { /* check whether the interrupt handler is defined; @@ -363,27 +352,23 @@ static void intel_thermal_interrupt(void) if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + mce_log_therm_throt_event(msr_val); if (this_cpu_has(X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + PACKAGE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_POWER_LIMIT - | msr_val); + PACKAGE_LEVEL); } } @@ -397,8 +382,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); irq_exit(); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index d746df2..aa578ca 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt; asmlinkage void smp_threshold_interrupt(void) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2bda212..5adce10 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,18 +484,195 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events to be assigned left */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define SCHED_STATES_MAX 2 + +struct perf_sched { + int max_weight; + int max_events; + struct event_constraint **constraints; + struct sched_state state; + int saved_states; + struct sched_state saved[SCHED_STATES_MAX]; +}; + +/* + * Initialize interator that runs through all events and counters. + */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, + int num, int wmin, int wmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->constraints = c; + + for (idx = 0; idx < num; idx++) { + if (c[idx]->weight == wmin) + break; + } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; +} + +static void perf_sched_save_state(struct perf_sched *sched) +{ + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) + return; + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ + if (!sched->saved_states) + return false; + + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; + + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool __perf_sched_find_counter(struct perf_sched *sched) +{ + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->constraints[sched->state.event]; + + /* Prefer fixed purpose counters */ + if (x86_pmu.num_counters_fixed) { + idx = X86_PMC_IDX_FIXED; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + + return false; + +done: + sched->state.counter = idx; + + if (c->overlap) + perf_sched_save_state(sched); + + return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + + return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->constraints[sched->state.event]; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; +} + +/* + * Assign a counter for each event. + */ +static int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) +{ + struct perf_sched sched; + + perf_sched_init(&sched, constraints, n, wmin, wmax); + + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; +} + int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int i, j, w, wmax, num = 0; + int i, wmin, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); - for (i = 0; i < n; i++) { + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); } /* @@ -521,60 +698,12 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (assign) assign[i] = hwc->idx; } - if (i == n) - goto done; - - /* - * begin slow path - */ - - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - /* - * weight = number of possible counters - * - * 1 = most constrained, only works on one counter - * wmax = least constrained, works on any counter - * - * assign events to counters starting with most - * constrained events. - */ - wmax = x86_pmu.num_counters; + /* slow path */ + if (i != n) + num = perf_assign_events(constraints, n, wmin, wmax, assign); /* - * when fixed event counters are present, - * wmax is incremented by 1 to account - * for one more choice - */ - if (x86_pmu.num_counters_fixed) - wmax++; - - for (w = 1, num = n; num && w <= wmax; w++) { - /* for each event */ - for (i = 0; num && i < n; i++) { - c = constraints[i]; - hwc = &cpuc->event_list[i]->hw; - - if (c->weight != w) - continue; - - for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { - if (!test_bit(j, used_mask)) - break; - } - - if (j == X86_PMC_IDX_MAX) - break; - - __set_bit(j, used_mask); - - if (assign) - assign[i] = j; - num--; - } - } -done: - /* * scheduling failed or is just a simulation, * free resources if necessary */ @@ -1119,6 +1248,7 @@ static void __init pmu_check_apic(void) static int __init init_hw_perf_events(void) { + struct x86_pmu_quirk *quirk; struct event_constraint *c; int err; @@ -1147,8 +1277,8 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.quirks) - x86_pmu.quirks(); + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", @@ -1171,12 +1301,18 @@ static int __init init_hw_perf_events(void) unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters); + 0, x86_pmu.num_counters, 0); if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK) + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { continue; + } c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; c->weight += x86_pmu.num_counters; @@ -1566,3 +1702,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs) return misc; } + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index b9698d4..8944062 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -45,6 +45,7 @@ struct event_constraint { u64 code; u64 cmask; int weight; + int overlap; }; struct amd_nb { @@ -151,15 +152,40 @@ struct cpu_hw_events { void *kfree_on_online; }; -#define __EVENT_CONSTRAINT(c, n, m, w) {\ +#define __EVENT_CONSTRAINT(c, n, m, w, o) {\ { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ + .overlap = (o), \ } #define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1) /* * Constraint on the Event code. @@ -235,6 +261,11 @@ union perf_capabilities { u64 capabilities; }; +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + /* * struct x86_pmu - generic x86 pmu */ @@ -259,6 +290,11 @@ struct x86_pmu { int num_counters_fixed; int cntval_bits; u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; int apic; u64 max_period; struct event_constraint * @@ -268,7 +304,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; - void (*quirks)(void); + struct x86_pmu_quirk *quirks; int perfctr_second_write; int (*cpu_prepare)(int cpu); @@ -309,6 +345,15 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + #define ERF_NO_HT_SHARING 1 #define ERF_HAS_RSP_1 2 diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index aeefd45..0397b23 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = { static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 121f1be..3bd37bd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = @@ -45,12 +46,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* - * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event - * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed - * ratio between these counters. - */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -68,7 +64,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -90,7 +86,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -102,7 +98,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -125,7 +121,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; @@ -1519,7 +1515,7 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = intel_guest_get_msrs, }; -static void intel_clovertown_quirks(void) +static __init void intel_clovertown_quirk(void) { /* * PEBS is unreliable due to: @@ -1545,19 +1541,60 @@ static void intel_clovertown_quirks(void) x86_pmu.pebs_constraints = NULL; } -static void intel_sandybridge_quirks(void) +static __init void intel_sandybridge_quirk(void) { printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, +}; + +static __init void intel_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { + intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; + printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); + } +} + +static __init void intel_nehalem_quirk(void) +{ + union cpuid10_ebx ebx; + + ebx.full = x86_pmu.events_maskl; + if (ebx.split.no_branch_misses_retired) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; + x86_pmu.events_maskl = ebx.full; + printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + } +} + __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; + union cpuid10_ebx ebx; unsigned int unused; - unsigned int ebx; int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { @@ -1574,8 +1611,8 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; version = eax.split.version_id; @@ -1589,6 +1626,9 @@ __init int intel_pmu_init(void) x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -1608,6 +1648,8 @@ __init int intel_pmu_init(void) intel_ds_init(); + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + /* * Install the hw-cache-events table: */ @@ -1617,7 +1659,7 @@ __init int intel_pmu_init(void) break; case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - x86_pmu.quirks = intel_clovertown_quirks; + x86_add_quirk(intel_clovertown_quirk); case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ case 29: /* six-core 45 nm xeon "Dunnington" */ @@ -1651,17 +1693,8 @@ __init int intel_pmu_init(void) /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; - if (ebx & 0x40) { - /* - * Erratum AAJ80 detected, we work it around by using - * the BR_MISP_EXEC.ANY event. This will over-count - * branch-misses, but it's still much better than the - * architectural event which is often completely bogus: - */ - intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + x86_add_quirk(intel_nehalem_quirk); - pr_cont("erratum AAJ80 worked around, "); - } pr_cont("Nehalem events, "); break; @@ -1701,7 +1734,7 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_pmu.quirks = intel_sandybridge_quirks; + x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -1738,5 +1771,6 @@ __init int intel_pmu_init(void) break; } } + return 0; } diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 5abbea2..7b3fe56 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = { "100mhzsteps", "hwpstate", "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ }; diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 14b2314..8022c66 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - unsigned int cpu = 0; + unsigned int cpu; int i; -#ifdef CONFIG_SMP cpu = c->cpu_index; -#endif seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e4..8071e2f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -738,35 +738,17 @@ core_initcall(e820_mark_nvs_memory); /* * pre allocated 4k and reserved it in memblock and e820_saved */ -u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) +u64 __init early_reserve_e820(u64 size, u64 align) { - u64 size = 0; u64 addr; - u64 start; - for (start = startt; ; start += size) { - start = memblock_x86_find_in_range_size(start, &size, align); - if (start == MEMBLOCK_ERROR) - return 0; - if (size >= sizet) - break; + addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr) { + e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + update_e820_saved(); } -#ifdef CONFIG_X86_32 - if (start >= MAXMEM) - return 0; - if (start + size > MAXMEM) - size = MAXMEM - start; -#endif - - addr = round_down(start + size - sizet, align); - if (addr < start) - return 0; - memblock_x86_reserve_range(addr, addr + sizet, "new next"); - e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); - update_e820_saved(); - return addr; } @@ -1090,7 +1072,7 @@ void __init memblock_x86_fill(void) * We are safe to enable resizing, beause memblock_x86_fill() * is rather later for x86 */ - memblock_can_resize = 1; + memblock_allow_resize(); for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; @@ -1105,22 +1087,36 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } - memblock_analyze(); memblock_dump_all(); } void __init memblock_find_dma_reserve(void) { #ifdef CONFIG_X86_64 - u64 free_size_pfn; - u64 mem_size_pfn; + u64 nr_pages = 0, nr_free_pages = 0; + unsigned long start_pfn, end_pfn; + phys_addr_t start, end; + int i; + u64 u; + /* * need to find out used area below MAX_DMA_PFN * need to use memblock to get free size in [0, MAX_DMA_PFN] * at first, and assume boot_mem will not take below MAX_DMA_PFN */ - mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; - free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; - set_dma_reserve(mem_size_pfn - free_size_pfn); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); + end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + nr_pages += end_pfn - start_pfn; + } + + for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { + start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); + end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); + if (start_pfn < end_pfn) + nr_free_pages += end_pfn - start_pfn; + } + + set_dma_reserve(nr_pages - nr_free_pages); #endif } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f3f6f53..22d0e21 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -625,6 +625,8 @@ work_notifysig: # deal with pending signals and movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or # vm86-space + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig @@ -638,6 +640,8 @@ work_notifysig_v86: #else movl %esp, %eax #endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e..a20e1cb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64) /*CFI_REL_OFFSET ss,0*/ pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ @@ -411,7 +411,7 @@ ENTRY(ret_from_fork) RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? - je int_ret_from_sys_call + jz retint_restore_args testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call @@ -465,7 +465,7 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. */ -ENTRY(system_call_after_swapgs) +GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(old_rsp) movq PER_CPU_VAR(kernel_stack),%rsp @@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET - GET_THREAD_INFO(%rcx) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz tracesys system_call_fastpath: cmpq $__NR_syscall_max,%rax @@ -496,10 +495,9 @@ ret_from_sys_call: /* edi: flagmask */ sysret_check: LOCKDEP_SYS_EXIT - GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl TI_flags(%rcx),%edx + movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx andl %edi,%edx jnz sysret_careful CFI_REMEMBER_STATE @@ -583,7 +581,7 @@ sysret_audit: /* Do syscall tracing */ tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz auditsys #endif SAVE_REST @@ -612,8 +610,6 @@ tracesys: GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $3,CS-ARGOFFSET(%rsp) - je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ GLOBAL(int_with_check) @@ -953,6 +949,7 @@ END(common_interrupt) ENTRY(\sym) INTR_FRAME pushq_cfi $~(\num) +.Lcommon_\sym: interrupt \do_sym jmp ret_from_intr CFI_ENDPROC @@ -976,13 +973,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + ALIGN + INTR_FRAME +.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 .if NUM_INVALIDATE_TLB_VECTORS > \idx -apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ - invalidate_interrupt\idx smp_invalidate_interrupt +ENTRY(invalidate_interrupt\idx) + pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) + jmp .Lcommon_invalidate_interrupt0 + CFI_ADJUST_CFA_OFFSET -8 +END(invalidate_interrupt\idx) .endif .endr + CFI_ENDPROC +apicinterrupt INVALIDATE_TLB_VECTOR_START, \ + invalidate_interrupt0, smp_invalidate_interrupt #endif apicinterrupt THRESHOLD_APIC_VECTOR \ diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index af0699b..48d9d4e 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -52,5 +52,5 @@ void __init reserve_ebda_region(void) lowmem = 0x9f000; /* reserve all memory between lowmem and the 1MB mark */ - memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); + memblock_reserve(lowmem, 0x100000 - lowmem); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3bb0850..51ff186 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,9 +31,8 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - memblock_init(); - - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -42,7 +41,7 @@ void __init i386_start_kernel(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5655c22..3a3b779 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,9 +98,8 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - memblock_init(); - - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -109,7 +108,7 @@ void __init x86_64_start_reservations(char *real_mode_data) unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } #endif diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 1bb0bf4..07b0a56 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -32,8 +32,6 @@ #define HPET_MIN_CYCLES 128 #define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) -#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) - /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ @@ -55,6 +53,11 @@ struct hpet_dev { char name[10]; }; +inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +{ + return container_of(evtdev, struct hpet_dev, evt); +} + inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 429e0c9..7943e0c 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); seq_printf(p, " IRQ work interrupts\n"); + seq_printf(p, "%*s: ", prec, "RTR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); + seq_printf(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; + sum += irq_stats(cpu)->icr_read_retry_count; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; @@ -181,8 +186,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - exit_idle(); irq_enter(); + exit_idle(); irq = __this_cpu_read(vector_irq[vector]); @@ -209,10 +214,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs) ack_APIC_irq(); - exit_idle(); - irq_enter(); + exit_idle(); + inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index ea9d5f2f..2889b3d 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry, put_online_cpus(); } -void arch_jump_label_transform_static(struct jump_entry *entry, +__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { __jump_label_transform(entry, type, text_poke_early); diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d494799..fe86493 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -1,14 +1,18 @@ /* * AMD CPU Microcode Update Driver for Linux - * Copyright (C) 2008 Advanced Micro Devices Inc. + * Copyright (C) 2008-2011 Advanced Micro Devices Inc. * * Author: Peter Oruba <peter.oruba@amd.com> * * Based on work by: * Tigran Aivazian <tigran@aivazian.fsnet.co.uk> * - * This driver allows to upgrade microcode on AMD - * family 0x10 and 0x11 processors. + * Maintainers: + * Andreas Herrmann <andreas.herrmann3@amd.com> + * Borislav Petkov <borislav.petkov@amd.com> + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. * * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. @@ -71,6 +75,9 @@ struct microcode_amd { static struct equiv_cpu_entry *equiv_cpu_table; +/* page-sized ucode patch buffer */ +void *patch; + static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -86,27 +93,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, - int rev) +static unsigned int verify_ucode_size(int cpu, u32 patch_size, + unsigned int size) { - unsigned int current_cpu_id; - u16 equiv_cpu_id = 0; - unsigned int i = 0; + struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + if (patch_size > min_t(u32, size, max_size)) { + pr_err("patch size mismatch\n"); + return 0; + } + + return patch_size; +} + +static u16 find_equiv_id(void) +{ + unsigned int current_cpu_id, i = 0; BUG_ON(equiv_cpu_table == NULL); + current_cpu_id = cpuid_eax(0x00000001); while (equiv_cpu_table[i].installed_cpu != 0) { - if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { - equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; - break; - } + if (current_cpu_id == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + i++; } + return 0; +} +/* + * we signal a good patch is found by returning its size > 0 + */ +static int get_matching_microcode(int cpu, const u8 *ucode_ptr, + unsigned int leftover_size, int rev, + unsigned int *current_size) +{ + struct microcode_header_amd *mc_hdr; + unsigned int actual_size; + u16 equiv_cpu_id; + + /* size of the current patch we're staring at */ + *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; + + equiv_cpu_id = find_equiv_id(); if (!equiv_cpu_id) return 0; + /* + * let's look at the patch header itself now + */ + mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); + if (mc_hdr->processor_rev_id != equiv_cpu_id) return 0; @@ -120,7 +176,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, if (mc_hdr->patch_id <= rev) return 0; - return 1; + /* + * now that the header looks sane, verify its size + */ + actual_size = verify_ucode_size(cpu, *current_size, leftover_size); + if (!actual_size) + return 0; + + /* clear the patch buffer */ + memset(patch, 0, PAGE_SIZE); + + /* all looks ok, get the binary patch */ + get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); + + return actual_size; } static int apply_microcode_amd(int cpu) @@ -155,63 +224,6 @@ static int apply_microcode_amd(int cpu) return 0; } -static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - u32 max_size, actual_size; - -#define F1XH_MPB_MAX_SIZE 2048 -#define F14H_MPB_MAX_SIZE 1824 -#define F15H_MPB_MAX_SIZE 4096 - - switch (c->x86) { - case 0x14: - max_size = F14H_MPB_MAX_SIZE; - break; - case 0x15: - max_size = F15H_MPB_MAX_SIZE; - break; - default: - max_size = F1XH_MPB_MAX_SIZE; - break; - } - - actual_size = *(u32 *)(buf + 4); - - if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) { - pr_err("section size mismatch\n"); - return 0; - } - - return actual_size; -} - -static struct microcode_header_amd * -get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) -{ - struct microcode_header_amd *mc = NULL; - unsigned int actual_size = 0; - - if (*(u32 *)buf != UCODE_UCODE_TYPE) { - pr_err("invalid type field in container file section header\n"); - goto out; - } - - actual_size = verify_ucode_size(cpu, buf, size); - if (!actual_size) - goto out; - - mc = vzalloc(actual_size); - if (!mc) - goto out; - - get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size); - *mc_size = actual_size + SECTION_HDR_SIZE; - -out: - return mc; -} - static int install_equiv_cpu_table(const u8 *buf) { unsigned int *ibuf = (unsigned int *)buf; @@ -247,36 +259,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_amd *mc_hdr = NULL; - unsigned int mc_size, leftover; + unsigned int mc_size, leftover, current_size = 0; int offset; const u8 *ucode_ptr = data; void *new_mc = NULL; unsigned int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_OK; + enum ucode_state state = UCODE_ERROR; offset = install_equiv_cpu_table(ucode_ptr); if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); - return UCODE_ERROR; + goto out; } - ucode_ptr += offset; leftover = size - offset; - while (leftover) { - mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); - if (!mc_hdr) - break; + if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + goto free_table; + } - if (get_matching_microcode(cpu, mc_hdr, new_rev)) { - vfree(new_mc); + while (leftover) { + mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, + new_rev, ¤t_size); + if (mc_size) { + mc_hdr = patch; + new_mc = patch; new_rev = mc_hdr->patch_id; - new_mc = mc_hdr; - } else - vfree(mc_hdr); + goto out_ok; + } - ucode_ptr += mc_size; - leftover -= mc_size; + ucode_ptr += current_size; + leftover -= current_size; } if (!new_mc) { @@ -284,19 +298,16 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) goto free_table; } - if (!leftover) { - vfree(uci->mc); - uci->mc = new_mc; - pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", - cpu, uci->cpu_sig.rev, new_rev); - } else { - vfree(new_mc); - state = UCODE_ERROR; - } +out_ok: + uci->mc = new_mc; + state = UCODE_OK; + pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", + cpu, uci->cpu_sig.rev, new_rev); free_table: free_equiv_cpu_table(); +out: return state; } @@ -337,7 +348,6 @@ static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - vfree(uci->mc); uci->mc = NULL; } @@ -351,5 +361,14 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { + patch = (void *)get_zeroed_page(GFP_KERNEL); + if (!patch) + return NULL; + return µcode_amd_ops; } + +void __exit exit_amd_microcode(void) +{ + free_page((unsigned long)patch); +} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9d46f5e..9302e2d 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -563,6 +563,8 @@ module_init(microcode_init); static void __exit microcode_exit(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + microcode_dev_exit(); unregister_hotcpu_notifier(&mc_cpu_notifier); @@ -580,6 +582,9 @@ static void __exit microcode_exit(void) microcode_ops = NULL; + if (c->x86_vendor == X86_VENDOR_AMD) + exit_amd_microcode(); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } module_exit(microcode_exit); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 0741b062..ca470e4 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early) static void __init smp_reserve_memory(struct mpf_intel *mpf) { - unsigned long size = get_mpc_size(mpf->physptr); - - memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); + memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); + memblock_reserve(mem, sizeof(*mpf)); if (mpf->physptr) smp_reserve_memory(mpf); @@ -836,10 +834,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt); void __init early_reserve_e820_mpc_new(void) { - if (enable_update_mptable && alloc_mptable) { - u64 startt = 0; - mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); - } + if (enable_update_mptable && alloc_mptable) + mpc_new_phys = early_reserve_e820(mpc_new_length, 4); } static int __init update_mp_table(void) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ee5d4fb..15763af 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; /* Ok, create the new process.. */ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, ®s, 0, NULL, NULL); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 795b79f..485204f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +117,8 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_restart_sched_tick(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3bd7e6e..9b9fe4a 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { rmb(); @@ -139,8 +139,14 @@ void cpu_idle(void) enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + if (cpuidle_idle_call()) pm_idle(); + + rcu_idle_exit(); start_critical_timings(); /* In many cases the interrupt that ended idle @@ -149,7 +155,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); @@ -293,13 +299,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8252879..89a04c7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -749,7 +749,8 @@ put: /* * Handle PTRACE_POKEUSR calls for the debug register area. */ -int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +static int ptrace_set_debugreg(struct task_struct *tsk, int n, + unsigned long val) { struct thread_struct *thread = &(tsk->thread); int rc = 0; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cf0ef98..d05444a 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -306,7 +306,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); + memblock_reserve(__pa(_brk_start), + __pa(_brk_end) - __pa(_brk_start)); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -331,13 +332,13 @@ static void __init relocate_initrd(void) ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, PAGE_SIZE); - if (ramdisk_here == MEMBLOCK_ERROR) + if (!ramdisk_here) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); + memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -393,7 +394,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - memblock_x86_free_range(ramdisk_image, ramdisk_end); + memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -416,7 +417,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_x86_free_range(ramdisk_image, ramdisk_end); + memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else static void __init reserve_initrd(void) @@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; u64 pa_data; - char buf[32]; if (boot_params.hdr.version < 0x0209) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); - sprintf(buf, "setup data %x", data->type); - memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); + memblock_reserve(pa_data, sizeof(*data) + data->len); pa_data = data->next; early_iounmap(data, sizeof(*data)); } @@ -554,7 +553,7 @@ static void __init reserve_crashkernel(void) crash_base = memblock_find_in_range(alignment, CRASH_KERNEL_ADDR_MAX, crash_size, alignment); - if (crash_base == MEMBLOCK_ERROR) { + if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } @@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void) return; } } - memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); + memblock_reserve(crash_base, crash_size); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void) addr = find_ibft_region(&size); if (size) - memblock_x86_reserve_range(addr, addr + size, "* ibft"); + memblock_reserve(addr, size); } static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb..e38e217 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -840,7 +840,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || - !physid_isset(apicid, phys_cpu_present_map)) { + !physid_isset(apicid, phys_cpu_present_map) || + (!x2apic_mode && apicid >= 255)) { printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a91ae77..a73b610 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -14,11 +14,11 @@ void __init setup_trampolines(void) /* Has to be in very low memory so we can execute real-mode AP code. */ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (mem == MEMBLOCK_ERROR) + if (!mem) panic("Cannot allocate trampoline\n"); x86_trampoline_base = __va(mem); - memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); + memblock_reserve(mem, size); printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", x86_trampoline_base, (unsigned long long)mem, size); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb8..fa1191fb 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) == NOTIFY_STOP) return; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ -#ifdef CONFIG_KPROBES + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; -#else - if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#endif preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db48336..2c9cf0f 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; -static int tsc_clocksource_reliable; +int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. */ @@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) } #define CAL_MS 10 -#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 -#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0aa5fed8..9eba29b 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (tsc_clocksource_reliable) { if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) pr_info( "Skipped synchronization checks as TSC is reliable.\n"); @@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + if (unsynchronized_tsc() || tsc_clocksource_reliable) return; /* diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e4d4a22..b07ba93 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; static int __init vsyscall_setup(char *str) { @@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. + */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = ¤t->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_no = 14; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr; + int prev_sig_on_uaccess_error; long ret; /* @@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) if (seccomp_mode(&tsk->seccomp)) do_exit(SIGKILL); + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + /* + * 0 is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, 0 means "don't write anything" not "write it at + * address 0". + */ + ret = -EFAULT; switch (vsyscall_nr) { case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) + break; + ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) + break; + ret = sys_time((time_t __user *)regs->di); break; case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) + break; + ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, 0); break; } + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + if (ret == -EFAULT) { - /* - * Bad news -- userspace fed a bad pointer to a vsyscall. - * - * With a real vsyscall, that would have caused SIGSEGV. - * To make writing reliable exploits using the emulated - * vsyscalls harder, generate SIGSEGV here as well. - */ + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); - goto sigsegv; + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. */ } regs->ax = ret; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c1d6cd5..91f83e2 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = { struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, + .fixup_cpu_id = x86_default_fixup_cpu_id, }; static void default_nmi_init(void) { }; diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c index 46fc4ee..88ad5fb 100644 --- a/arch/x86/lib/inat.c +++ b/arch/x86/lib/inat.c @@ -82,9 +82,16 @@ insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m, const insn_attr_t *table; if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX) return 0; - table = inat_avx_tables[vex_m][vex_p]; + /* At first, this checks the master table */ + table = inat_avx_tables[vex_m][0]; if (!table) return 0; + if (!inat_is_group(table[opcode]) && vex_p) { + /* If this is not a group, get attribute directly */ + table = inat_avx_tables[vex_m][vex_p]; + if (!table) + return 0; + } return table[opcode]; } diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 374562e..5a1f9f3 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -202,7 +202,7 @@ void insn_get_opcode(struct insn *insn) m = insn_vex_m_bits(insn); p = insn_vex_p_bits(insn); insn->attr = inat_get_avx_attribute(op, m, p); - if (!inat_accept_vex(insn->attr)) + if (!inat_accept_vex(insn->attr) && !inat_is_group(insn->attr)) insn->attr = 0; /* This instruction is bad */ goto end; /* VEX has only 1 byte for opcode */ } @@ -249,6 +249,8 @@ void insn_get_modrm(struct insn *insn) pfx = insn_last_prefix(insn); insn->attr = inat_get_group_attribute(mod, pfx, insn->attr); + if (insn_is_avx(insn) && !inat_accept_vex(insn->attr)) + insn->attr = 0; /* This is bad */ } } diff --git a/arch/x86/lib/string_32.c b/arch/x86/lib/string_32.c index 82004d2..bd59090 100644 --- a/arch/x86/lib/string_32.c +++ b/arch/x86/lib/string_32.c @@ -164,15 +164,13 @@ EXPORT_SYMBOL(strchr); size_t strlen(const char *s) { int d0; - int res; + size_t res; asm volatile("repne\n\t" - "scasb\n\t" - "notl %0\n\t" - "decl %0" + "scasb" : "=c" (res), "=&D" (d0) : "1" (s), "a" (0), "0" (0xffffffffu) : "memory"); - return res; + return ~res - 1; } EXPORT_SYMBOL(strlen); #endif diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index a793da5..5b83c51 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -1,5 +1,11 @@ # x86 Opcode Maps # +# This is (mostly) based on following documentations. +# - Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.2 +# (#325383-040US, October 2011) +# - Intel(R) Advanced Vector Extensions Programming Reference +# (#319433-011,JUNE 2011). +# #<Opcode maps> # Table: table-name # Referrer: escaped-name @@ -15,10 +21,13 @@ # EndTable # # AVX Superscripts -# (VEX): this opcode can accept VEX prefix. -# (oVEX): this opcode requires VEX prefix. -# (o128): this opcode only supports 128bit VEX. -# (o256): this opcode only supports 256bit VEX. +# (v): this opcode requires VEX prefix. +# (v1): this opcode only supports 128bit VEX. +# +# Last Prefix Superscripts +# - (66): the last prefix is 0x66 +# - (F3): the last prefix is 0xF3 +# - (F2): the last prefix is 0xF2 # Table: one byte opcode @@ -199,8 +208,8 @@ a0: MOV AL,Ob a1: MOV rAX,Ov a2: MOV Ob,AL a3: MOV Ov,rAX -a4: MOVS/B Xb,Yb -a5: MOVS/W/D/Q Xv,Yv +a4: MOVS/B Yb,Xb +a5: MOVS/W/D/Q Yv,Xv a6: CMPS/B Xb,Yb a7: CMPS/W/D Xv,Yv a8: TEST AL,Ib @@ -233,8 +242,8 @@ c0: Grp2 Eb,Ib (1A) c1: Grp2 Ev,Ib (1A) c2: RETN Iw (f64) c3: RETN -c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix) -c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix) +c4: LES Gz,Mp (i64) | VEX+2byte (Prefix) +c5: LDS Gz,Mp (i64) | VEX+1byte (Prefix) c6: Grp11 Eb,Ib (1A) c7: Grp11 Ev,Iz (1A) c8: ENTER Iw,Ib @@ -320,14 +329,19 @@ AVXcode: 1 # 3DNow! uses the last imm byte as opcode extension. 0f: 3DNow! Pq,Qq,Ib # 0x0f 0x10-0x1f -10: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128) -11: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128) -12: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX) -13: mpvlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128) -14: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX) -15: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX) -16: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlsps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX) -17: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128) +# NOTE: According to Intel SDM opcode map, vmovups and vmovupd has no operands +# but it actually has operands. And also, vmovss and vmovsd only accept 128bit. +# MOVSS/MOVSD has too many forms(3) on SDM. This map just shows a typical form. +# Many AVX instructions lack v1 superscript, according to Intel AVX-Prgramming +# Reference A.1 +10: vmovups Vps,Wps | vmovupd Vpd,Wpd (66) | vmovss Vx,Hx,Wss (F3),(v1) | vmovsd Vx,Hx,Wsd (F2),(v1) +11: vmovups Wps,Vps | vmovupd Wpd,Vpd (66) | vmovss Wss,Hx,Vss (F3),(v1) | vmovsd Wsd,Hx,Vsd (F2),(v1) +12: vmovlps Vq,Hq,Mq (v1) | vmovhlps Vq,Hq,Uq (v1) | vmovlpd Vq,Hq,Mq (66),(v1) | vmovsldup Vx,Wx (F3) | vmovddup Vx,Wx (F2) +13: vmovlps Mq,Vq (v1) | vmovlpd Mq,Vq (66),(v1) +14: vunpcklps Vx,Hx,Wx | vunpcklpd Vx,Hx,Wx (66) +15: vunpckhps Vx,Hx,Wx | vunpckhpd Vx,Hx,Wx (66) +16: vmovhps Vdq,Hq,Mq (v1) | vmovlhps Vdq,Hq,Uq (v1) | vmovhpd Vdq,Hq,Mq (66),(v1) | vmovshdup Vx,Wx (F3) +17: vmovhps Mq,Vq (v1) | vmovhpd Mq,Vq (66),(v1) 18: Grp16 (1A) 19: 1a: @@ -345,14 +359,14 @@ AVXcode: 1 25: 26: 27: -28: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX) -29: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX) -2a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128) -2b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX) -2c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128) -2d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128) -2e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128) -2f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128) +28: vmovaps Vps,Wps | vmovapd Vpd,Wpd (66) +29: vmovaps Wps,Vps | vmovapd Wpd,Vpd (66) +2a: cvtpi2ps Vps,Qpi | cvtpi2pd Vpd,Qpi (66) | vcvtsi2ss Vss,Hss,Ey (F3),(v1) | vcvtsi2sd Vsd,Hsd,Ey (F2),(v1) +2b: vmovntps Mps,Vps | vmovntpd Mpd,Vpd (66) +2c: cvttps2pi Ppi,Wps | cvttpd2pi Ppi,Wpd (66) | vcvttss2si Gy,Wss (F3),(v1) | vcvttsd2si Gy,Wsd (F2),(v1) +2d: cvtps2pi Ppi,Wps | cvtpd2pi Qpi,Wpd (66) | vcvtss2si Gy,Wss (F3),(v1) | vcvtsd2si Gy,Wsd (F2),(v1) +2e: vucomiss Vss,Wss (v1) | vucomisd Vsd,Wsd (66),(v1) +2f: vcomiss Vss,Wss (v1) | vcomisd Vsd,Wsd (66),(v1) # 0x0f 0x30-0x3f 30: WRMSR 31: RDTSC @@ -388,65 +402,66 @@ AVXcode: 1 4e: CMOVLE/NG Gv,Ev 4f: CMOVNLE/G Gv,Ev # 0x0f 0x50-0x5f -50: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX) -51: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128) -52: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128) -53: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128) -54: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX) -55: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX) -56: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX) -57: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX) -58: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128) -59: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128) -5a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128) -5b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX) -5c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128) -5d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128) -5e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128) -5f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128) +50: vmovmskps Gy,Ups | vmovmskpd Gy,Upd (66) +51: vsqrtps Vps,Wps | vsqrtpd Vpd,Wpd (66) | vsqrtss Vss,Hss,Wss (F3),(v1) | vsqrtsd Vsd,Hsd,Wsd (F2),(v1) +52: vrsqrtps Vps,Wps | vrsqrtss Vss,Hss,Wss (F3),(v1) +53: vrcpps Vps,Wps | vrcpss Vss,Hss,Wss (F3),(v1) +54: vandps Vps,Hps,Wps | vandpd Vpd,Hpd,Wpd (66) +55: vandnps Vps,Hps,Wps | vandnpd Vpd,Hpd,Wpd (66) +56: vorps Vps,Hps,Wps | vorpd Vpd,Hpd,Wpd (66) +57: vxorps Vps,Hps,Wps | vxorpd Vpd,Hpd,Wpd (66) +58: vaddps Vps,Hps,Wps | vaddpd Vpd,Hpd,Wpd (66) | vaddss Vss,Hss,Wss (F3),(v1) | vaddsd Vsd,Hsd,Wsd (F2),(v1) +59: vmulps Vps,Hps,Wps | vmulpd Vpd,Hpd,Wpd (66) | vmulss Vss,Hss,Wss (F3),(v1) | vmulsd Vsd,Hsd,Wsd (F2),(v1) +5a: vcvtps2pd Vpd,Wps | vcvtpd2ps Vps,Wpd (66) | vcvtss2sd Vsd,Hx,Wss (F3),(v1) | vcvtsd2ss Vss,Hx,Wsd (F2),(v1) +5b: vcvtdq2ps Vps,Wdq | vcvtps2dq Vdq,Wps (66) | vcvttps2dq Vdq,Wps (F3) +5c: vsubps Vps,Hps,Wps | vsubpd Vpd,Hpd,Wpd (66) | vsubss Vss,Hss,Wss (F3),(v1) | vsubsd Vsd,Hsd,Wsd (F2),(v1) +5d: vminps Vps,Hps,Wps | vminpd Vpd,Hpd,Wpd (66) | vminss Vss,Hss,Wss (F3),(v1) | vminsd Vsd,Hsd,Wsd (F2),(v1) +5e: vdivps Vps,Hps,Wps | vdivpd Vpd,Hpd,Wpd (66) | vdivss Vss,Hss,Wss (F3),(v1) | vdivsd Vsd,Hsd,Wsd (F2),(v1) +5f: vmaxps Vps,Hps,Wps | vmaxpd Vpd,Hpd,Wpd (66) | vmaxss Vss,Hss,Wss (F3),(v1) | vmaxsd Vsd,Hsd,Wsd (F2),(v1) # 0x0f 0x60-0x6f -60: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128) -61: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128) -62: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128) -63: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128) -64: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128) -65: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128) -66: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128) -67: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128) -68: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128) -69: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128) -6a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128) -6b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128) -6c: punpcklqdq Vdq,Wdq (66),(VEX),(o128) -6d: punpckhqdq Vdq,Wdq (66),(VEX),(o128) -6e: movd/q/ Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128) -6f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX) +60: punpcklbw Pq,Qd | vpunpcklbw Vx,Hx,Wx (66),(v1) +61: punpcklwd Pq,Qd | vpunpcklwd Vx,Hx,Wx (66),(v1) +62: punpckldq Pq,Qd | vpunpckldq Vx,Hx,Wx (66),(v1) +63: packsswb Pq,Qq | vpacksswb Vx,Hx,Wx (66),(v1) +64: pcmpgtb Pq,Qq | vpcmpgtb Vx,Hx,Wx (66),(v1) +65: pcmpgtw Pq,Qq | vpcmpgtw Vx,Hx,Wx (66),(v1) +66: pcmpgtd Pq,Qq | vpcmpgtd Vx,Hx,Wx (66),(v1) +67: packuswb Pq,Qq | vpackuswb Vx,Hx,Wx (66),(v1) +68: punpckhbw Pq,Qd | vpunpckhbw Vx,Hx,Wx (66),(v1) +69: punpckhwd Pq,Qd | vpunpckhwd Vx,Hx,Wx (66),(v1) +6a: punpckhdq Pq,Qd | vpunpckhdq Vx,Hx,Wx (66),(v1) +6b: packssdw Pq,Qd | vpackssdw Vx,Hx,Wx (66),(v1) +6c: vpunpcklqdq Vx,Hx,Wx (66),(v1) +6d: vpunpckhqdq Vx,Hx,Wx (66),(v1) +6e: movd/q Pd,Ey | vmovd/q Vy,Ey (66),(v1) +6f: movq Pq,Qq | vmovdqa Vx,Wx (66) | vmovdqu Vx,Wx (F3) # 0x0f 0x70-0x7f -70: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw VdqWdq,Ib (F2),(VEX),(o128) +70: pshufw Pq,Qq,Ib | vpshufd Vx,Wx,Ib (66),(v1) | vpshufhw Vx,Wx,Ib (F3),(v1) | vpshuflw Vx,Wx,Ib (F2),(v1) 71: Grp12 (1A) 72: Grp13 (1A) 73: Grp14 (1A) -74: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128) -75: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128) -76: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128) -77: emms/vzeroupper/vzeroall (VEX) -78: VMREAD Ed/q,Gd/q -79: VMWRITE Gd/q,Ed/q +74: pcmpeqb Pq,Qq | vpcmpeqb Vx,Hx,Wx (66),(v1) +75: pcmpeqw Pq,Qq | vpcmpeqw Vx,Hx,Wx (66),(v1) +76: pcmpeqd Pq,Qq | vpcmpeqd Vx,Hx,Wx (66),(v1) +# Note: Remove (v), because vzeroall and vzeroupper becomes emms without VEX. +77: emms | vzeroupper | vzeroall +78: VMREAD Ey,Gy +79: VMWRITE Gy,Ey 7a: 7b: -7c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX) -7d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX) -7e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128) -7f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX) +7c: vhaddpd Vpd,Hpd,Wpd (66) | vhaddps Vps,Hps,Wps (F2) +7d: vhsubpd Vpd,Hpd,Wpd (66) | vhsubps Vps,Hps,Wps (F2) +7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) +7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) # 0x0f 0x80-0x8f 80: JO Jz (f64) 81: JNO Jz (f64) -82: JB/JNAE/JC Jz (f64) -83: JNB/JAE/JNC Jz (f64) -84: JZ/JE Jz (f64) -85: JNZ/JNE Jz (f64) +82: JB/JC/JNAE Jz (f64) +83: JAE/JNB/JNC Jz (f64) +84: JE/JZ Jz (f64) +85: JNE/JNZ Jz (f64) 86: JBE/JNA Jz (f64) -87: JNBE/JA Jz (f64) +87: JA/JNBE Jz (f64) 88: JS Jz (f64) 89: JNS Jz (f64) 8a: JP/JPE Jz (f64) @@ -502,18 +517,18 @@ b8: JMPE | POPCNT Gv,Ev (F3) b9: Grp10 (1A) ba: Grp8 Ev,Ib (1A) bb: BTC Ev,Gv -bc: BSF Gv,Ev -bd: BSR Gv,Ev +bc: BSF Gv,Ev | TZCNT Gv,Ev (F3) +bd: BSR Gv,Ev | LZCNT Gv,Ev (F3) be: MOVSX Gv,Eb bf: MOVSX Gv,Ew # 0x0f 0xc0-0xcf c0: XADD Eb,Gb c1: XADD Ev,Gv -c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX) -c3: movnti Md/q,Gd/q -c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128) -c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128) -c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX) +c2: vcmpps Vps,Hps,Wps,Ib | vcmppd Vpd,Hpd,Wpd,Ib (66) | vcmpss Vss,Hss,Wss,Ib (F3),(v1) | vcmpsd Vsd,Hsd,Wsd,Ib (F2),(v1) +c3: movnti My,Gy +c4: pinsrw Pq,Ry/Mw,Ib | vpinsrw Vdq,Hdq,Ry/Mw,Ib (66),(v1) +c5: pextrw Gd,Nq,Ib | vpextrw Gd,Udq,Ib (66),(v1) +c6: vshufps Vps,Hps,Wps,Ib | vshufpd Vpd,Hpd,Wpd,Ib (66) c7: Grp9 (1A) c8: BSWAP RAX/EAX/R8/R8D c9: BSWAP RCX/ECX/R9/R9D @@ -524,55 +539,55 @@ cd: BSWAP RBP/EBP/R13/R13D ce: BSWAP RSI/ESI/R14/R14D cf: BSWAP RDI/EDI/R15/R15D # 0x0f 0xd0-0xdf -d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX) -d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128) -d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128) -d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128) -d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128) -d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128) -d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) -d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128) -d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128) -d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128) -da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128) -db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128) -dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128) -dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128) -de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128) -df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128) +d0: vaddsubpd Vpd,Hpd,Wpd (66) | vaddsubps Vps,Hps,Wps (F2) +d1: psrlw Pq,Qq | vpsrlw Vx,Hx,Wx (66),(v1) +d2: psrld Pq,Qq | vpsrld Vx,Hx,Wx (66),(v1) +d3: psrlq Pq,Qq | vpsrlq Vx,Hx,Wx (66),(v1) +d4: paddq Pq,Qq | vpaddq Vx,Hx,Wx (66),(v1) +d5: pmullw Pq,Qq | vpmullw Vx,Hx,Wx (66),(v1) +d6: vmovq Wq,Vq (66),(v1) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2) +d7: pmovmskb Gd,Nq | vpmovmskb Gd,Ux (66),(v1) +d8: psubusb Pq,Qq | vpsubusb Vx,Hx,Wx (66),(v1) +d9: psubusw Pq,Qq | vpsubusw Vx,Hx,Wx (66),(v1) +da: pminub Pq,Qq | vpminub Vx,Hx,Wx (66),(v1) +db: pand Pq,Qq | vpand Vx,Hx,Wx (66),(v1) +dc: paddusb Pq,Qq | vpaddusb Vx,Hx,Wx (66),(v1) +dd: paddusw Pq,Qq | vpaddusw Vx,Hx,Wx (66),(v1) +de: pmaxub Pq,Qq | vpmaxub Vx,Hx,Wx (66),(v1) +df: pandn Pq,Qq | vpandn Vx,Hx,Wx (66),(v1) # 0x0f 0xe0-0xef -e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128) -e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128) -e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128) -e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128) -e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128) -e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128) -e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX) -e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX) -e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128) -e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128) -ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128) -eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128) -ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128) -ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128) -ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128) -ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128) +e0: pavgb Pq,Qq | vpavgb Vx,Hx,Wx (66),(v1) +e1: psraw Pq,Qq | vpsraw Vx,Hx,Wx (66),(v1) +e2: psrad Pq,Qq | vpsrad Vx,Hx,Wx (66),(v1) +e3: pavgw Pq,Qq | vpavgw Vx,Hx,Wx (66),(v1) +e4: pmulhuw Pq,Qq | vpmulhuw Vx,Hx,Wx (66),(v1) +e5: pmulhw Pq,Qq | vpmulhw Vx,Hx,Wx (66),(v1) +e6: vcvttpd2dq Vx,Wpd (66) | vcvtdq2pd Vx,Wdq (F3) | vcvtpd2dq Vx,Wpd (F2) +e7: movntq Mq,Pq | vmovntdq Mx,Vx (66) +e8: psubsb Pq,Qq | vpsubsb Vx,Hx,Wx (66),(v1) +e9: psubsw Pq,Qq | vpsubsw Vx,Hx,Wx (66),(v1) +ea: pminsw Pq,Qq | vpminsw Vx,Hx,Wx (66),(v1) +eb: por Pq,Qq | vpor Vx,Hx,Wx (66),(v1) +ec: paddsb Pq,Qq | vpaddsb Vx,Hx,Wx (66),(v1) +ed: paddsw Pq,Qq | vpaddsw Vx,Hx,Wx (66),(v1) +ee: pmaxsw Pq,Qq | vpmaxsw Vx,Hx,Wx (66),(v1) +ef: pxor Pq,Qq | vpxor Vx,Hx,Wx (66),(v1) # 0x0f 0xf0-0xff -f0: lddqu Vdq,Mdq (F2),(VEX) -f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128) -f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128) -f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128) -f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128) -f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128) -f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128) -f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128) -f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128) -f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128) -fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128) -fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128) -fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128) -fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128) -fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128) +f0: vlddqu Vx,Mx (F2) +f1: psllw Pq,Qq | vpsllw Vx,Hx,Wx (66),(v1) +f2: pslld Pq,Qq | vpslld Vx,Hx,Wx (66),(v1) +f3: psllq Pq,Qq | vpsllq Vx,Hx,Wx (66),(v1) +f4: pmuludq Pq,Qq | vpmuludq Vx,Hx,Wx (66),(v1) +f5: pmaddwd Pq,Qq | vpmaddwd Vx,Hx,Wx (66),(v1) +f6: psadbw Pq,Qq | vpsadbw Vx,Hx,Wx (66),(v1) +f7: maskmovq Pq,Nq | vmaskmovdqu Vx,Ux (66),(v1) +f8: psubb Pq,Qq | vpsubb Vx,Hx,Wx (66),(v1) +f9: psubw Pq,Qq | vpsubw Vx,Hx,Wx (66),(v1) +fa: psubd Pq,Qq | vpsubd Vx,Hx,Wx (66),(v1) +fb: psubq Pq,Qq | vpsubq Vx,Hx,Wx (66),(v1) +fc: paddb Pq,Qq | vpaddb Vx,Hx,Wx (66),(v1) +fd: paddw Pq,Qq | vpaddw Vx,Hx,Wx (66),(v1) +fe: paddd Pq,Qq | vpaddd Vx,Hx,Wx (66),(v1) ff: EndTable @@ -580,155 +595,193 @@ Table: 3-byte opcode 1 (0x0f 0x38) Referrer: 3-byte escape 1 AVXcode: 2 # 0x0f 0x38 0x00-0x0f -00: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128) -01: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128) -02: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128) -03: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128) -04: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128) -05: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128) -06: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128) -07: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128) -08: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128) -09: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128) -0a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128) -0b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128) -0c: Vpermilps /r (66),(oVEX) -0d: Vpermilpd /r (66),(oVEX) -0e: vtestps /r (66),(oVEX) -0f: vtestpd /r (66),(oVEX) +00: pshufb Pq,Qq | vpshufb Vx,Hx,Wx (66),(v1) +01: phaddw Pq,Qq | vphaddw Vx,Hx,Wx (66),(v1) +02: phaddd Pq,Qq | vphaddd Vx,Hx,Wx (66),(v1) +03: phaddsw Pq,Qq | vphaddsw Vx,Hx,Wx (66),(v1) +04: pmaddubsw Pq,Qq | vpmaddubsw Vx,Hx,Wx (66),(v1) +05: phsubw Pq,Qq | vphsubw Vx,Hx,Wx (66),(v1) +06: phsubd Pq,Qq | vphsubd Vx,Hx,Wx (66),(v1) +07: phsubsw Pq,Qq | vphsubsw Vx,Hx,Wx (66),(v1) +08: psignb Pq,Qq | vpsignb Vx,Hx,Wx (66),(v1) +09: psignw Pq,Qq | vpsignw Vx,Hx,Wx (66),(v1) +0a: psignd Pq,Qq | vpsignd Vx,Hx,Wx (66),(v1) +0b: pmulhrsw Pq,Qq | vpmulhrsw Vx,Hx,Wx (66),(v1) +0c: vpermilps Vx,Hx,Wx (66),(v) +0d: vpermilpd Vx,Hx,Wx (66),(v) +0e: vtestps Vx,Wx (66),(v) +0f: vtestpd Vx,Wx (66),(v) # 0x0f 0x38 0x10-0x1f 10: pblendvb Vdq,Wdq (66) 11: 12: -13: +13: vcvtph2ps Vx,Wx,Ib (66),(v) 14: blendvps Vdq,Wdq (66) 15: blendvpd Vdq,Wdq (66) -16: -17: ptest Vdq,Wdq (66),(VEX) -18: vbroadcastss /r (66),(oVEX) -19: vbroadcastsd /r (66),(oVEX),(o256) -1a: vbroadcastf128 /r (66),(oVEX),(o256) +16: vpermps Vqq,Hqq,Wqq (66),(v) +17: vptest Vx,Wx (66) +18: vbroadcastss Vx,Wd (66),(v) +19: vbroadcastsd Vqq,Wq (66),(v) +1a: vbroadcastf128 Vqq,Mdq (66),(v) 1b: -1c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128) -1d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128) -1e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128) +1c: pabsb Pq,Qq | vpabsb Vx,Wx (66),(v1) +1d: pabsw Pq,Qq | vpabsw Vx,Wx (66),(v1) +1e: pabsd Pq,Qq | vpabsd Vx,Wx (66),(v1) 1f: # 0x0f 0x38 0x20-0x2f -20: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128) -21: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128) -22: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128) -23: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128) -24: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128) -25: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128) +20: vpmovsxbw Vx,Ux/Mq (66),(v1) +21: vpmovsxbd Vx,Ux/Md (66),(v1) +22: vpmovsxbq Vx,Ux/Mw (66),(v1) +23: vpmovsxwd Vx,Ux/Mq (66),(v1) +24: vpmovsxwq Vx,Ux/Md (66),(v1) +25: vpmovsxdq Vx,Ux/Mq (66),(v1) 26: 27: -28: pmuldq Vdq,Wdq (66),(VEX),(o128) -29: pcmpeqq Vdq,Wdq (66),(VEX),(o128) -2a: movntdqa Vdq,Mdq (66),(VEX),(o128) -2b: packusdw Vdq,Wdq (66),(VEX),(o128) -2c: vmaskmovps(ld) /r (66),(oVEX) -2d: vmaskmovpd(ld) /r (66),(oVEX) -2e: vmaskmovps(st) /r (66),(oVEX) -2f: vmaskmovpd(st) /r (66),(oVEX) +28: vpmuldq Vx,Hx,Wx (66),(v1) +29: vpcmpeqq Vx,Hx,Wx (66),(v1) +2a: vmovntdqa Vx,Mx (66),(v1) +2b: vpackusdw Vx,Hx,Wx (66),(v1) +2c: vmaskmovps Vx,Hx,Mx (66),(v) +2d: vmaskmovpd Vx,Hx,Mx (66),(v) +2e: vmaskmovps Mx,Hx,Vx (66),(v) +2f: vmaskmovpd Mx,Hx,Vx (66),(v) # 0x0f 0x38 0x30-0x3f -30: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128) -31: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128) -32: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128) -33: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128) -34: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128) -35: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128) -36: -37: pcmpgtq Vdq,Wdq (66),(VEX),(o128) -38: pminsb Vdq,Wdq (66),(VEX),(o128) -39: pminsd Vdq,Wdq (66),(VEX),(o128) -3a: pminuw Vdq,Wdq (66),(VEX),(o128) -3b: pminud Vdq,Wdq (66),(VEX),(o128) -3c: pmaxsb Vdq,Wdq (66),(VEX),(o128) -3d: pmaxsd Vdq,Wdq (66),(VEX),(o128) -3e: pmaxuw Vdq,Wdq (66),(VEX),(o128) -3f: pmaxud Vdq,Wdq (66),(VEX),(o128) +30: vpmovzxbw Vx,Ux/Mq (66),(v1) +31: vpmovzxbd Vx,Ux/Md (66),(v1) +32: vpmovzxbq Vx,Ux/Mw (66),(v1) +33: vpmovzxwd Vx,Ux/Mq (66),(v1) +34: vpmovzxwq Vx,Ux/Md (66),(v1) +35: vpmovzxdq Vx,Ux/Mq (66),(v1) +36: vpermd Vqq,Hqq,Wqq (66),(v) +37: vpcmpgtq Vx,Hx,Wx (66),(v1) +38: vpminsb Vx,Hx,Wx (66),(v1) +39: vpminsd Vx,Hx,Wx (66),(v1) +3a: vpminuw Vx,Hx,Wx (66),(v1) +3b: vpminud Vx,Hx,Wx (66),(v1) +3c: vpmaxsb Vx,Hx,Wx (66),(v1) +3d: vpmaxsd Vx,Hx,Wx (66),(v1) +3e: vpmaxuw Vx,Hx,Wx (66),(v1) +3f: vpmaxud Vx,Hx,Wx (66),(v1) # 0x0f 0x38 0x40-0x8f -40: pmulld Vdq,Wdq (66),(VEX),(o128) -41: phminposuw Vdq,Wdq (66),(VEX),(o128) -80: INVEPT Gd/q,Mdq (66) -81: INVPID Gd/q,Mdq (66) +40: vpmulld Vx,Hx,Wx (66),(v1) +41: vphminposuw Vdq,Wdq (66),(v1) +42: +43: +44: +45: vpsrlvd/q Vx,Hx,Wx (66),(v) +46: vpsravd Vx,Hx,Wx (66),(v) +47: vpsllvd/q Vx,Hx,Wx (66),(v) +# Skip 0x48-0x57 +58: vpbroadcastd Vx,Wx (66),(v) +59: vpbroadcastq Vx,Wx (66),(v) +5a: vbroadcasti128 Vqq,Mdq (66),(v) +# Skip 0x5b-0x77 +78: vpbroadcastb Vx,Wx (66),(v) +79: vpbroadcastw Vx,Wx (66),(v) +# Skip 0x7a-0x7f +80: INVEPT Gy,Mdq (66) +81: INVPID Gy,Mdq (66) +82: INVPCID Gy,Mdq (66) +8c: vpmaskmovd/q Vx,Hx,Mx (66),(v) +8e: vpmaskmovd/q Mx,Vx,Hx (66),(v) # 0x0f 0x38 0x90-0xbf (FMA) -96: vfmaddsub132pd/ps /r (66),(VEX) -97: vfmsubadd132pd/ps /r (66),(VEX) -98: vfmadd132pd/ps /r (66),(VEX) -99: vfmadd132sd/ss /r (66),(VEX),(o128) -9a: vfmsub132pd/ps /r (66),(VEX) -9b: vfmsub132sd/ss /r (66),(VEX),(o128) -9c: vfnmadd132pd/ps /r (66),(VEX) -9d: vfnmadd132sd/ss /r (66),(VEX),(o128) -9e: vfnmsub132pd/ps /r (66),(VEX) -9f: vfnmsub132sd/ss /r (66),(VEX),(o128) -a6: vfmaddsub213pd/ps /r (66),(VEX) -a7: vfmsubadd213pd/ps /r (66),(VEX) -a8: vfmadd213pd/ps /r (66),(VEX) -a9: vfmadd213sd/ss /r (66),(VEX),(o128) -aa: vfmsub213pd/ps /r (66),(VEX) -ab: vfmsub213sd/ss /r (66),(VEX),(o128) -ac: vfnmadd213pd/ps /r (66),(VEX) -ad: vfnmadd213sd/ss /r (66),(VEX),(o128) -ae: vfnmsub213pd/ps /r (66),(VEX) -af: vfnmsub213sd/ss /r (66),(VEX),(o128) -b6: vfmaddsub231pd/ps /r (66),(VEX) -b7: vfmsubadd231pd/ps /r (66),(VEX) -b8: vfmadd231pd/ps /r (66),(VEX) -b9: vfmadd231sd/ss /r (66),(VEX),(o128) -ba: vfmsub231pd/ps /r (66),(VEX) -bb: vfmsub231sd/ss /r (66),(VEX),(o128) -bc: vfnmadd231pd/ps /r (66),(VEX) -bd: vfnmadd231sd/ss /r (66),(VEX),(o128) -be: vfnmsub231pd/ps /r (66),(VEX) -bf: vfnmsub231sd/ss /r (66),(VEX),(o128) +90: vgatherdd/q Vx,Hx,Wx (66),(v) +91: vgatherqd/q Vx,Hx,Wx (66),(v) +92: vgatherdps/d Vx,Hx,Wx (66),(v) +93: vgatherqps/d Vx,Hx,Wx (66),(v) +94: +95: +96: vfmaddsub132ps/d Vx,Hx,Wx (66),(v) +97: vfmsubadd132ps/d Vx,Hx,Wx (66),(v) +98: vfmadd132ps/d Vx,Hx,Wx (66),(v) +99: vfmadd132ss/d Vx,Hx,Wx (66),(v),(v1) +9a: vfmsub132ps/d Vx,Hx,Wx (66),(v) +9b: vfmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +9c: vfnmadd132ps/d Vx,Hx,Wx (66),(v) +9d: vfnmadd132ss/d Vx,Hx,Wx (66),(v),(v1) +9e: vfnmsub132ps/d Vx,Hx,Wx (66),(v) +9f: vfnmsub132ss/d Vx,Hx,Wx (66),(v),(v1) +a6: vfmaddsub213ps/d Vx,Hx,Wx (66),(v) +a7: vfmsubadd213ps/d Vx,Hx,Wx (66),(v) +a8: vfmadd213ps/d Vx,Hx,Wx (66),(v) +a9: vfmadd213ss/d Vx,Hx,Wx (66),(v),(v1) +aa: vfmsub213ps/d Vx,Hx,Wx (66),(v) +ab: vfmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +ac: vfnmadd213ps/d Vx,Hx,Wx (66),(v) +ad: vfnmadd213ss/d Vx,Hx,Wx (66),(v),(v1) +ae: vfnmsub213ps/d Vx,Hx,Wx (66),(v) +af: vfnmsub213ss/d Vx,Hx,Wx (66),(v),(v1) +b6: vfmaddsub231ps/d Vx,Hx,Wx (66),(v) +b7: vfmsubadd231ps/d Vx,Hx,Wx (66),(v) +b8: vfmadd231ps/d Vx,Hx,Wx (66),(v) +b9: vfmadd231ss/d Vx,Hx,Wx (66),(v),(v1) +ba: vfmsub231ps/d Vx,Hx,Wx (66),(v) +bb: vfmsub231ss/d Vx,Hx,Wx (66),(v),(v1) +bc: vfnmadd231ps/d Vx,Hx,Wx (66),(v) +bd: vfnmadd231ss/d Vx,Hx,Wx (66),(v),(v1) +be: vfnmsub231ps/d Vx,Hx,Wx (66),(v) +bf: vfnmsub231ss/d Vx,Hx,Wx (66),(v),(v1) # 0x0f 0x38 0xc0-0xff -db: aesimc Vdq,Wdq (66),(VEX),(o128) -dc: aesenc Vdq,Wdq (66),(VEX),(o128) -dd: aesenclast Vdq,Wdq (66),(VEX),(o128) -de: aesdec Vdq,Wdq (66),(VEX),(o128) -df: aesdeclast Vdq,Wdq (66),(VEX),(o128) -f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2) -f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2) +db: VAESIMC Vdq,Wdq (66),(v1) +dc: VAESENC Vdq,Hdq,Wdq (66),(v1) +dd: VAESENCLAST Vdq,Hdq,Wdq (66),(v1) +de: VAESDEC Vdq,Hdq,Wdq (66),(v1) +df: VAESDECLAST Vdq,Hdq,Wdq (66),(v1) +f0: MOVBE Gy,My | MOVBE Gw,Mw (66) | CRC32 Gd,Eb (F2) +f1: MOVBE My,Gy | MOVBE Mw,Gw (66) | CRC32 Gd,Ey (F2) +f3: ANDN Gy,By,Ey (v) +f4: Grp17 (1A) +f5: BZHI Gy,Ey,By (v) | PEXT Gy,By,Ey (F3),(v) | PDEP Gy,By,Ey (F2),(v) +f6: MULX By,Gy,rDX,Ey (F2),(v) +f7: BEXTR Gy,Ey,By (v) | SHLX Gy,Ey,By (66),(v) | SARX Gy,Ey,By (F3),(v) | SHRX Gy,Ey,By (F2),(v) EndTable Table: 3-byte opcode 2 (0x0f 0x3a) Referrer: 3-byte escape 2 AVXcode: 3 # 0x0f 0x3a 0x00-0xff -04: vpermilps /r,Ib (66),(oVEX) -05: vpermilpd /r,Ib (66),(oVEX) -06: vperm2f128 /r,Ib (66),(oVEX),(o256) -08: roundps Vdq,Wdq,Ib (66),(VEX) -09: roundpd Vdq,Wdq,Ib (66),(VEX) -0a: roundss Vss,Wss,Ib (66),(VEX),(o128) -0b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128) -0c: blendps Vdq,Wdq,Ib (66),(VEX) -0d: blendpd Vdq,Wdq,Ib (66),(VEX) -0e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128) -0f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128) -14: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128) -15: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128) -16: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128) -17: extractps Ed,Vdq,Ib (66),(VEX),(o128) -18: vinsertf128 /r,Ib (66),(oVEX),(o256) -19: vextractf128 /r,Ib (66),(oVEX),(o256) -20: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128) -21: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128) -22: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128) -40: dpps Vdq,Wdq,Ib (66),(VEX) -41: dppd Vdq,Wdq,Ib (66),(VEX),(o128) -42: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128) -44: pclmulq Vdq,Wdq,Ib (66),(VEX),(o128) -4a: vblendvps /r,Ib (66),(oVEX) -4b: vblendvpd /r,Ib (66),(oVEX) -4c: vpblendvb /r,Ib (66),(oVEX),(o128) -60: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128) -61: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128) -62: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128) -63: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128) -df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128) +00: vpermq Vqq,Wqq,Ib (66),(v) +01: vpermpd Vqq,Wqq,Ib (66),(v) +02: vpblendd Vx,Hx,Wx,Ib (66),(v) +03: +04: vpermilps Vx,Wx,Ib (66),(v) +05: vpermilpd Vx,Wx,Ib (66),(v) +06: vperm2f128 Vqq,Hqq,Wqq,Ib (66),(v) +07: +08: vroundps Vx,Wx,Ib (66) +09: vroundpd Vx,Wx,Ib (66) +0a: vroundss Vss,Wss,Ib (66),(v1) +0b: vroundsd Vsd,Wsd,Ib (66),(v1) +0c: vblendps Vx,Hx,Wx,Ib (66) +0d: vblendpd Vx,Hx,Wx,Ib (66) +0e: vpblendw Vx,Hx,Wx,Ib (66),(v1) +0f: palignr Pq,Qq,Ib | vpalignr Vx,Hx,Wx,Ib (66),(v1) +14: vpextrb Rd/Mb,Vdq,Ib (66),(v1) +15: vpextrw Rd/Mw,Vdq,Ib (66),(v1) +16: vpextrd/q Ey,Vdq,Ib (66),(v1) +17: vextractps Ed,Vdq,Ib (66),(v1) +18: vinsertf128 Vqq,Hqq,Wqq,Ib (66),(v) +19: vextractf128 Wdq,Vqq,Ib (66),(v) +1d: vcvtps2ph Wx,Vx,Ib (66),(v) +20: vpinsrb Vdq,Hdq,Ry/Mb,Ib (66),(v1) +21: vinsertps Vdq,Hdq,Udq/Md,Ib (66),(v1) +22: vpinsrd/q Vdq,Hdq,Ey,Ib (66),(v1) +38: vinserti128 Vqq,Hqq,Wqq,Ib (66),(v) +39: vextracti128 Wdq,Vqq,Ib (66),(v) +40: vdpps Vx,Hx,Wx,Ib (66) +41: vdppd Vdq,Hdq,Wdq,Ib (66),(v1) +42: vmpsadbw Vx,Hx,Wx,Ib (66),(v1) +44: vpclmulqdq Vdq,Hdq,Wdq,Ib (66),(v1) +46: vperm2i128 Vqq,Hqq,Wqq,Ib (66),(v) +4a: vblendvps Vx,Hx,Wx,Lx (66),(v) +4b: vblendvpd Vx,Hx,Wx,Lx (66),(v) +4c: vpblendvb Vx,Hx,Wx,Lx (66),(v1) +60: vpcmpestrm Vdq,Wdq,Ib (66),(v1) +61: vpcmpestri Vdq,Wdq,Ib (66),(v1) +62: vpcmpistrm Vdq,Wdq,Ib (66),(v1) +63: vpcmpistri Vdq,Wdq,Ib (66),(v1) +df: VAESKEYGEN Vdq,Wdq,Ib (66),(v1) +f0: RORX Gy,Ey,Ib (F2),(v) EndTable GrpTable: Grp1 @@ -790,7 +843,7 @@ GrpTable: Grp5 2: CALLN Ev (f64) 3: CALLF Ep 4: JMPN Ev (f64) -5: JMPF Ep +5: JMPF Mp 6: PUSH Ev (d64) 7: EndTable @@ -807,7 +860,7 @@ EndTable GrpTable: Grp7 0: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B) 1: SIDT Ms | MONITOR (000),(11B) | MWAIT (001) -2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) +2: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B) | VMFUNC (100),(11B) 3: LIDT Ms 4: SMSW Mw/Rv 5: @@ -824,44 +877,45 @@ EndTable GrpTable: Grp9 1: CMPXCHG8B/16B Mq/Mdq -6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) -7: VMPTRST Mq +6: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3) | RDRAND Rv (11B) +7: VMPTRST Mq | VMPTRST Mq (F3) EndTable GrpTable: Grp10 EndTable GrpTable: Grp11 +# Note: the operands are given by group opcode 0: MOV EndTable GrpTable: Grp12 -2: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128) -4: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128) -6: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128) +2: psrlw Nq,Ib (11B) | vpsrlw Hx,Ux,Ib (66),(11B),(v1) +4: psraw Nq,Ib (11B) | vpsraw Hx,Ux,Ib (66),(11B),(v1) +6: psllw Nq,Ib (11B) | vpsllw Hx,Ux,Ib (66),(11B),(v1) EndTable GrpTable: Grp13 -2: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128) -4: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128) -6: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128) +2: psrld Nq,Ib (11B) | vpsrld Hx,Ux,Ib (66),(11B),(v1) +4: psrad Nq,Ib (11B) | vpsrad Hx,Ux,Ib (66),(11B),(v1) +6: pslld Nq,Ib (11B) | vpslld Hx,Ux,Ib (66),(11B),(v1) EndTable GrpTable: Grp14 -2: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128) -3: psrldq Udq,Ib (66),(11B),(VEX),(o128) -6: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128) -7: pslldq Udq,Ib (66),(11B),(VEX),(o128) +2: psrlq Nq,Ib (11B) | vpsrlq Hx,Ux,Ib (66),(11B),(v1) +3: vpsrldq Hx,Ux,Ib (66),(11B),(v1) +6: psllq Nq,Ib (11B) | vpsllq Hx,Ux,Ib (66),(11B),(v1) +7: vpslldq Hx,Ux,Ib (66),(11B),(v1) EndTable GrpTable: Grp15 -0: fxsave -1: fxstor -2: ldmxcsr (VEX) -3: stmxcsr (VEX) +0: fxsave | RDFSBASE Ry (F3),(11B) +1: fxstor | RDGSBASE Ry (F3),(11B) +2: vldmxcsr Md (v1) | WRFSBASE Ry (F3),(11B) +3: vstmxcsr Md (v1) | WRGSBASE Ry (F3),(11B) 4: XSAVE 5: XRSTOR | lfence (11B) -6: mfence (11B) +6: XSAVEOPT | mfence (11B) 7: clflush | sfence (11B) EndTable @@ -872,6 +926,12 @@ GrpTable: Grp16 3: prefetch T2 EndTable +GrpTable: Grp17 +1: BLSR By,Ey (v) +2: BLSMSK By,Ey (v) +3: BLSI By,Ey (v) +EndTable + # AMD's Prefetch Group GrpTable: GrpP 0: PREFETCH diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 3d11327..23d8e5f 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -27,6 +27,4 @@ obj-$(CONFIG_AMD_NUMA) += amdtopology.o obj-$(CONFIG_ACPI_NUMA) += srat.o obj-$(CONFIG_NUMA_EMU) += numa_emulation.o -obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o - obj-$(CONFIG_MEMTEST) += memtest.o diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c index d0474ad..1fb85db 100644 --- a/arch/x86/mm/extable.c +++ b/arch/x86/mm/extable.c @@ -25,7 +25,7 @@ int fixup_exception(struct pt_regs *regs) if (fixup) { /* If fixup is less than 16, it means uaccess error */ if (fixup->fixup < 16) { - current_thread_info()->uaccess_err = -EFAULT; + current_thread_info()->uaccess_err = 1; regs->ip += fixup->fixup; return 1; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 5db0490..9d74824 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -626,7 +626,7 @@ pgtable_bad(struct pt_regs *regs, unsigned long error_code, static noinline void no_context(struct pt_regs *regs, unsigned long error_code, - unsigned long address) + unsigned long address, int signal, int si_code) { struct task_struct *tsk = current; unsigned long *stackend; @@ -634,8 +634,17 @@ no_context(struct pt_regs *regs, unsigned long error_code, int sig; /* Are we prepared to handle this kernel fault? */ - if (fixup_exception(regs)) + if (fixup_exception(regs)) { + if (current_thread_info()->sig_on_uaccess_error && signal) { + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code | PF_USER; + tsk->thread.cr2 = address; + + /* XXX: hwpoison faults will set the wrong code. */ + force_sig_info_fault(signal, si_code, address, tsk, 0); + } return; + } /* * 32-bit: @@ -755,7 +764,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code, if (is_f00f_bug(regs, address)) return; - no_context(regs, error_code, address); + no_context(regs, error_code, address, SIGSEGV, si_code); } static noinline void @@ -819,7 +828,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address, /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { - no_context(regs, error_code, address); + no_context(regs, error_code, address, SIGBUS, BUS_ADRERR); return; } @@ -854,7 +863,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, if (!(fault & VM_FAULT_RETRY)) up_read(¤t->mm->mmap_sem); if (!(error_code & PF_USER)) - no_context(regs, error_code, address); + no_context(regs, error_code, address, 0, 0); return 1; } if (!(fault & VM_FAULT_ERROR)) @@ -864,7 +873,8 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code, /* Kernel mode? Handle exceptions or die: */ if (!(error_code & PF_USER)) { up_read(¤t->mm->mmap_sem); - no_context(regs, error_code, address); + no_context(regs, error_code, address, + SIGSEGV, SEGV_MAPERR); return 1; } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 87488b9..a298914 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -67,7 +67,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, good_end = max_pfn_mapped << PAGE_SHIFT; base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE); - if (base == MEMBLOCK_ERROR) + if (!base) panic("Cannot find space for the kernel page tables"); pgt_buf_start = base >> PAGE_SHIFT; @@ -80,7 +80,7 @@ static void __init find_early_table_space(unsigned long end, int use_pse, void __init native_pagetable_reserve(u64 start, u64 end) { - memblock_x86_reserve_range(start, end, "PGTABLE"); + memblock_reserve(start, end - start); } struct map_range { @@ -279,8 +279,8 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top) * so that they can be reused for other purposes. * - * On native it just means calling memblock_x86_reserve_range, on Xen it - * also means marking RW the pagetable pages that we allocated before + * On native it just means calling memblock_reserve, on Xen it also + * means marking RW the pagetable pages that we allocated before * but that haven't been used. * * In fact on xen we mark RO the whole range pgt_buf_start - diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 29f7c6d9..0c1da39 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -427,23 +427,17 @@ static void __init add_one_highpage_init(struct page *page) void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn) { - struct range *range; - int nr_range; - int i; - - nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn); - - for (i = 0; i < nr_range; i++) { - struct page *page; - int node_pfn; - - for (node_pfn = range[i].start; node_pfn < range[i].end; - node_pfn++) { - if (!pfn_valid(node_pfn)) - continue; - page = pfn_to_page(node_pfn); - add_one_highpage_init(page); - } + phys_addr_t start, end; + u64 i; + + for_each_free_mem_range(i, nid, &start, &end, NULL) { + unsigned long pfn = clamp_t(unsigned long, PFN_UP(start), + start_pfn, end_pfn); + unsigned long e_pfn = clamp_t(unsigned long, PFN_DOWN(end), + start_pfn, end_pfn); + for ( ; pfn < e_pfn; pfn++) + if (pfn_valid(pfn)) + add_one_highpage_init(pfn_to_page(pfn)); } } #else @@ -650,18 +644,18 @@ void __init initmem_init(void) highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - memblock_x86_register_active_regions(0, 0, highend_pfn); - sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - memblock_x86_register_active_regions(0, 0, max_low_pfn); - sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif + + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); + sparse_memory_present_with_active_regions(0); + #ifdef CONFIG_FLATMEM max_mapnr = num_physpages; #endif diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index bbaaa00..a8a56ce 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -608,7 +608,7 @@ kernel_physical_mapping_init(unsigned long start, #ifndef CONFIG_NUMA void __init initmem_init(void) { - memblock_x86_register_active_regions(0, 0, max_pfn); + memblock_set_node(0, (phys_addr_t)ULLONG_MAX, 0); } #endif diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c deleted file mode 100644 index 992da5e..0000000 --- a/arch/x86/mm/memblock.c +++ /dev/null @@ -1,348 +0,0 @@ -#include <linux/kernel.h> -#include <linux/types.h> -#include <linux/init.h> -#include <linux/bitops.h> -#include <linux/memblock.h> -#include <linux/bootmem.h> -#include <linux/mm.h> -#include <linux/range.h> - -/* Check for already reserved areas */ -bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align) -{ - struct memblock_region *r; - u64 addr = *addrp, last; - u64 size = *sizep; - bool changed = false; - -again: - last = addr + size; - for_each_memblock(reserved, r) { - if (last > r->base && addr < r->base) { - size = r->base - addr; - changed = true; - goto again; - } - if (last > (r->base + r->size) && addr < (r->base + r->size)) { - addr = round_up(r->base + r->size, align); - size = last - addr; - changed = true; - goto again; - } - if (last <= (r->base + r->size) && addr >= r->base) { - *sizep = 0; - return false; - } - } - if (changed) { - *addrp = addr; - *sizep = size; - } - return changed; -} - -/* - * Find next free range after start, and size is returned in *sizep - */ -u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align) -{ - struct memblock_region *r; - - for_each_memblock(memory, r) { - u64 ei_start = r->base; - u64 ei_last = ei_start + r->size; - u64 addr; - - addr = round_up(ei_start, align); - if (addr < start) - addr = round_up(start, align); - if (addr >= ei_last) - continue; - *sizep = ei_last - addr; - while (memblock_x86_check_reserved_size(&addr, sizep, align)) - ; - - if (*sizep) - return addr; - } - - return MEMBLOCK_ERROR; -} - -static __init struct range *find_range_array(int count) -{ - u64 end, size, mem; - struct range *range; - - size = sizeof(struct range) * count; - end = memblock.current_limit; - - mem = memblock_find_in_range(0, end, size, sizeof(struct range)); - if (mem == MEMBLOCK_ERROR) - panic("can not find more space for range array"); - - /* - * This range is tempoaray, so don't reserve it, it will not be - * overlapped because We will not alloccate new buffer before - * We discard this one - */ - range = __va(mem); - memset(range, 0, size); - - return range; -} - -static void __init memblock_x86_subtract_reserved(struct range *range, int az) -{ - u64 final_start, final_end; - struct memblock_region *r; - - /* Take out region array itself at first*/ - memblock_free_reserved_regions(); - - memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt); - - for_each_memblock(reserved, r) { - memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1); - final_start = PFN_DOWN(r->base); - final_end = PFN_UP(r->base + r->size); - if (final_start >= final_end) - continue; - subtract_range(range, az, final_start, final_end); - } - - /* Put region array back ? */ - memblock_reserve_reserved_regions(); -} - -struct count_data { - int nr; -}; - -static int __init count_work_fn(unsigned long start_pfn, - unsigned long end_pfn, void *datax) -{ - struct count_data *data = datax; - - data->nr++; - - return 0; -} - -static int __init count_early_node_map(int nodeid) -{ - struct count_data data; - - data.nr = 0; - work_with_active_regions(nodeid, count_work_fn, &data); - - return data.nr; -} - -int __init __get_free_all_memory_range(struct range **rangep, int nodeid, - unsigned long start_pfn, unsigned long end_pfn) -{ - int count; - struct range *range; - int nr_range; - - count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2; - - range = find_range_array(count); - nr_range = 0; - - /* - * Use early_node_map[] and memblock.reserved.region to get range array - * at first - */ - nr_range = add_from_early_node_map(range, count, nr_range, nodeid); - subtract_range(range, count, 0, start_pfn); - subtract_range(range, count, end_pfn, -1ULL); - - memblock_x86_subtract_reserved(range, count); - nr_range = clean_sort_range(range, count); - - *rangep = range; - return nr_range; -} - -int __init get_free_all_memory_range(struct range **rangep, int nodeid) -{ - unsigned long end_pfn = -1UL; - -#ifdef CONFIG_X86_32 - end_pfn = max_low_pfn; -#endif - return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn); -} - -static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free) -{ - int i, count; - struct range *range; - int nr_range; - u64 final_start, final_end; - u64 free_size; - struct memblock_region *r; - - count = (memblock.reserved.cnt + memblock.memory.cnt) * 2; - - range = find_range_array(count); - nr_range = 0; - - addr = PFN_UP(addr); - limit = PFN_DOWN(limit); - - for_each_memblock(memory, r) { - final_start = PFN_UP(r->base); - final_end = PFN_DOWN(r->base + r->size); - if (final_start >= final_end) - continue; - if (final_start >= limit || final_end <= addr) - continue; - - nr_range = add_range(range, count, nr_range, final_start, final_end); - } - subtract_range(range, count, 0, addr); - subtract_range(range, count, limit, -1ULL); - - /* Subtract memblock.reserved.region in range ? */ - if (!get_free) - goto sort_and_count_them; - for_each_memblock(reserved, r) { - final_start = PFN_DOWN(r->base); - final_end = PFN_UP(r->base + r->size); - if (final_start >= final_end) - continue; - if (final_start >= limit || final_end <= addr) - continue; - - subtract_range(range, count, final_start, final_end); - } - -sort_and_count_them: - nr_range = clean_sort_range(range, count); - - free_size = 0; - for (i = 0; i < nr_range; i++) - free_size += range[i].end - range[i].start; - - return free_size << PAGE_SHIFT; -} - -u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit) -{ - return __memblock_x86_memory_in_range(addr, limit, true); -} - -u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit) -{ - return __memblock_x86_memory_in_range(addr, limit, false); -} - -void __init memblock_x86_reserve_range(u64 start, u64 end, char *name) -{ - if (start == end) - return; - - if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end)) - return; - - memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name); - - memblock_reserve(start, end - start); -} - -void __init memblock_x86_free_range(u64 start, u64 end) -{ - if (start == end) - return; - - if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end)) - return; - - memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1); - - memblock_free(start, end - start); -} - -/* - * Need to call this function after memblock_x86_register_active_regions, - * so early_node_map[] is filled already. - */ -u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align) -{ - u64 addr; - addr = find_memory_core_early(nid, size, align, start, end); - if (addr != MEMBLOCK_ERROR) - return addr; - - /* Fallback, should already have start end within node range */ - return memblock_find_in_range(start, end, size, align); -} - -/* - * Finds an active region in the address range from start_pfn to last_pfn and - * returns its range in ei_startpfn and ei_endpfn for the memblock entry. - */ -static int __init memblock_x86_find_active_region(const struct memblock_region *ei, - unsigned long start_pfn, - unsigned long last_pfn, - unsigned long *ei_startpfn, - unsigned long *ei_endpfn) -{ - u64 align = PAGE_SIZE; - - *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT; - *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT; - - /* Skip map entries smaller than a page */ - if (*ei_startpfn >= *ei_endpfn) - return 0; - - /* Skip if map is outside the node */ - if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn) - return 0; - - /* Check for overlaps */ - if (*ei_startpfn < start_pfn) - *ei_startpfn = start_pfn; - if (*ei_endpfn > last_pfn) - *ei_endpfn = last_pfn; - - return 1; -} - -/* Walk the memblock.memory map and register active regions within a node */ -void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn, - unsigned long last_pfn) -{ - unsigned long ei_startpfn; - unsigned long ei_endpfn; - struct memblock_region *r; - - for_each_memblock(memory, r) - if (memblock_x86_find_active_region(r, start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - add_active_range(nid, ei_startpfn, ei_endpfn); -} - -/* - * Find the hole size (in bytes) in the memory range. - * @start: starting address of the memory range to scan - * @end: ending address of the memory range to scan - */ -u64 __init memblock_x86_hole_size(u64 start, u64 end) -{ - unsigned long start_pfn = start >> PAGE_SHIFT; - unsigned long last_pfn = end >> PAGE_SHIFT; - unsigned long ei_startpfn, ei_endpfn, ram = 0; - struct memblock_region *r; - - for_each_memblock(memory, r) - if (memblock_x86_find_active_region(r, start_pfn, last_pfn, - &ei_startpfn, &ei_endpfn)) - ram += ei_endpfn - ei_startpfn; - - return end - start - ((u64)ram << PAGE_SHIFT); -} diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c index 92faf3a..c80b9fb 100644 --- a/arch/x86/mm/memtest.c +++ b/arch/x86/mm/memtest.c @@ -34,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad) (unsigned long long) pattern, (unsigned long long) start_bad, (unsigned long long) end_bad); - memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM"); + memblock_reserve(start_bad, end_bad - start_bad); } static void __init memtest(u64 pattern, u64 start_phys, u64 size) @@ -70,24 +70,19 @@ static void __init memtest(u64 pattern, u64 start_phys, u64 size) static void __init do_one_pass(u64 pattern, u64 start, u64 end) { - u64 size = 0; - - while (start < end) { - start = memblock_x86_find_in_range_size(start, &size, 1); - - /* done ? */ - if (start >= end) - break; - if (start + size > end) - size = end - start; - - printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", - (unsigned long long) start, - (unsigned long long) start + size, - (unsigned long long) cpu_to_be64(pattern)); - memtest(pattern, start, size); - - start += size; + u64 i; + phys_addr_t this_start, this_end; + + for_each_free_mem_range(i, MAX_NUMNODES, &this_start, &this_end, NULL) { + this_start = clamp_t(phys_addr_t, this_start, start, end); + this_end = clamp_t(phys_addr_t, this_end, start, end); + if (this_start < this_end) { + printk(KERN_INFO " %010llx - %010llx pattern %016llx\n", + (unsigned long long)this_start, + (unsigned long long)this_end, + (unsigned long long)cpu_to_be64(pattern)); + memtest(pattern, this_start, this_end - this_start); + } } } diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index fbeaaf4..496f494 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -192,8 +192,6 @@ int __init numa_add_memblk(int nid, u64 start, u64 end) /* Initialize NODE_DATA for a node on the local memory */ static void __init setup_node_data(int nid, u64 start, u64 end) { - const u64 nd_low = PFN_PHYS(MAX_DMA_PFN); - const u64 nd_high = PFN_PHYS(max_pfn_mapped); const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); bool remapped = false; u64 nd_pa; @@ -224,17 +222,12 @@ static void __init setup_node_data(int nid, u64 start, u64 end) nd_pa = __pa(nd); remapped = true; } else { - nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) - nd_pa = memblock_find_in_range(nd_low, nd_high, - nd_size, SMP_CACHE_BYTES); - if (nd_pa == MEMBLOCK_ERROR) { + nd_pa = memblock_alloc_nid(nd_size, SMP_CACHE_BYTES, nid); + if (!nd_pa) { pr_err("Cannot find %zu bytes in node %d\n", nd_size, nid); return; } - memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA"); nd = __va(nd_pa); } @@ -371,8 +364,7 @@ void __init numa_reset_distance(void) /* numa_distance could be 1LU marking allocation failure, test cnt */ if (numa_distance_cnt) - memblock_x86_free_range(__pa(numa_distance), - __pa(numa_distance) + size); + memblock_free(__pa(numa_distance), size); numa_distance_cnt = 0; numa_distance = NULL; /* enable table creation */ } @@ -395,13 +387,13 @@ static int __init numa_alloc_distance(void) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { + if (!phys) { pr_warning("NUMA: Warning: can't allocate distance table!\n"); /* don't retry until explicitly reset */ numa_distance = (void *)1LU; return -ENOMEM; } - memblock_x86_reserve_range(phys, phys + size, "NUMA DIST"); + memblock_reserve(phys, size); numa_distance = __va(phys); numa_distance_cnt = cnt; @@ -482,8 +474,8 @@ static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi) numaram = 0; } - e820ram = max_pfn - (memblock_x86_hole_size(0, - PFN_PHYS(max_pfn)) >> PAGE_SHIFT); + e820ram = max_pfn - absent_pages_in_range(0, max_pfn); + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) { printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n", @@ -505,13 +497,10 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) if (WARN_ON(nodes_empty(node_possible_map))) return -EINVAL; - for (i = 0; i < mi->nr_blks; i++) - memblock_x86_register_active_regions(mi->blk[i].nid, - mi->blk[i].start >> PAGE_SHIFT, - mi->blk[i].end >> PAGE_SHIFT); - - /* for out of order entries */ - sort_node_map(); + for (i = 0; i < mi->nr_blks; i++) { + struct numa_memblk *mb = &mi->blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, mb->nid); + } /* * If sections array is gonna be used for pfn -> nid mapping, check @@ -545,6 +534,8 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) setup_node_data(nid, start, end); } + /* Dump memblock with node info and return. */ + memblock_dump_all(); return 0; } @@ -582,7 +573,7 @@ static int __init numa_init(int (*init_func)(void)) nodes_clear(node_possible_map); nodes_clear(node_online_map); memset(&numa_meminfo, 0, sizeof(numa_meminfo)); - remove_all_active_ranges(); + WARN_ON(memblock_set_node(0, ULLONG_MAX, MAX_NUMNODES)); numa_reset_distance(); ret = init_func(); diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 3adebe7..534255a 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -199,23 +199,23 @@ void __init init_alloc_remap(int nid, u64 start, u64 end) /* allocate node memory and the lowmem remap area */ node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES); - if (node_pa == MEMBLOCK_ERROR) { + if (!node_pa) { pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n", size, nid); return; } - memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM"); + memblock_reserve(node_pa, size); remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT, max_low_pfn << PAGE_SHIFT, size, LARGE_PAGE_BYTES); - if (remap_pa == MEMBLOCK_ERROR) { + if (!remap_pa) { pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n", size, nid); - memblock_x86_free_range(node_pa, node_pa + size); + memblock_free(node_pa, size); return; } - memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG"); + memblock_reserve(remap_pa, size); remap_va = phys_to_virt(remap_pa); /* perform actual remap */ diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index dd27f40..92e2711 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -19,7 +19,7 @@ unsigned long __init numa_free_all_bootmem(void) for_each_online_node(i) pages += free_all_bootmem_node(NODE_DATA(i)); - pages += free_all_memory_core_early(MAX_NUMNODES); + pages += free_low_memory_core_early(MAX_NUMNODES); return pages; } diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c index d0ed086..46db568 100644 --- a/arch/x86/mm/numa_emulation.c +++ b/arch/x86/mm/numa_emulation.c @@ -28,6 +28,16 @@ static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi) return -ENOENT; } +static u64 mem_hole_size(u64 start, u64 end) +{ + unsigned long start_pfn = PFN_UP(start); + unsigned long end_pfn = PFN_DOWN(end); + + if (start_pfn < end_pfn) + return PFN_PHYS(absent_pages_in_range(start_pfn, end_pfn)); + return 0; +} + /* * Sets up nid to range from @start to @end. The return value is -errno if * something went wrong, 0 otherwise. @@ -89,7 +99,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Calculate target node size. x86_32 freaks on __udivdi3() so do * the division in ulong number of pages and convert back. */ - size = max_addr - addr - memblock_x86_hole_size(addr, max_addr); + size = max_addr - addr - mem_hole_size(addr, max_addr); size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes); /* @@ -135,8 +145,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * Continue to add memory to this fake node if its * non-reserved memory is less than the per-node size. */ - while (end - start - - memblock_x86_hole_size(start, end) < size) { + while (end - start - mem_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > limit) { end = limit; @@ -150,7 +159,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -158,8 +167,7 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (limit - end - - memblock_x86_hole_size(end, limit) < size) + if (limit - end - mem_hole_size(end, limit) < size) end = limit; ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes, @@ -180,7 +188,7 @@ static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size) { u64 end = start + size; - while (end - start - memblock_x86_hole_size(start, end) < size) { + while (end - start - mem_hole_size(start, end) < size) { end += FAKE_NODE_MIN_SIZE; if (end > max_addr) { end = max_addr; @@ -211,8 +219,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * creates a uniform distribution of node sizes across the entire * machine (but not necessarily over physical nodes). */ - min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / - MAX_NUMNODES; + min_size = (max_addr - addr - mem_hole_size(addr, max_addr)) / MAX_NUMNODES; min_size = max(min_size, FAKE_NODE_MIN_SIZE); if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size) min_size = (min_size + FAKE_NODE_MIN_SIZE) & @@ -252,7 +259,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * this one must extend to the boundary. */ if (end < dma32_end && dma32_end - end - - memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) + mem_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE) end = dma32_end; /* @@ -260,8 +267,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei, * next node, this one must extend to the end of the * physical node. */ - if (limit - end - - memblock_x86_hole_size(end, limit) < size) + if (limit - end - mem_hole_size(end, limit) < size) end = limit; ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES, @@ -351,11 +357,11 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped), phys_size, PAGE_SIZE); - if (phys == MEMBLOCK_ERROR) { + if (!phys) { pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); goto no_emu; } - memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST"); + memblock_reserve(phys, phys_size); phys_dist = __va(phys); for (i = 0; i < numa_dist_cnt; i++) @@ -424,7 +430,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt) /* free the copied physical distance table */ if (phys_dist) - memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size); + memblock_free(__pa(phys_dist), phys_size); return; no_emu: diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index f9e5267..eda2acb 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -998,7 +998,7 @@ out_err: } EXPORT_SYMBOL(set_memory_uc); -int _set_memory_array(unsigned long *addr, int addrinarray, +static int _set_memory_array(unsigned long *addr, int addrinarray, unsigned long new_type) { int i, j; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 81dbfde..fd61b3f 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -69,6 +69,12 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0) return; pxm = pa->proximity_domain; + apic_id = pa->apic_id; + if (!cpu_has_x2apic && (apic_id >= 0xff)) { + printk(KERN_INFO "SRAT: PXM %u -> X2APIC 0x%04x ignored\n", + pxm, apic_id); + return; + } node = setup_node(pxm); if (node < 0) { printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm); @@ -76,7 +82,6 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa) return; } - apic_id = pa->apic_id; if (apic_id >= MAX_LOCAL_APIC) { printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node); return; diff --git a/arch/x86/oprofile/Makefile b/arch/x86/oprofile/Makefile index 446902b..1599f56 100644 --- a/arch/x86/oprofile/Makefile +++ b/arch/x86/oprofile/Makefile @@ -4,9 +4,8 @@ DRIVER_OBJS = $(addprefix ../../../drivers/oprofile/, \ oprof.o cpu_buffer.o buffer_sync.o \ event_buffer.o oprofile_files.o \ oprofilefs.o oprofile_stats.o \ - timer_int.o ) + timer_int.o nmi_timer_int.o ) oprofile-y := $(DRIVER_OBJS) init.o backtrace.o oprofile-$(CONFIG_X86_LOCAL_APIC) += nmi_int.o op_model_amd.o \ op_model_ppro.o op_model_p4.o -oprofile-$(CONFIG_X86_IO_APIC) += nmi_timer_int.o diff --git a/arch/x86/oprofile/init.c b/arch/x86/oprofile/init.c index f148cf6..9e138d0 100644 --- a/arch/x86/oprofile/init.c +++ b/arch/x86/oprofile/init.c @@ -16,37 +16,23 @@ * with the NMI mode driver. */ +#ifdef CONFIG_X86_LOCAL_APIC extern int op_nmi_init(struct oprofile_operations *ops); -extern int op_nmi_timer_init(struct oprofile_operations *ops); extern void op_nmi_exit(void); -extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); +#else +static int op_nmi_init(struct oprofile_operations *ops) { return -ENODEV; } +static void op_nmi_exit(void) { } +#endif -static int nmi_timer; +extern void x86_backtrace(struct pt_regs * const regs, unsigned int depth); int __init oprofile_arch_init(struct oprofile_operations *ops) { - int ret; - - ret = -ENODEV; - -#ifdef CONFIG_X86_LOCAL_APIC - ret = op_nmi_init(ops); -#endif - nmi_timer = (ret != 0); -#ifdef CONFIG_X86_IO_APIC - if (nmi_timer) - ret = op_nmi_timer_init(ops); -#endif ops->backtrace = x86_backtrace; - - return ret; + return op_nmi_init(ops); } - void oprofile_arch_exit(void) { -#ifdef CONFIG_X86_LOCAL_APIC - if (!nmi_timer) - op_nmi_exit(); -#endif + op_nmi_exit(); } diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 75f9528..26b8a85 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -595,24 +595,36 @@ static int __init p4_init(char **cpu_type) return 0; } -static int force_arch_perfmon; -static int force_cpu_type(const char *str, struct kernel_param *kp) +enum __force_cpu_type { + reserved = 0, /* do not force */ + timer, + arch_perfmon, +}; + +static int force_cpu_type; + +static int set_cpu_type(const char *str, struct kernel_param *kp) { - if (!strcmp(str, "arch_perfmon")) { - force_arch_perfmon = 1; + if (!strcmp(str, "timer")) { + force_cpu_type = timer; + printk(KERN_INFO "oprofile: forcing NMI timer mode\n"); + } else if (!strcmp(str, "arch_perfmon")) { + force_cpu_type = arch_perfmon; printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); + } else { + force_cpu_type = 0; } return 0; } -module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); +module_param_call(cpu_type, set_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; struct op_x86_model_spec *spec = &op_ppro_spec; /* default */ - if (force_arch_perfmon && cpu_has_arch_perfmon) + if (force_cpu_type == arch_perfmon && cpu_has_arch_perfmon) return 0; /* @@ -679,6 +691,9 @@ int __init op_nmi_init(struct oprofile_operations *ops) if (!cpu_has_apic) return -ENODEV; + if (force_cpu_type == timer) + return -ENODEV; + switch (vendor) { case X86_VENDOR_AMD: /* Needs to be at least an Athlon (or hammer in 32bit mode) */ diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c deleted file mode 100644 index 7f8052c..0000000 --- a/arch/x86/oprofile/nmi_timer_int.c +++ /dev/null @@ -1,50 +0,0 @@ -/** - * @file nmi_timer_int.c - * - * @remark Copyright 2003 OProfile authors - * @remark Read the file COPYING - * - * @author Zwane Mwaikambo <zwane@linuxpower.ca> - */ - -#include <linux/init.h> -#include <linux/smp.h> -#include <linux/errno.h> -#include <linux/oprofile.h> -#include <linux/rcupdate.h> -#include <linux/kdebug.h> - -#include <asm/nmi.h> -#include <asm/apic.h> -#include <asm/ptrace.h> - -static int profile_timer_exceptions_notify(unsigned int val, struct pt_regs *regs) -{ - oprofile_add_sample(regs, 0); - return NMI_HANDLED; -} - -static int timer_start(void) -{ - if (register_nmi_handler(NMI_LOCAL, profile_timer_exceptions_notify, - 0, "oprofile-timer")) - return 1; - return 0; -} - - -static void timer_stop(void) -{ - unregister_nmi_handler(NMI_LOCAL, "oprofile-timer"); - synchronize_sched(); /* Allow already-started NMIs to complete. */ -} - - -int __init op_nmi_timer_init(struct oprofile_operations *ops) -{ - ops->start = timer_start; - ops->stop = timer_stop; - ops->cpu_type = "timer"; - printk(KERN_INFO "oprofile: using NMI timer interrupt.\n"); - return 0; -} diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 37718f0..4cf9bd0 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -238,7 +238,8 @@ static efi_status_t __init phys_efi_get_time(efi_time_t *tm, spin_lock_irqsave(&rtc_lock, flags); efi_call_phys_prelog(); - status = efi_call_phys2(efi_phys.get_time, tm, tc); + status = efi_call_phys2(efi_phys.get_time, virt_to_phys(tm), + virt_to_phys(tc)); efi_call_phys_epilog(); spin_unlock_irqrestore(&rtc_lock, flags); return status; @@ -352,8 +353,7 @@ void __init efi_memblock_x86_reserve_range(void) boot_params.efi_info.efi_memdesc_size; memmap.desc_version = boot_params.efi_info.efi_memdesc_version; memmap.desc_size = boot_params.efi_info.efi_memdesc_size; - memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size, - "EFI memmap"); + memblock_reserve(pmap, memmap.nr_map * memmap.desc_size); } #if EFI_DEBUG @@ -397,16 +397,14 @@ void __init efi_reserve_boot_services(void) if ((start+size >= virt_to_phys(_text) && start <= virt_to_phys(_end)) || !e820_all_mapped(start, start+size, E820_RAM) || - memblock_x86_check_reserved_size(&start, &size, - 1<<EFI_PAGE_SHIFT)) { + memblock_is_region_reserved(start, size)) { /* Could not reserve, skip it */ md->num_pages = 0; memblock_dbg(PFX "Could not reserve boot range " "[0x%010llx-0x%010llx]\n", start, start+size-1); } else - memblock_x86_reserve_range(start, start+size, - "EFI Boot"); + memblock_reserve(start, size); } } diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile index f820826..d511aa9 100644 --- a/arch/x86/tools/Makefile +++ b/arch/x86/tools/Makefile @@ -18,14 +18,21 @@ chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk quiet_cmd_posttest = TEST $@ cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose) -posttest: $(obj)/test_get_len vmlinux +quiet_cmd_sanitytest = TEST $@ + cmd_sanitytest = $(obj)/insn_sanity $(posttest_64bit) -m 1000000 + +posttest: $(obj)/test_get_len vmlinux $(obj)/insn_sanity $(call cmd,posttest) + $(call cmd,sanitytest) -hostprogs-y := test_get_len +hostprogs-y += test_get_len insn_sanity # -I needed for generated C source and C source which in the kernel tree. HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ +HOSTCFLAGS_insn_sanity.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/ + # Dependencies are also needed. $(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c +$(obj)/insn_sanity.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk index eaf11f5..5f6a5b6 100644 --- a/arch/x86/tools/gen-insn-attr-x86.awk +++ b/arch/x86/tools/gen-insn-attr-x86.awk @@ -47,7 +47,7 @@ BEGIN { sep_expr = "^\\|$" group_expr = "^Grp[0-9A-Za-z]+" - imm_expr = "^[IJAO][a-z]" + imm_expr = "^[IJAOL][a-z]" imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)" @@ -59,6 +59,7 @@ BEGIN { imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)" imm_flag["Ob"] = "INAT_MOFFSET" imm_flag["Ov"] = "INAT_MOFFSET" + imm_flag["Lx"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)" modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])" force64_expr = "\\([df]64\\)" @@ -70,8 +71,12 @@ BEGIN { lprefix3_expr = "\\(F2\\)" max_lprefix = 4 - vexok_expr = "\\(VEX\\)" - vexonly_expr = "\\(oVEX\\)" + # All opcodes starting with lower-case 'v' or with (v1) superscript + # accepts VEX prefix + vexok_opcode_expr = "^v.*" + vexok_expr = "\\(v1\\)" + # All opcodes with (v) superscript supports *only* VEX prefix + vexonly_expr = "\\(v\\)" prefix_expr = "\\(Prefix\\)" prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ" @@ -85,8 +90,8 @@ BEGIN { prefix_num["SEG=GS"] = "INAT_PFX_GS" prefix_num["SEG=SS"] = "INAT_PFX_SS" prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ" - prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2" - prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3" + prefix_num["VEX+1byte"] = "INAT_PFX_VEX2" + prefix_num["VEX+2byte"] = "INAT_PFX_VEX3" clear_vars() } @@ -310,12 +315,10 @@ function convert_operands(count,opnd, i,j,imm,mod) if (match(opcode, fpu_expr)) flags = add_flags(flags, "INAT_MODRM") - # check VEX only code + # check VEX codes if (match(ext, vexonly_expr)) flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY") - - # check VEX only code - if (match(ext, vexok_expr)) + else if (match(ext, vexok_expr) || match(opcode, vexok_opcode_expr)) flags = add_flags(flags, "INAT_VEXOK") # check prefixes diff --git a/arch/x86/tools/insn_sanity.c b/arch/x86/tools/insn_sanity.c new file mode 100644 index 0000000..cc2f8c1 --- /dev/null +++ b/arch/x86/tools/insn_sanity.c @@ -0,0 +1,275 @@ +/* + * x86 decoder sanity test - based on test_get_insn.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright (C) IBM Corporation, 2009 + * Copyright (C) Hitachi, Ltd., 2011 + */ + +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <assert.h> +#include <unistd.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> + +#define unlikely(cond) (cond) +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) + +#include <asm/insn.h> +#include <inat.c> +#include <insn.c> + +/* + * Test of instruction analysis against tampering. + * Feed random binary to instruction decoder and ensure not to + * access out-of-instruction-buffer. + */ + +#define DEFAULT_MAX_ITER 10000 +#define INSN_NOP 0x90 + +static const char *prog; /* Program name */ +static int verbose; /* Verbosity */ +static int x86_64; /* x86-64 bit mode flag */ +static unsigned int seed; /* Random seed */ +static unsigned long iter_start; /* Start of iteration number */ +static unsigned long iter_end = DEFAULT_MAX_ITER; /* End of iteration number */ +static FILE *input_file; /* Input file name */ + +static void usage(const char *err) +{ + if (err) + fprintf(stderr, "Error: %s\n\n", err); + fprintf(stderr, "Usage: %s [-y|-n|-v] [-s seed[,no]] [-m max] [-i input]\n", prog); + fprintf(stderr, "\t-y 64bit mode\n"); + fprintf(stderr, "\t-n 32bit mode\n"); + fprintf(stderr, "\t-v Verbosity(-vv dumps any decoded result)\n"); + fprintf(stderr, "\t-s Give a random seed (and iteration number)\n"); + fprintf(stderr, "\t-m Give a maximum iteration number\n"); + fprintf(stderr, "\t-i Give an input file with decoded binary\n"); + exit(1); +} + +static void dump_field(FILE *fp, const char *name, const char *indent, + struct insn_field *field) +{ + fprintf(fp, "%s.%s = {\n", indent, name); + fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n", + indent, field->value, field->bytes[0], field->bytes[1], + field->bytes[2], field->bytes[3]); + fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent, + field->got, field->nbytes); +} + +static void dump_insn(FILE *fp, struct insn *insn) +{ + fprintf(fp, "Instruction = {\n"); + dump_field(fp, "prefixes", "\t", &insn->prefixes); + dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix); + dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix); + dump_field(fp, "opcode", "\t", &insn->opcode); + dump_field(fp, "modrm", "\t", &insn->modrm); + dump_field(fp, "sib", "\t", &insn->sib); + dump_field(fp, "displacement", "\t", &insn->displacement); + dump_field(fp, "immediate1", "\t", &insn->immediate1); + dump_field(fp, "immediate2", "\t", &insn->immediate2); + fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n", + insn->attr, insn->opnd_bytes, insn->addr_bytes); + fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n", + insn->length, insn->x86_64, insn->kaddr); +} + +static void dump_stream(FILE *fp, const char *msg, unsigned long nr_iter, + unsigned char *insn_buf, struct insn *insn) +{ + int i; + + fprintf(fp, "%s:\n", msg); + + dump_insn(fp, insn); + + fprintf(fp, "You can reproduce this with below command(s);\n"); + + /* Input a decoded instruction sequence directly */ + fprintf(fp, " $ echo "); + for (i = 0; i < MAX_INSN_SIZE; i++) + fprintf(fp, " %02x", insn_buf[i]); + fprintf(fp, " | %s -i -\n", prog); + + if (!input_file) { + fprintf(fp, "Or \n"); + /* Give a seed and iteration number */ + fprintf(fp, " $ %s -s 0x%x,%lu\n", prog, seed, nr_iter); + } +} + +static void init_random_seed(void) +{ + int fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) + goto fail; + + if (read(fd, &seed, sizeof(seed)) != sizeof(seed)) + goto fail; + + close(fd); + return; +fail: + usage("Failed to open /dev/urandom"); +} + +/* Read given instruction sequence from the input file */ +static int read_next_insn(unsigned char *insn_buf) +{ + char buf[256] = "", *tmp; + int i; + + tmp = fgets(buf, ARRAY_SIZE(buf), input_file); + if (tmp == NULL || feof(input_file)) + return 0; + + for (i = 0; i < MAX_INSN_SIZE; i++) { + insn_buf[i] = (unsigned char)strtoul(tmp, &tmp, 16); + if (*tmp != ' ') + break; + } + + return i; +} + +static int generate_insn(unsigned char *insn_buf) +{ + int i; + + if (input_file) + return read_next_insn(insn_buf); + + /* Fills buffer with random binary up to MAX_INSN_SIZE */ + for (i = 0; i < MAX_INSN_SIZE - 1; i += 2) + *(unsigned short *)(&insn_buf[i]) = random() & 0xffff; + + while (i < MAX_INSN_SIZE) + insn_buf[i++] = random() & 0xff; + + return i; +} + +static void parse_args(int argc, char **argv) +{ + int c; + char *tmp = NULL; + int set_seed = 0; + + prog = argv[0]; + while ((c = getopt(argc, argv, "ynvs:m:i:")) != -1) { + switch (c) { + case 'y': + x86_64 = 1; + break; + case 'n': + x86_64 = 0; + break; + case 'v': + verbose++; + break; + case 'i': + if (strcmp("-", optarg) == 0) + input_file = stdin; + else + input_file = fopen(optarg, "r"); + if (!input_file) + usage("Failed to open input file"); + break; + case 's': + seed = (unsigned int)strtoul(optarg, &tmp, 0); + if (*tmp == ',') { + optarg = tmp + 1; + iter_start = strtoul(optarg, &tmp, 0); + } + if (*tmp != '\0' || tmp == optarg) + usage("Failed to parse seed"); + set_seed = 1; + break; + case 'm': + iter_end = strtoul(optarg, &tmp, 0); + if (*tmp != '\0' || tmp == optarg) + usage("Failed to parse max_iter"); + break; + default: + usage(NULL); + } + } + + /* Check errors */ + if (iter_end < iter_start) + usage("Max iteration number must be bigger than iter-num"); + + if (set_seed && input_file) + usage("Don't use input file (-i) with random seed (-s)"); + + /* Initialize random seed */ + if (!input_file) { + if (!set_seed) /* No seed is given */ + init_random_seed(); + srand(seed); + } +} + +int main(int argc, char **argv) +{ + struct insn insn; + int insns = 0; + int errors = 0; + unsigned long i; + unsigned char insn_buf[MAX_INSN_SIZE * 2]; + + parse_args(argc, argv); + + /* Prepare stop bytes with NOPs */ + memset(insn_buf + MAX_INSN_SIZE, INSN_NOP, MAX_INSN_SIZE); + + for (i = 0; i < iter_end; i++) { + if (generate_insn(insn_buf) <= 0) + break; + + if (i < iter_start) /* Skip to given iteration number */ + continue; + + /* Decode an instruction */ + insn_init(&insn, insn_buf, x86_64); + insn_get_length(&insn); + + if (insn.next_byte <= insn.kaddr || + insn.kaddr + MAX_INSN_SIZE < insn.next_byte) { + /* Access out-of-range memory */ + dump_stream(stderr, "Error: Found an access violation", i, insn_buf, &insn); + errors++; + } else if (verbose && !insn_complete(&insn)) + dump_stream(stdout, "Info: Found an undecodable input", i, insn_buf, &insn); + else if (verbose >= 2) + dump_insn(stdout, &insn); + insns++; + } + + fprintf(stdout, "%s: decoded and checked %d %s instructions with %d errors (seed:0x%x)\n", (errors) ? "Failure" : "Success", insns, (input_file) ? "given" : "random", errors, seed); + + return errors ? 1 : 0; +} diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 1f92865..12eb07b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1215,8 +1215,6 @@ asmlinkage void __init xen_start_kernel(void) local_irq_disable(); early_boot_irqs_disabled = true; - memblock_init(); - xen_raw_console_write("mapping kernel into physical memory\n"); pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); xen_ident_map_ISA(); diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 87f6673..f4bf8aa 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1774,10 +1774,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, __xen_write_cr3(true, __pa(pgd)); xen_mc_issue(PARAVIRT_LAZY_CPU); - memblock_x86_reserve_range(__pa(xen_start_info->pt_base), - __pa(xen_start_info->pt_base + - xen_start_info->nr_pt_frames * PAGE_SIZE), - "XEN PAGETABLES"); + memblock_reserve(__pa(xen_start_info->pt_base), + xen_start_info->nr_pt_frames * PAGE_SIZE); return pgd; } @@ -1853,10 +1851,8 @@ pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd, PFN_DOWN(__pa(initial_page_table))); xen_write_cr3(__pa(initial_page_table)); - memblock_x86_reserve_range(__pa(xen_start_info->pt_base), - __pa(xen_start_info->pt_base + - xen_start_info->nr_pt_frames * PAGE_SIZE), - "XEN PAGETABLES"); + memblock_reserve(__pa(xen_start_info->pt_base), + xen_start_info->nr_pt_frames * PAGE_SIZE)); return initial_page_table; } diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index b2c7179..e03c636 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -75,7 +75,7 @@ static void __init xen_add_extra_mem(u64 start, u64 size) if (i == XEN_EXTRA_MEM_MAX_REGIONS) printk(KERN_WARNING "Warning: not enough extra memory regions\n"); - memblock_x86_reserve_range(start, start + size, "XEN EXTRA"); + memblock_reserve(start, size); xen_max_p2m_pfn = PFN_DOWN(start + size); @@ -311,9 +311,8 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in <xen/interface/xen.h> */ - memblock_x86_reserve_range(__pa(xen_start_info->mfn_list), - __pa(xen_start_info->pt_base), - "XEN START INFO"); + memblock_reserve(__pa(xen_start_info->mfn_list), + xen_start_info->pt_base - xen_start_info->mfn_list); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); |