diff options
Diffstat (limited to 'arch/x86')
112 files changed, 1873 insertions, 1066 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9019523..3ad653d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2125,6 +2125,13 @@ config NET5501 ---help--- This option enables system support for the Soekris Engineering net5501. +config GEOS + bool "Traverse Technologies GEOS System Support (LEDS, GPIO, etc)" + select GPIOLIB + depends on DMI + ---help--- + This option enables system support for the Traverse Technologies GEOS. + endif # X86_32 config AMD_NB diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um index 36ddec6..4be406a 100644 --- a/arch/x86/Makefile.um +++ b/arch/x86/Makefile.um @@ -8,15 +8,11 @@ ELF_ARCH := i386 ELF_FORMAT := elf32-i386 CHECKFLAGS += -D__i386__ -ifeq ("$(origin SUBARCH)", "command line") -ifneq ("$(shell uname -m | sed -e s/i.86/i386/)", "$(SUBARCH)") KBUILD_CFLAGS += $(call cc-option,-m32) KBUILD_AFLAGS += $(call cc-option,-m32) LINK-y += $(call cc-option,-m32) export LDFLAGS -endif -endif # First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. include $(srctree)/arch/x86/Makefile_32.cpu diff --git a/arch/x86/crypto/camellia_glue.c b/arch/x86/crypto/camellia_glue.c index 1ca36a9..3306dc0 100644 --- a/arch/x86/crypto/camellia_glue.c +++ b/arch/x86/crypto/camellia_glue.c @@ -1925,7 +1925,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -int __init init(void) +static int __init init(void) { if (!force && is_blacklisted_cpu()) { printk(KERN_INFO @@ -1938,7 +1938,7 @@ int __init init(void) return crypto_register_algs(camellia_algs, ARRAY_SIZE(camellia_algs)); } -void __exit fini(void) +static void __exit fini(void) { crypto_unregister_algs(camellia_algs, ARRAY_SIZE(camellia_algs)); } diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c index 408fc0c..922ab24 100644 --- a/arch/x86/crypto/twofish_glue_3way.c +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -668,7 +668,7 @@ static int force; module_param(force, int, 0); MODULE_PARM_DESC(force, "Force module load, ignore CPU blacklist"); -int __init init(void) +static int __init init(void) { if (!force && is_blacklisted_cpu()) { printk(KERN_INFO @@ -681,7 +681,7 @@ int __init init(void) return crypto_register_algs(tf_algs, ARRAY_SIZE(tf_algs)); } -void __exit fini(void) +static void __exit fini(void) { crypto_unregister_algs(tf_algs, ARRAY_SIZE(tf_algs)); } diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c index 4c2e59a..d511d95 100644 --- a/arch/x86/ia32/ia32_aout.c +++ b/arch/x86/ia32/ia32_aout.c @@ -26,7 +26,6 @@ #include <linux/init.h> #include <linux/jiffies.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/pgalloc.h> #include <asm/cacheflush.h> diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index d3eaac4..d854101 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -11,7 +11,6 @@ #include <linux/atomic.h> #include <asm/fixmap.h> #include <asm/mpspec.h> -#include <asm/system.h> #include <asm/msr.h> #define ARCH_APICTIMER_STOPS_ON_C3 1 diff --git a/arch/x86/include/asm/auxvec.h b/arch/x86/include/asm/auxvec.h index 1316b4c..77203ac 100644 --- a/arch/x86/include/asm/auxvec.h +++ b/arch/x86/include/asm/auxvec.h @@ -9,4 +9,11 @@ #endif #define AT_SYSINFO_EHDR 33 +/* entries in ARCH_DLINFO: */ +#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) +# define AT_VECTOR_SIZE_ARCH 2 +#else /* else it's non-compat x86-64 */ +# define AT_VECTOR_SIZE_ARCH 1 +#endif + #endif /* _ASM_X86_AUXVEC_H */ diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h new file mode 100644 index 0000000..c6cd358 --- /dev/null +++ b/arch/x86/include/asm/barrier.h @@ -0,0 +1,116 @@ +#ifndef _ASM_X86_BARRIER_H +#define _ASM_X86_BARRIER_H + +#include <asm/alternative.h> +#include <asm/nops.h> + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ + +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * <programlisting> + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * </programlisting> + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * <programlisting> + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * </programlisting> + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static __always_inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +#endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h index f654d1b..11e1152 100644 --- a/arch/x86/include/asm/bug.h +++ b/arch/x86/include/asm/bug.h @@ -36,4 +36,8 @@ do { \ #endif /* !CONFIG_BUG */ #include <asm-generic/bug.h> + + +extern void show_regs_common(void); + #endif /* _ASM_X86_BUG_H */ diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 4e12668..9863ee3 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -3,6 +3,7 @@ /* Caches aren't brain-dead on the intel. */ #include <asm-generic/cacheflush.h> +#include <asm/special_insns.h> #ifdef CONFIG_X86_PAT /* diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index b903d5e..2d91580 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -78,8 +78,75 @@ */ #ifdef __KERNEL__ +#include <linux/bug.h> + DECLARE_PER_CPU(unsigned long, cpu_dr7); +#ifndef CONFIG_PARAVIRT +/* + * These special macros can be used to get or set a debugging register + */ +#define get_debugreg(var, register) \ + (var) = native_get_debugreg(register) +#define set_debugreg(value, register) \ + native_set_debugreg(register, value) +#endif + +static inline unsigned long native_get_debugreg(int regno) +{ + unsigned long val = 0; /* Damn you, gcc! */ + + switch (regno) { + case 0: + asm("mov %%db0, %0" :"=r" (val)); + break; + case 1: + asm("mov %%db1, %0" :"=r" (val)); + break; + case 2: + asm("mov %%db2, %0" :"=r" (val)); + break; + case 3: + asm("mov %%db3, %0" :"=r" (val)); + break; + case 6: + asm("mov %%db6, %0" :"=r" (val)); + break; + case 7: + asm("mov %%db7, %0" :"=r" (val)); + break; + default: + BUG(); + } + return val; +} + +static inline void native_set_debugreg(int regno, unsigned long value) +{ + switch (regno) { + case 0: + asm("mov %0, %%db0" ::"r" (value)); + break; + case 1: + asm("mov %0, %%db1" ::"r" (value)); + break; + case 2: + asm("mov %0, %%db2" ::"r" (value)); + break; + case 3: + asm("mov %0, %%db3" ::"r" (value)); + break; + case 6: + asm("mov %0, %%db6" ::"r" (value)); + break; + case 7: + asm("mov %0, %%db7" ::"r" (value)); + break; + default: + BUG(); + } +} + static inline void hw_breakpoint_disable(void) { /* Zero the control register for HW Breakpoint */ diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 5f962df..f27f79a 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -84,7 +84,6 @@ extern unsigned int vdso_enabled; (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) #include <asm/processor.h> -#include <asm/system.h> #ifdef CONFIG_X86_32 #include <asm/desc.h> diff --git a/arch/x86/include/asm/exec.h b/arch/x86/include/asm/exec.h new file mode 100644 index 0000000..54c2e1d --- /dev/null +++ b/arch/x86/include/asm/exec.h @@ -0,0 +1 @@ +/* define arch_align_stack() here */ diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h index d09bb03..71ecbcb 100644 --- a/arch/x86/include/asm/futex.h +++ b/arch/x86/include/asm/futex.h @@ -9,7 +9,6 @@ #include <asm/asm.h> #include <asm/errno.h> #include <asm/processor.h> -#include <asm/system.h> #define __futex_atomic_op1(insn, ret, oldval, uaddr, oparg) \ asm volatile("1:\t" insn "\n" \ diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 7ce0798..257d9cc 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -14,7 +14,6 @@ #include <linux/sched.h> #include <linux/hardirq.h> -#include <asm/system.h> struct pt_regs; struct user_i387_struct; diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 77e95f5..332f98c 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -64,11 +64,15 @@ enum regnames { GDB_PS, /* 17 */ GDB_CS, /* 18 */ GDB_SS, /* 19 */ + GDB_DS, /* 20 */ + GDB_ES, /* 21 */ + GDB_FS, /* 22 */ + GDB_GS, /* 23 */ }; #define GDB_ORIG_AX 57 -#define DBG_MAX_REG_NUM 20 -/* 17 64 bit regs and 3 32 bit regs */ -#define NUMREGBYTES ((17 * 8) + (3 * 4)) +#define DBG_MAX_REG_NUM 24 +/* 17 64 bit regs and 5 32 bit regs */ +#define NUMREGBYTES ((17 * 8) + (5 * 4)) #endif /* ! CONFIG_X86_32 */ static inline void arch_kgdb_breakpoint(void) diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h index 4d8dcbd..e7d1c19 100644 --- a/arch/x86/include/asm/kvm.h +++ b/arch/x86/include/asm/kvm.h @@ -321,4 +321,8 @@ struct kvm_xcrs { __u64 padding[16]; }; +/* definition of registers in kvm_run */ +struct kvm_sync_regs { +}; + #endif /* _ASM_X86_KVM_H */ diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 7b9cfc4..c222e1a 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -176,6 +176,7 @@ struct x86_emulate_ops { void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + void (*set_rflags)(struct x86_emulate_ctxt *ctxt, ulong val); int (*cpl)(struct x86_emulate_ctxt *ctxt); int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); @@ -388,7 +389,7 @@ bool x86_page_table_writing_insn(struct x86_emulate_ctxt *ctxt); #define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code); int emulate_int_real(struct x86_emulate_ctxt *ctxt, int irq); #endif /* _ASM_X86_KVM_X86_EMULATE_H */ diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 52d6640..e216ba0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -29,7 +29,7 @@ #include <asm/msr-index.h> #define KVM_MAX_VCPUS 254 -#define KVM_SOFT_MAX_VCPUS 64 +#define KVM_SOFT_MAX_VCPUS 160 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 @@ -181,13 +181,6 @@ struct kvm_mmu_memory_cache { void *objects[KVM_NR_MEM_OBJS]; }; -#define NR_PTE_CHAIN_ENTRIES 5 - -struct kvm_pte_chain { - u64 *parent_ptes[NR_PTE_CHAIN_ENTRIES]; - struct hlist_node link; -}; - /* * kvm_mmu_page_role, below, is defined as: * @@ -427,12 +420,16 @@ struct kvm_vcpu_arch { u64 last_guest_tsc; u64 last_kernel_ns; - u64 last_tsc_nsec; - u64 last_tsc_write; - u32 virtual_tsc_khz; + u64 last_host_tsc; + u64 tsc_offset_adjustment; + u64 this_tsc_nsec; + u64 this_tsc_write; + u8 this_tsc_generation; bool tsc_catchup; - u32 tsc_catchup_mult; - s8 tsc_catchup_shift; + bool tsc_always_catchup; + s8 virtual_tsc_shift; + u32 virtual_tsc_mult; + u32 virtual_tsc_khz; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -478,6 +475,21 @@ struct kvm_vcpu_arch { u32 id; bool send_user_only; } apf; + + /* OSVW MSRs (AMD only) */ + struct { + u64 length; + u64 status; + } osvw; +}; + +struct kvm_lpage_info { + unsigned long rmap_pde; + int write_count; +}; + +struct kvm_arch_memory_slot { + struct kvm_lpage_info *lpage_info[KVM_NR_PAGE_SIZES - 1]; }; struct kvm_arch { @@ -511,8 +523,12 @@ struct kvm_arch { s64 kvmclock_offset; raw_spinlock_t tsc_write_lock; u64 last_tsc_nsec; - u64 last_tsc_offset; u64 last_tsc_write; + u32 last_tsc_khz; + u64 cur_tsc_nsec; + u64 cur_tsc_write; + u64 cur_tsc_offset; + u8 cur_tsc_generation; struct kvm_xen_hvm_config xen_hvm_config; @@ -644,7 +660,7 @@ struct kvm_x86_ops { u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment); + void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -652,7 +668,7 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); @@ -674,6 +690,17 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); +} + int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -741,8 +768,8 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg); -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code); +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code); int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h index 9cdae5d..c8bed0d 100644 --- a/arch/x86/include/asm/local.h +++ b/arch/x86/include/asm/local.h @@ -3,7 +3,6 @@ #include <linux/percpu.h> -#include <asm/system.h> #include <linux/atomic.h> #include <asm/asm.h> diff --git a/arch/x86/include/asm/mc146818rtc.h b/arch/x86/include/asm/mc146818rtc.h index 0e8e85b..d354fb7 100644 --- a/arch/x86/include/asm/mc146818rtc.h +++ b/arch/x86/include/asm/mc146818rtc.h @@ -5,7 +5,6 @@ #define _ASM_X86_MC146818RTC_H #include <asm/io.h> -#include <asm/system.h> #include <asm/processor.h> #include <linux/mc146818rtc.h> diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index bce688d..e21fdd1 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -55,7 +55,6 @@ extern unsigned long init_memory_mapping(unsigned long start, unsigned long end); extern void initmem_init(void); -extern void free_initmem(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c0180fd..aa0f913 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -10,6 +10,7 @@ #include <asm/paravirt_types.h> #ifndef __ASSEMBLY__ +#include <linux/bug.h> #include <linux/types.h> #include <linux/cpumask.h> diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index e8fb2c7..2291895 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -23,6 +23,7 @@ #define ARCH_PERFMON_EVENTSEL_USR (1ULL << 16) #define ARCH_PERFMON_EVENTSEL_OS (1ULL << 17) #define ARCH_PERFMON_EVENTSEL_EDGE (1ULL << 18) +#define ARCH_PERFMON_EVENTSEL_PIN_CONTROL (1ULL << 19) #define ARCH_PERFMON_EVENTSEL_INT (1ULL << 20) #define ARCH_PERFMON_EVENTSEL_ANY (1ULL << 21) #define ARCH_PERFMON_EVENTSEL_ENABLE (1ULL << 22) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 95da14f..a19542c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -14,13 +14,13 @@ struct mm_struct; #include <asm/sigcontext.h> #include <asm/current.h> #include <asm/cpufeature.h> -#include <asm/system.h> #include <asm/page.h> #include <asm/pgtable_types.h> #include <asm/percpu.h> #include <asm/msr.h> #include <asm/desc_defs.h> #include <asm/nops.h> +#include <asm/special_insns.h> #include <linux/personality.h> #include <linux/cpumask.h> @@ -29,6 +29,15 @@ struct mm_struct; #include <linux/math64.h> #include <linux/init.h> #include <linux/err.h> +#include <linux/irqflags.h> + +/* + * We handle most unaligned accesses in hardware. On the other hand + * unaligned DMA can be quite expensive on some Nehalem processors. + * + * Based on this we disable the IP header alignment in network drivers. + */ +#define NET_IP_ALIGN 0 #define HBP_NUM 4 /* @@ -475,61 +484,6 @@ struct thread_struct { unsigned io_bitmap_max; }; -static inline unsigned long native_get_debugreg(int regno) -{ - unsigned long val = 0; /* Damn you, gcc! */ - - switch (regno) { - case 0: - asm("mov %%db0, %0" :"=r" (val)); - break; - case 1: - asm("mov %%db1, %0" :"=r" (val)); - break; - case 2: - asm("mov %%db2, %0" :"=r" (val)); - break; - case 3: - asm("mov %%db3, %0" :"=r" (val)); - break; - case 6: - asm("mov %%db6, %0" :"=r" (val)); - break; - case 7: - asm("mov %%db7, %0" :"=r" (val)); - break; - default: - BUG(); - } - return val; -} - -static inline void native_set_debugreg(int regno, unsigned long value) -{ - switch (regno) { - case 0: - asm("mov %0, %%db0" ::"r" (value)); - break; - case 1: - asm("mov %0, %%db1" ::"r" (value)); - break; - case 2: - asm("mov %0, %%db2" ::"r" (value)); - break; - case 3: - asm("mov %0, %%db3" ::"r" (value)); - break; - case 6: - asm("mov %0, %%db6" ::"r" (value)); - break; - case 7: - asm("mov %0, %%db7" ::"r" (value)); - break; - default: - BUG(); - } -} - /* * Set IOPL bits in EFLAGS from given mask */ @@ -575,14 +529,6 @@ static inline void native_swapgs(void) #define __cpuid native_cpuid #define paravirt_enabled() 0 -/* - * These special macros can be used to get or set a debugging register - */ -#define get_debugreg(var, register) \ - (var) = native_get_debugreg(register) -#define set_debugreg(value, register) \ - native_set_debugreg(register, value) - static inline void load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -1022,4 +968,24 @@ extern bool cpu_has_amd_erratum(const int *); #define cpu_has_amd_erratum(x) (false) #endif /* CONFIG_CPU_SUP_AMD */ +#ifdef CONFIG_X86_32 +/* + * disable hlt during certain critical i/o operations + */ +#define HAVE_DISABLE_HLT +#endif + +void disable_hlt(void); +void enable_hlt(void); + +void cpu_idle_wait(void); + +extern unsigned long arch_align_stack(unsigned long sp); +extern void free_init_pages(char *what, unsigned long begin, unsigned long end); + +void default_idle(void); +bool set_pm_idle_to_default(void); + +void stop_this_cpu(void *dummy); + #endif /* _ASM_X86_PROCESSOR_H */ diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h index 5e64171..1654662 100644 --- a/arch/x86/include/asm/segment.h +++ b/arch/x86/include/asm/segment.h @@ -212,7 +212,61 @@ #ifdef __KERNEL__ #ifndef __ASSEMBLY__ extern const char early_idt_handlers[NUM_EXCEPTION_VECTORS][10]; -#endif -#endif + +/* + * Load a segment. Fall back on loading the zero + * segment if something goes wrong.. + */ +#define loadsegment(seg, value) \ +do { \ + unsigned short __val = (value); \ + \ + asm volatile(" \n" \ + "1: movl %k0,%%" #seg " \n" \ + \ + ".section .fixup,\"ax\" \n" \ + "2: xorl %k0,%k0 \n" \ + " jmp 1b \n" \ + ".previous \n" \ + \ + _ASM_EXTABLE(1b, 2b) \ + \ + : "+r" (__val) : : "memory"); \ +} while (0) + +/* + * Save a segment register away + */ +#define savesegment(seg, value) \ + asm("mov %%" #seg ",%0":"=r" (value) : : "memory") + +/* + * x86_32 user gs accessors. + */ +#ifdef CONFIG_X86_32 +#ifdef CONFIG_X86_32_LAZY_GS +#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) +#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) +#define task_user_gs(tsk) ((tsk)->thread.gs) +#define lazy_save_gs(v) savesegment(gs, (v)) +#define lazy_load_gs(v) loadsegment(gs, (v)) +#else /* X86_32_LAZY_GS */ +#define get_user_gs(regs) (u16)((regs)->gs) +#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) +#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) +#define lazy_save_gs(v) do { } while (0) +#define lazy_load_gs(v) do { } while (0) +#endif /* X86_32_LAZY_GS */ +#endif /* X86_32 */ + +static inline unsigned long get_limit(unsigned long segment) +{ + unsigned long __limit; + asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); + return __limit + 1; +} + +#endif /* !__ASSEMBLY__ */ +#endif /* __KERNEL__ */ #endif /* _ASM_X86_SEGMENT_H */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h new file mode 100644 index 0000000..41fc93a --- /dev/null +++ b/arch/x86/include/asm/special_insns.h @@ -0,0 +1,199 @@ +#ifndef _ASM_X86_SPECIAL_INSNS_H +#define _ASM_X86_SPECIAL_INSNS_H + + +#ifdef __KERNEL__ + +static inline void native_clts(void) +{ + asm volatile("clts"); +} + +/* + * Volatile isn't enough to prevent the compiler from reordering the + * read/write functions for the control registers and messing everything up. + * A memory clobber would solve the problem, but would prevent reordering of + * all loads stores around it, which can hurt performance. Solution is to + * use a variable and mimic reads and writes to it to enforce serialization + */ +static unsigned long __force_order; + +static inline unsigned long native_read_cr0(void) +{ + unsigned long val; + asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr0(unsigned long val) +{ + asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr2(void) +{ + unsigned long val; + asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr2(unsigned long val) +{ + asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr3(void) +{ + unsigned long val; + asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline void native_write_cr3(unsigned long val) +{ + asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); +} + +static inline unsigned long native_read_cr4(void) +{ + unsigned long val; + asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); + return val; +} + +static inline unsigned long native_read_cr4_safe(void) +{ + unsigned long val; + /* This could fault if %cr4 does not exist. In x86_64, a cr4 always + * exists, so it will never fail. */ +#ifdef CONFIG_X86_32 + asm volatile("1: mov %%cr4, %0\n" + "2:\n" + _ASM_EXTABLE(1b, 2b) + : "=r" (val), "=m" (__force_order) : "0" (0)); +#else + val = native_read_cr4(); +#endif + return val; +} + +static inline void native_write_cr4(unsigned long val) +{ + asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); +} + +#ifdef CONFIG_X86_64 +static inline unsigned long native_read_cr8(void) +{ + unsigned long cr8; + asm volatile("movq %%cr8,%0" : "=r" (cr8)); + return cr8; +} + +static inline void native_write_cr8(unsigned long val) +{ + asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); +} +#endif + +static inline void native_wbinvd(void) +{ + asm volatile("wbinvd": : :"memory"); +} + +extern void native_load_gs_index(unsigned); + +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#else + +static inline unsigned long read_cr0(void) +{ + return native_read_cr0(); +} + +static inline void write_cr0(unsigned long x) +{ + native_write_cr0(x); +} + +static inline unsigned long read_cr2(void) +{ + return native_read_cr2(); +} + +static inline void write_cr2(unsigned long x) +{ + native_write_cr2(x); +} + +static inline unsigned long read_cr3(void) +{ + return native_read_cr3(); +} + +static inline void write_cr3(unsigned long x) +{ + native_write_cr3(x); +} + +static inline unsigned long read_cr4(void) +{ + return native_read_cr4(); +} + +static inline unsigned long read_cr4_safe(void) +{ + return native_read_cr4_safe(); +} + +static inline void write_cr4(unsigned long x) +{ + native_write_cr4(x); +} + +static inline void wbinvd(void) +{ + native_wbinvd(); +} + +#ifdef CONFIG_X86_64 + +static inline unsigned long read_cr8(void) +{ + return native_read_cr8(); +} + +static inline void write_cr8(unsigned long x) +{ + native_write_cr8(x); +} + +static inline void load_gs_index(unsigned selector) +{ + native_load_gs_index(selector); +} + +#endif + +/* Clear the 'TS' bit */ +static inline void clts(void) +{ + native_clts(); +} + +#endif/* CONFIG_PARAVIRT */ + +#define stts() write_cr0(read_cr0() | X86_CR0_TS) + +static inline void clflush(volatile void *__p) +{ + asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); +} + +#define nop() asm volatile ("nop") + + +#endif /* __KERNEL__ */ + +#endif /* _ASM_X86_SPECIAL_INSNS_H */ diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 1575177..b5d9533 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -38,7 +38,6 @@ #include <asm/tsc.h> #include <asm/processor.h> #include <asm/percpu.h> -#include <asm/system.h> #include <asm/desc.h> #include <linux/random.h> diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h new file mode 100644 index 0000000..4ec45b3 --- /dev/null +++ b/arch/x86/include/asm/switch_to.h @@ -0,0 +1,129 @@ +#ifndef _ASM_X86_SWITCH_TO_H +#define _ASM_X86_SWITCH_TO_H + +struct task_struct; /* one of the stranger aspects of C forward declarations */ +struct task_struct *__switch_to(struct task_struct *prev, + struct task_struct *next); +struct tss_struct; +void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, + struct tss_struct *tss); + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movl %P[task_canary](%[next]), %%ebx\n\t" \ + "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" +#define __switch_canary_oparam \ + , [stack_canary] "=m" (stack_canary.canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* + * Saving eflags is important. It switches not only IOPL between tasks, + * it also protects other tasks from NT leaking through sysenter etc. + */ +#define switch_to(prev, next, last) \ +do { \ + /* \ + * Context-switching clobbers all registers, so we clobber \ + * them explicitly, via unused output variables. \ + * (EAX and EBP is not listed because EBP is saved/restored \ + * explicitly for wchan access and EAX is the return value of \ + * __switch_to()) \ + */ \ + unsigned long ebx, ecx, edx, esi, edi; \ + \ + asm volatile("pushfl\n\t" /* save flags */ \ + "pushl %%ebp\n\t" /* save EBP */ \ + "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ + "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ + "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ + "pushl %[next_ip]\n\t" /* restore EIP */ \ + __switch_canary \ + "jmp __switch_to\n" /* regparm call */ \ + "1:\t" \ + "popl %%ebp\n\t" /* restore EBP */ \ + "popfl\n" /* restore flags */ \ + \ + /* output parameters */ \ + : [prev_sp] "=m" (prev->thread.sp), \ + [prev_ip] "=m" (prev->thread.ip), \ + "=a" (last), \ + \ + /* clobbered output registers: */ \ + "=b" (ebx), "=c" (ecx), "=d" (edx), \ + "=S" (esi), "=D" (edi) \ + \ + __switch_canary_oparam \ + \ + /* input parameters: */ \ + : [next_sp] "m" (next->thread.sp), \ + [next_ip] "m" (next->thread.ip), \ + \ + /* regparm parameters for __switch_to(): */ \ + [prev] "a" (prev), \ + [next] "d" (next) \ + \ + __switch_canary_iparam \ + \ + : /* reloaded segment registers */ \ + "memory"); \ +} while (0) + +#else /* CONFIG_X86_32 */ + +/* frame pointer must be last for get_wchan */ +#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" +#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" + +#define __EXTRA_CLOBBER \ + , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ + "r12", "r13", "r14", "r15" + +#ifdef CONFIG_CC_STACKPROTECTOR +#define __switch_canary \ + "movq %P[task_canary](%%rsi),%%r8\n\t" \ + "movq %%r8,"__percpu_arg([gs_canary])"\n\t" +#define __switch_canary_oparam \ + , [gs_canary] "=m" (irq_stack_union.stack_canary) +#define __switch_canary_iparam \ + , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) +#else /* CC_STACKPROTECTOR */ +#define __switch_canary +#define __switch_canary_oparam +#define __switch_canary_iparam +#endif /* CC_STACKPROTECTOR */ + +/* Save restore flags to clear handle leaking NT */ +#define switch_to(prev, next, last) \ + asm volatile(SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ + "movq "__percpu_arg([current_task])",%%rsi\n\t" \ + __switch_canary \ + "movq %P[thread_info](%%rsi),%%r8\n\t" \ + "movq %%rax,%%rdi\n\t" \ + "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ + "jnz ret_from_fork\n\t" \ + RESTORE_CONTEXT \ + : "=a" (last) \ + __switch_canary_oparam \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)), \ + [_tif_fork] "i" (_TIF_FORK), \ + [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [current_task] "m" (current_task) \ + __switch_canary_iparam \ + : "memory", "cc" __EXTRA_CLOBBER) + +#endif /* CONFIG_X86_32 */ + +#endif /* _ASM_X86_SWITCH_TO_H */ diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h deleted file mode 100644 index 2d2f01c..0000000 --- a/arch/x86/include/asm/system.h +++ /dev/null @@ -1,523 +0,0 @@ -#ifndef _ASM_X86_SYSTEM_H -#define _ASM_X86_SYSTEM_H - -#include <asm/asm.h> -#include <asm/segment.h> -#include <asm/cpufeature.h> -#include <asm/cmpxchg.h> -#include <asm/nops.h> - -#include <linux/kernel.h> -#include <linux/irqflags.h> - -/* entries in ARCH_DLINFO: */ -#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64) -# define AT_VECTOR_SIZE_ARCH 2 -#else /* else it's non-compat x86-64 */ -# define AT_VECTOR_SIZE_ARCH 1 -#endif - -struct task_struct; /* one of the stranger aspects of C forward declarations */ -struct task_struct *__switch_to(struct task_struct *prev, - struct task_struct *next); -struct tss_struct; -void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, - struct tss_struct *tss); -extern void show_regs_common(void); - -#ifdef CONFIG_X86_32 - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movl %P[task_canary](%[next]), %%ebx\n\t" \ - "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" -#define __switch_canary_oparam \ - , [stack_canary] "=m" (stack_canary.canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* - * Saving eflags is important. It switches not only IOPL between tasks, - * it also protects other tasks from NT leaking through sysenter etc. - */ -#define switch_to(prev, next, last) \ -do { \ - /* \ - * Context-switching clobbers all registers, so we clobber \ - * them explicitly, via unused output variables. \ - * (EAX and EBP is not listed because EBP is saved/restored \ - * explicitly for wchan access and EAX is the return value of \ - * __switch_to()) \ - */ \ - unsigned long ebx, ecx, edx, esi, edi; \ - \ - asm volatile("pushfl\n\t" /* save flags */ \ - "pushl %%ebp\n\t" /* save EBP */ \ - "movl %%esp,%[prev_sp]\n\t" /* save ESP */ \ - "movl %[next_sp],%%esp\n\t" /* restore ESP */ \ - "movl $1f,%[prev_ip]\n\t" /* save EIP */ \ - "pushl %[next_ip]\n\t" /* restore EIP */ \ - __switch_canary \ - "jmp __switch_to\n" /* regparm call */ \ - "1:\t" \ - "popl %%ebp\n\t" /* restore EBP */ \ - "popfl\n" /* restore flags */ \ - \ - /* output parameters */ \ - : [prev_sp] "=m" (prev->thread.sp), \ - [prev_ip] "=m" (prev->thread.ip), \ - "=a" (last), \ - \ - /* clobbered output registers: */ \ - "=b" (ebx), "=c" (ecx), "=d" (edx), \ - "=S" (esi), "=D" (edi) \ - \ - __switch_canary_oparam \ - \ - /* input parameters: */ \ - : [next_sp] "m" (next->thread.sp), \ - [next_ip] "m" (next->thread.ip), \ - \ - /* regparm parameters for __switch_to(): */ \ - [prev] "a" (prev), \ - [next] "d" (next) \ - \ - __switch_canary_iparam \ - \ - : /* reloaded segment registers */ \ - "memory"); \ -} while (0) - -/* - * disable hlt during certain critical i/o operations - */ -#define HAVE_DISABLE_HLT -#else - -/* frame pointer must be last for get_wchan */ -#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" -#define RESTORE_CONTEXT "movq %%rbp,%%rsi ; popq %%rbp ; popf\t" - -#define __EXTRA_CLOBBER \ - , "rcx", "rbx", "rdx", "r8", "r9", "r10", "r11", \ - "r12", "r13", "r14", "r15" - -#ifdef CONFIG_CC_STACKPROTECTOR -#define __switch_canary \ - "movq %P[task_canary](%%rsi),%%r8\n\t" \ - "movq %%r8,"__percpu_arg([gs_canary])"\n\t" -#define __switch_canary_oparam \ - , [gs_canary] "=m" (irq_stack_union.stack_canary) -#define __switch_canary_iparam \ - , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) -#else /* CC_STACKPROTECTOR */ -#define __switch_canary -#define __switch_canary_oparam -#define __switch_canary_iparam -#endif /* CC_STACKPROTECTOR */ - -/* Save restore flags to clear handle leaking NT */ -#define switch_to(prev, next, last) \ - asm volatile(SAVE_CONTEXT \ - "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ - "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ - "call __switch_to\n\t" \ - "movq "__percpu_arg([current_task])",%%rsi\n\t" \ - __switch_canary \ - "movq %P[thread_info](%%rsi),%%r8\n\t" \ - "movq %%rax,%%rdi\n\t" \ - "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ - "jnz ret_from_fork\n\t" \ - RESTORE_CONTEXT \ - : "=a" (last) \ - __switch_canary_oparam \ - : [next] "S" (next), [prev] "D" (prev), \ - [threadrsp] "i" (offsetof(struct task_struct, thread.sp)), \ - [ti_flags] "i" (offsetof(struct thread_info, flags)), \ - [_tif_fork] "i" (_TIF_FORK), \ - [thread_info] "i" (offsetof(struct task_struct, stack)), \ - [current_task] "m" (current_task) \ - __switch_canary_iparam \ - : "memory", "cc" __EXTRA_CLOBBER) -#endif - -#ifdef __KERNEL__ - -extern void native_load_gs_index(unsigned); - -/* - * Load a segment. Fall back on loading the zero - * segment if something goes wrong.. - */ -#define loadsegment(seg, value) \ -do { \ - unsigned short __val = (value); \ - \ - asm volatile(" \n" \ - "1: movl %k0,%%" #seg " \n" \ - \ - ".section .fixup,\"ax\" \n" \ - "2: xorl %k0,%k0 \n" \ - " jmp 1b \n" \ - ".previous \n" \ - \ - _ASM_EXTABLE(1b, 2b) \ - \ - : "+r" (__val) : : "memory"); \ -} while (0) - -/* - * Save a segment register away - */ -#define savesegment(seg, value) \ - asm("mov %%" #seg ",%0":"=r" (value) : : "memory") - -/* - * x86_32 user gs accessors. - */ -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_32_LAZY_GS -#define get_user_gs(regs) (u16)({unsigned long v; savesegment(gs, v); v;}) -#define set_user_gs(regs, v) loadsegment(gs, (unsigned long)(v)) -#define task_user_gs(tsk) ((tsk)->thread.gs) -#define lazy_save_gs(v) savesegment(gs, (v)) -#define lazy_load_gs(v) loadsegment(gs, (v)) -#else /* X86_32_LAZY_GS */ -#define get_user_gs(regs) (u16)((regs)->gs) -#define set_user_gs(regs, v) do { (regs)->gs = (v); } while (0) -#define task_user_gs(tsk) (task_pt_regs(tsk)->gs) -#define lazy_save_gs(v) do { } while (0) -#define lazy_load_gs(v) do { } while (0) -#endif /* X86_32_LAZY_GS */ -#endif /* X86_32 */ - -static inline unsigned long get_limit(unsigned long segment) -{ - unsigned long __limit; - asm("lsll %1,%0" : "=r" (__limit) : "r" (segment)); - return __limit + 1; -} - -static inline void native_clts(void) -{ - asm volatile("clts"); -} - -/* - * Volatile isn't enough to prevent the compiler from reordering the - * read/write functions for the control registers and messing everything up. - * A memory clobber would solve the problem, but would prevent reordering of - * all loads stores around it, which can hurt performance. Solution is to - * use a variable and mimic reads and writes to it to enforce serialization - */ -static unsigned long __force_order; - -static inline unsigned long native_read_cr0(void) -{ - unsigned long val; - asm volatile("mov %%cr0,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr0(unsigned long val) -{ - asm volatile("mov %0,%%cr0": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr2(void) -{ - unsigned long val; - asm volatile("mov %%cr2,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr2(unsigned long val) -{ - asm volatile("mov %0,%%cr2": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr3(void) -{ - unsigned long val; - asm volatile("mov %%cr3,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline void native_write_cr3(unsigned long val) -{ - asm volatile("mov %0,%%cr3": : "r" (val), "m" (__force_order)); -} - -static inline unsigned long native_read_cr4(void) -{ - unsigned long val; - asm volatile("mov %%cr4,%0\n\t" : "=r" (val), "=m" (__force_order)); - return val; -} - -static inline unsigned long native_read_cr4_safe(void) -{ - unsigned long val; - /* This could fault if %cr4 does not exist. In x86_64, a cr4 always - * exists, so it will never fail. */ -#ifdef CONFIG_X86_32 - asm volatile("1: mov %%cr4, %0\n" - "2:\n" - _ASM_EXTABLE(1b, 2b) - : "=r" (val), "=m" (__force_order) : "0" (0)); -#else - val = native_read_cr4(); -#endif - return val; -} - -static inline void native_write_cr4(unsigned long val) -{ - asm volatile("mov %0,%%cr4": : "r" (val), "m" (__force_order)); -} - -#ifdef CONFIG_X86_64 -static inline unsigned long native_read_cr8(void) -{ - unsigned long cr8; - asm volatile("movq %%cr8,%0" : "=r" (cr8)); - return cr8; -} - -static inline void native_write_cr8(unsigned long val) -{ - asm volatile("movq %0,%%cr8" :: "r" (val) : "memory"); -} -#endif - -static inline void native_wbinvd(void) -{ - asm volatile("wbinvd": : :"memory"); -} - -#ifdef CONFIG_PARAVIRT -#include <asm/paravirt.h> -#else - -static inline unsigned long read_cr0(void) -{ - return native_read_cr0(); -} - -static inline void write_cr0(unsigned long x) -{ - native_write_cr0(x); -} - -static inline unsigned long read_cr2(void) -{ - return native_read_cr2(); -} - -static inline void write_cr2(unsigned long x) -{ - native_write_cr2(x); -} - -static inline unsigned long read_cr3(void) -{ - return native_read_cr3(); -} - -static inline void write_cr3(unsigned long x) -{ - native_write_cr3(x); -} - -static inline unsigned long read_cr4(void) -{ - return native_read_cr4(); -} - -static inline unsigned long read_cr4_safe(void) -{ - return native_read_cr4_safe(); -} - -static inline void write_cr4(unsigned long x) -{ - native_write_cr4(x); -} - -static inline void wbinvd(void) -{ - native_wbinvd(); -} - -#ifdef CONFIG_X86_64 - -static inline unsigned long read_cr8(void) -{ - return native_read_cr8(); -} - -static inline void write_cr8(unsigned long x) -{ - native_write_cr8(x); -} - -static inline void load_gs_index(unsigned selector) -{ - native_load_gs_index(selector); -} - -#endif - -/* Clear the 'TS' bit */ -static inline void clts(void) -{ - native_clts(); -} - -#endif/* CONFIG_PARAVIRT */ - -#define stts() write_cr0(read_cr0() | X86_CR0_TS) - -#endif /* __KERNEL__ */ - -static inline void clflush(volatile void *__p) -{ - asm volatile("clflush %0" : "+m" (*(volatile char __force *)__p)); -} - -#define nop() asm volatile ("nop") - -void disable_hlt(void); -void enable_hlt(void); - -void cpu_idle_wait(void); - -extern unsigned long arch_align_stack(unsigned long sp); -extern void free_init_pages(char *what, unsigned long begin, unsigned long end); - -void default_idle(void); -bool set_pm_idle_to_default(void); - -void stop_this_cpu(void *dummy); - -/* - * Force strict CPU ordering. - * And yes, this is required on UP too when we're talking - * to devices. - */ -#ifdef CONFIG_X86_32 -/* - * Some non-Intel clones support out of order store. wmb() ceases to be a - * nop for these. - */ -#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) -#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) -#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) -#else -#define mb() asm volatile("mfence":::"memory") -#define rmb() asm volatile("lfence":::"memory") -#define wmb() asm volatile("sfence" ::: "memory") -#endif - -/** - * read_barrier_depends - Flush all pending reads that subsequents reads - * depend on. - * - * No data-dependent reads from memory-like regions are ever reordered - * over this barrier. All reads preceding this primitive are guaranteed - * to access memory (but not necessarily other CPUs' caches) before any - * reads following this primitive that depend on the data return by - * any of the preceding reads. This primitive is much lighter weight than - * rmb() on most CPUs, and is never heavier weight than is - * rmb(). - * - * These ordering constraints are respected by both the local CPU - * and the compiler. - * - * Ordering is not guaranteed by anything other than these primitives, - * not even by data dependencies. See the documentation for - * memory_barrier() for examples and URLs to more information. - * - * For example, the following code would force ordering (the initial - * value of "a" is zero, "b" is one, and "p" is "&a"): - * - * <programlisting> - * CPU 0 CPU 1 - * - * b = 2; - * memory_barrier(); - * p = &b; q = p; - * read_barrier_depends(); - * d = *q; - * </programlisting> - * - * because the read of "*q" depends on the read of "p" and these - * two reads are separated by a read_barrier_depends(). However, - * the following code, with the same initial values for "a" and "b": - * - * <programlisting> - * CPU 0 CPU 1 - * - * a = 2; - * memory_barrier(); - * b = 3; y = b; - * read_barrier_depends(); - * x = a; - * </programlisting> - * - * does not enforce ordering, since there is no data dependency between - * the read of "a" and the read of "b". Therefore, on some CPUs, such - * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() - * in cases like this where there are no data dependencies. - **/ - -#define read_barrier_depends() do { } while (0) - -#ifdef CONFIG_SMP -#define smp_mb() mb() -#ifdef CONFIG_X86_PPRO_FENCE -# define smp_rmb() rmb() -#else -# define smp_rmb() barrier() -#endif -#ifdef CONFIG_X86_OOSTORE -# define smp_wmb() wmb() -#else -# define smp_wmb() barrier() -#endif -#define smp_read_barrier_depends() read_barrier_depends() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) -#else -#define smp_mb() barrier() -#define smp_rmb() barrier() -#define smp_wmb() barrier() -#define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; barrier(); } while (0) -#endif - -/* - * Stop RDTSC speculation. This is needed when you need to use RDTSC - * (or get_cycles or vread that possibly accesses the TSC) in a defined - * code region. - * - * (Could use an alternative three way for this if there was one.) - */ -static __always_inline void rdtsc_barrier(void) -{ - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); -} - -/* - * We handle most unaligned accesses in hardware. On the other hand - * unaligned DMA can be quite expensive on some Nehalem processors. - * - * Based on this we disable the IP header alignment in network drivers. - */ -#define NET_IP_ALIGN 0 -#endif /* _ASM_X86_SYSTEM_H */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 169be89..c0e108e 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -5,7 +5,7 @@ #include <linux/sched.h> #include <asm/processor.h> -#include <asm/system.h> +#include <asm/special_insns.h> #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 15d9915..c91e8b9 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -61,7 +61,7 @@ extern void check_tsc_sync_source(int cpu); extern void check_tsc_sync_target(void); extern int notsc_setup(char *); -extern void save_sched_clock_state(void); -extern void restore_sched_clock_state(void); +extern void tsc_save_sched_clock_state(void); +extern void tsc_restore_sched_clock_state(void); #endif /* _ASM_X86_TSC_H */ diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 815285b..8b38be2 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -5,13 +5,8 @@ #include <linux/clocksource.h> struct vsyscall_gtod_data { - seqlock_t lock; + seqcount_t seq; - /* open coded 'struct timespec' */ - time_t wall_time_sec; - u32 wall_time_nsec; - - struct timezone sys_tz; struct { /* extract of a clocksource struct */ int vclock_mode; cycle_t cycle_last; @@ -19,8 +14,16 @@ struct vsyscall_gtod_data { u32 mult; u32 shift; } clock; - struct timespec wall_to_monotonic; + + /* open coded 'struct timespec' */ + time_t wall_time_sec; + u32 wall_time_nsec; + u32 monotonic_time_nsec; + time_t monotonic_time_sec; + + struct timezone sys_tz; struct timespec wall_time_coarse; + struct timespec monotonic_time_coarse; }; extern struct vsyscall_gtod_data vsyscall_gtod_data; diff --git a/arch/x86/include/asm/virtext.h b/arch/x86/include/asm/virtext.h index e0f9aa1..5da71c2 100644 --- a/arch/x86/include/asm/virtext.h +++ b/arch/x86/include/asm/virtext.h @@ -16,7 +16,6 @@ #define _ASM_X86_VIRTEX_H #include <asm/processor.h> -#include <asm/system.h> #include <asm/vmx.h> #include <asm/svm.h> diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h index 517d476..baaca8d 100644 --- a/arch/x86/include/asm/x86_init.h +++ b/arch/x86/include/asm/x86_init.h @@ -145,9 +145,11 @@ struct x86_init_ops { /** * struct x86_cpuinit_ops - platform specific cpu hotplug setups * @setup_percpu_clockev: set up the per cpu clock event device + * @early_percpu_clock_init: early init of the per cpu clock event device */ struct x86_cpuinit_ops { void (*setup_percpu_clockev)(void); + void (*early_percpu_clock_init)(void); void (*fixup_cpu_id)(struct cpuinfo_x86 *c, int node); }; @@ -160,6 +162,8 @@ struct x86_cpuinit_ops { * @is_untracked_pat_range exclude from PAT logic * @nmi_init enable NMI on cpus * @i8042_detect pre-detect if i8042 controller exists + * @save_sched_clock_state: save state for sched_clock() on suspend + * @restore_sched_clock_state: restore state for sched_clock() on resume */ struct x86_platform_ops { unsigned long (*calibrate_tsc)(void); @@ -171,6 +175,8 @@ struct x86_platform_ops { void (*nmi_init)(void); unsigned char (*get_nmi_reason)(void); int (*i8042_detect)(void); + void (*save_sched_clock_state)(void); + void (*restore_sched_clock_state)(void); }; struct pci_dev; diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index a1f2db5..cbf0c9d 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -56,6 +56,7 @@ DEFINE_GUEST_HANDLE(int); DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); DEFINE_GUEST_HANDLE(uint64_t); +DEFINE_GUEST_HANDLE(uint32_t); #endif #ifndef HYPERVISOR_VIRT_START diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index f50e7fb..d2b7f27 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -14,6 +14,7 @@ #include <acpi/processor.h> #include <asm/acpi.h> #include <asm/mwait.h> +#include <asm/special_insns.h> /* * Initialize bm_flags based on the CPU cache properties diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 5d56931..459e78c 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -231,7 +231,6 @@ #include <linux/syscore_ops.h> #include <linux/i8253.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/desc.h> #include <asm/olpc.h> diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b240323..67e2583 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -18,6 +18,7 @@ #include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> +#include <asm/debugreg.h> #include <asm/sections.h> #include <linux/topology.h> #include <linux/cpumask.h> diff --git a/arch/x86/kernel/cpu/mcheck/p5.c b/arch/x86/kernel/cpu/mcheck/p5.c index 5c0e653..2d5454c 100644 --- a/arch/x86/kernel/cpu/mcheck/p5.c +++ b/arch/x86/kernel/cpu/mcheck/p5.c @@ -9,7 +9,6 @@ #include <linux/smp.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/mce.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 67bb17a..47a1870 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -25,7 +25,6 @@ #include <linux/cpu.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/apic.h> #include <asm/idle.h> #include <asm/mce.h> diff --git a/arch/x86/kernel/cpu/mcheck/winchip.c b/arch/x86/kernel/cpu/mcheck/winchip.c index 54060f5..2d7998f 100644 --- a/arch/x86/kernel/cpu/mcheck/winchip.c +++ b/arch/x86/kernel/cpu/mcheck/winchip.c @@ -8,7 +8,6 @@ #include <linux/init.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/mce.h> #include <asm/msr.h> diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 97b2635..75772ae 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -12,7 +12,6 @@ #include <asm/processor-flags.h> #include <asm/cpufeature.h> #include <asm/tlbflush.h> -#include <asm/system.h> #include <asm/mtrr.h> #include <asm/msr.h> #include <asm/pat.h> diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 0a18d16..fa2900c 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -643,14 +643,14 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) /* Prefer fixed purpose counters */ if (x86_pmu.num_counters_fixed) { idx = X86_PMC_IDX_FIXED; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_MAX) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { + for_each_set_bit_from(idx, c->idxmsk, X86_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) goto done; } diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index a524353..39472dd 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -43,7 +43,6 @@ #include <asm/processor.h> #include <asm/msr.h> -#include <asm/system.h> static struct class *cpuid_class; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 6104852..36d1853 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -15,7 +15,6 @@ #include <linux/delay.h> #include <linux/atomic.h> -#include <asm/system.h> #include <asm/timer.h> #include <asm/hw_irq.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 313fb5c..6d5fc8c 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -16,7 +16,6 @@ #include <linux/delay.h> #include <linux/atomic.h> -#include <asm/system.h> #include <asm/timer.h> #include <asm/hw_irq.h> #include <asm/pgtable.h> @@ -306,10 +305,10 @@ void __init native_init_IRQ(void) * us. (some of these will be overridden and become * 'special' SMP interrupts) */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { + i = FIRST_EXTERNAL_VECTOR; + for_each_clear_bit_from(i, used_vectors, NR_VECTORS) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - if (!test_bit(i, used_vectors)) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } if (!acpi_ioapic && !of_ioapic) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index faba577..db6720e 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -46,7 +46,6 @@ #include <asm/debugreg.h> #include <asm/apicdef.h> -#include <asm/system.h> #include <asm/apic.h> #include <asm/nmi.h> @@ -67,8 +66,6 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "ss", 4, offsetof(struct pt_regs, ss) }, { "ds", 4, offsetof(struct pt_regs, ds) }, { "es", 4, offsetof(struct pt_regs, es) }, - { "fs", 4, -1 }, - { "gs", 4, -1 }, #else { "ax", 8, offsetof(struct pt_regs, ax) }, { "bx", 8, offsetof(struct pt_regs, bx) }, @@ -90,7 +87,11 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "flags", 4, offsetof(struct pt_regs, flags) }, { "cs", 4, offsetof(struct pt_regs, cs) }, { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, -1 }, + { "es", 4, -1 }, #endif + { "fs", 4, -1 }, + { "gs", 4, -1 }, }; int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 44842d7..f8492da6 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -136,6 +136,15 @@ int kvm_register_clock(char *txt) return ret; } +static void kvm_save_sched_clock_state(void) +{ +} + +static void kvm_restore_sched_clock_state(void) +{ + kvm_register_clock("primary cpu clock, resume"); +} + #ifdef CONFIG_X86_LOCAL_APIC static void __cpuinit kvm_setup_secondary_clock(void) { @@ -144,8 +153,6 @@ static void __cpuinit kvm_setup_secondary_clock(void) * we shouldn't fail. */ WARN_ON(kvm_register_clock("secondary cpu clock")); - /* ok, done with our trickery, call native */ - setup_secondary_APIC_clock(); } #endif @@ -194,9 +201,11 @@ void __init kvmclock_init(void) x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif + x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; + x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC machine_ops.crash_shutdown = kvm_crash_shutdown; diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index ea69726..ebc9873 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -15,7 +15,6 @@ #include <linux/vmalloc.h> #include <linux/uaccess.h> -#include <asm/system.h> #include <asm/ldt.h> #include <asm/desc.h> #include <asm/mmu_context.h> diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c index a3fa43b..5b19e4d 100644 --- a/arch/x86/kernel/machine_kexec_32.c +++ b/arch/x86/kernel/machine_kexec_32.c @@ -23,7 +23,6 @@ #include <asm/apic.h> #include <asm/cpufeature.h> #include <asm/desc.h> -#include <asm/system.h> #include <asm/cacheflush.h> #include <asm/debugreg.h> diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c index 177183c..7eb1e2b 100644 --- a/arch/x86/kernel/mca_32.c +++ b/arch/x86/kernel/mca_32.c @@ -43,7 +43,6 @@ #include <linux/mca.h> #include <linux/kprobes.h> #include <linux/slab.h> -#include <asm/system.h> #include <asm/io.h> #include <linux/proc_fs.h> #include <linux/mman.h> diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 925179f..f21fd94 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -26,7 +26,6 @@ #include <linux/gfp.h> #include <linux/jump_label.h> -#include <asm/system.h> #include <asm/page.h> #include <asm/pgtable.h> diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 9635676..eb11369 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -40,7 +40,6 @@ #include <asm/processor.h> #include <asm/msr.h> -#include <asm/system.h> static struct class *msr_class; diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index ada2f99..ab13760 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -26,6 +26,7 @@ #include <asm/bug.h> #include <asm/paravirt.h> +#include <asm/debugreg.h> #include <asm/desc.h> #include <asm/setup.h> #include <asm/pgtable.h> @@ -37,6 +38,7 @@ #include <asm/apic.h> #include <asm/tlbflush.h> #include <asm/timer.h> +#include <asm/special_insns.h> /* nop stub */ void _paravirt_nop(void) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 726494b..6ac5782 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -42,7 +42,6 @@ #include <asm/calgary.h> #include <asm/tce.h> #include <asm/pci-direct.h> -#include <asm/system.h> #include <asm/dma.h> #include <asm/rio.h> #include <asm/bios_ebda.h> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 1c4d769..28e5e06 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -262,10 +262,11 @@ rootfs_initcall(pci_iommu_init); static __devinit void via_no_dac(struct pci_dev *dev) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { + if (forbid_dac == 0) { dev_info(&dev->dev, "disabling DAC on VIA PCI bridge\n"); forbid_dac = 1; } } -DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, via_no_dac); #endif diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 29309c4..a33afaa 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -18,7 +18,6 @@ #include <trace/events/power.h> #include <linux/hw_breakpoint.h> #include <asm/cpu.h> -#include <asm/system.h> #include <asm/apic.h> #include <asm/syscalls.h> #include <asm/idle.h> diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index ea207c2..ae68473 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -38,7 +38,6 @@ #include <linux/kdebug.h> #include <asm/pgtable.h> -#include <asm/system.h> #include <asm/ldt.h> #include <asm/processor.h> #include <asm/i387.h> @@ -55,6 +54,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> +#include <asm/switch_to.h> asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index ce5e34f..2b154da 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -37,7 +37,6 @@ #include <linux/ftrace.h> #include <asm/pgtable.h> -#include <asm/system.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/fpu-internal.h> @@ -49,6 +48,7 @@ #include <asm/idle.h> #include <asm/syscalls.h> #include <asm/debugreg.h> +#include <asm/switch_to.h> asmlinkage extern void ret_from_fork(void); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 78f05e4..8a634c8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -24,7 +24,6 @@ #include <asm/uaccess.h> #include <asm/pgtable.h> -#include <asm/system.h> #include <asm/processor.h> #include <asm/i387.h> #include <asm/fpu-internal.h> diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 8863888..1a29015 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -90,7 +90,6 @@ #include <asm/processor.h> #include <asm/bugs.h> -#include <asm/system.h> #include <asm/vsyscall.h> #include <asm/cpu.h> #include <asm/desc.h> @@ -509,15 +508,6 @@ static void __init memblock_x86_reserve_range_setup_data(void) #ifdef CONFIG_KEXEC -static inline unsigned long long get_total_mem(void) -{ - unsigned long long total; - - total = max_pfn - min_low_pfn; - - return total << PAGE_SHIFT; -} - /* * Keep the crash kernel below this limit. On 32 bits earlier kernels * would limit the kernel to the low 512 MiB due to mapping restrictions. @@ -536,7 +526,7 @@ static void __init reserve_crashkernel(void) unsigned long long crash_size, crash_base; int ret; - total_mem = get_total_mem(); + total_mem = memblock_phys_mem_size(); ret = parse_crashkernel(boot_command_line, total_mem, &crash_size, &crash_base); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index e578a79..5104a2b 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -255,6 +255,7 @@ notrace static void __cpuinit start_secondary(void *unused) * most necessary things. */ cpu_init(); + x86_cpuinit.early_percpu_clock_init(); preempt_disable(); smp_callin(); diff --git a/arch/x86/kernel/tce_64.c b/arch/x86/kernel/tce_64.c index 9e540fe..ab40954 100644 --- a/arch/x86/kernel/tce_64.c +++ b/arch/x86/kernel/tce_64.c @@ -34,6 +34,7 @@ #include <asm/tce.h> #include <asm/calgary.h> #include <asm/proto.h> +#include <asm/cacheflush.h> /* flush a tce at 'tceaddr' to main memory */ static inline void flush_tce(void* tceaddr) diff --git a/arch/x86/kernel/tls.c b/arch/x86/kernel/tls.c index bcfec2d..9d9d2f9 100644 --- a/arch/x86/kernel/tls.c +++ b/arch/x86/kernel/tls.c @@ -6,7 +6,6 @@ #include <asm/uaccess.h> #include <asm/desc.h> -#include <asm/system.h> #include <asm/ldt.h> #include <asm/processor.h> #include <asm/proto.h> diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ec61d4c..860f126 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -50,7 +50,6 @@ #include <asm/processor.h> #include <asm/debugreg.h> #include <linux/atomic.h> -#include <asm/system.h> #include <asm/traps.h> #include <asm/desc.h> #include <asm/i387.h> diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 183c592..fc0a147 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -630,7 +630,7 @@ static void set_cyc2ns_scale(unsigned long cpu_khz, int cpu) static unsigned long long cyc2ns_suspend; -void save_sched_clock_state(void) +void tsc_save_sched_clock_state(void) { if (!sched_clock_stable) return; @@ -646,7 +646,7 @@ void save_sched_clock_state(void) * that sched_clock() continues from the point where it was left off during * suspend. */ -void restore_sched_clock_state(void) +void tsc_restore_sched_clock_state(void) { unsigned long long offset; unsigned long flags; @@ -933,6 +933,16 @@ static int __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } + + /* + * Trust the results of the earlier calibration on systems + * exporting a reliable TSC. + */ + if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + clocksource_register_khz(&clocksource_tsc, tsc_khz); + return 0; + } + schedule_delayed_work(&tsc_irqwork, 0); return 0; } diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b07ba93..d5c6986 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -52,10 +52,7 @@ #include "vsyscall_trace.h" DEFINE_VVAR(int, vgetcpu_mode); -DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = -{ - .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), -}; +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data); static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; @@ -80,20 +77,15 @@ early_param("vsyscall", vsyscall_setup); void update_vsyscall_tz(void) { - unsigned long flags; - - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); - /* sys_tz has changed */ vsyscall_gtod_data.sys_tz = sys_tz; - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, struct clocksource *clock, u32 mult) { - unsigned long flags; + struct timespec monotonic; - write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + write_seqcount_begin(&vsyscall_gtod_data.seq); /* copy vsyscall data */ vsyscall_gtod_data.clock.vclock_mode = clock->archdata.vclock_mode; @@ -101,12 +93,19 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, vsyscall_gtod_data.clock.mask = clock->mask; vsyscall_gtod_data.clock.mult = mult; vsyscall_gtod_data.clock.shift = clock->shift; + vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = *wtm; + + monotonic = timespec_add(*wall_time, *wtm); + vsyscall_gtod_data.monotonic_time_sec = monotonic.tv_sec; + vsyscall_gtod_data.monotonic_time_nsec = monotonic.tv_nsec; + vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); + vsyscall_gtod_data.monotonic_time_coarse = + timespec_add(vsyscall_gtod_data.wall_time_coarse, *wtm); - write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + write_seqcount_end(&vsyscall_gtod_data.seq); } static void warn_bad_vsyscall(const char *level, struct pt_regs *regs, diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 947a06c..e9f265f 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -91,6 +91,7 @@ struct x86_init_ops x86_init __initdata = { }; struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { + .early_percpu_clock_init = x86_init_noop, .setup_percpu_clockev = setup_secondary_APIC_clock, .fixup_cpu_id = x86_default_fixup_cpu_id, }; @@ -107,7 +108,9 @@ struct x86_platform_ops x86_platform = { .is_untracked_pat_range = is_ISA_range, .nmi_init = default_nmi_init, .get_nmi_reason = default_get_nmi_reason, - .i8042_detect = default_i8042_detect + .i8042_detect = default_i8042_detect, + .save_sched_clock_state = tsc_save_sched_clock_state, + .restore_sched_clock_state = tsc_restore_sched_clock_state, }; EXPORT_SYMBOL_GPL(x86_platform); diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 89b02bf..9fed5be 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -236,7 +236,7 @@ static int do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, const u32 kvm_supported_word6_x86_features = F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ | F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | - F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | + F(3DNOWPREFETCH) | F(OSVW) | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); /* cpuid 0xC0000001.edx */ diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index 5b97e17..26d1fb4 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -43,4 +43,12 @@ static inline bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) return best && (best->ebx & bit(X86_FEATURE_FSGSBASE)); } +static inline bool guest_cpuid_has_osvw(struct kvm_vcpu *vcpu) +{ + struct kvm_cpuid_entry2 *best; + + best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); + return best && (best->ecx & bit(X86_FEATURE_OSVW)); +} + #endif diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0982507..8375622 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -57,6 +57,7 @@ #define OpDS 23ull /* DS */ #define OpFS 24ull /* FS */ #define OpGS 25ull /* GS */ +#define OpMem8 26ull /* 8-bit zero extended memory operand */ #define OpBits 5 /* Width of operand field */ #define OpMask ((1ull << OpBits) - 1) @@ -101,6 +102,7 @@ #define SrcAcc (OpAcc << SrcShift) #define SrcImmU16 (OpImmU16 << SrcShift) #define SrcDX (OpDX << SrcShift) +#define SrcMem8 (OpMem8 << SrcShift) #define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ @@ -858,8 +860,7 @@ static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, } static void decode_register_operand(struct x86_emulate_ctxt *ctxt, - struct operand *op, - int inhibit_bytereg) + struct operand *op) { unsigned reg = ctxt->modrm_reg; int highbyte_regs = ctxt->rex_prefix == 0; @@ -876,7 +877,7 @@ static void decode_register_operand(struct x86_emulate_ctxt *ctxt, } op->type = OP_REG; - if ((ctxt->d & ByteOp) && !inhibit_bytereg) { + if (ctxt->d & ByteOp) { op->addr.reg = decode_register(reg, ctxt->regs, highbyte_regs); op->bytes = 1; } else { @@ -1151,6 +1152,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } +static int read_interrupt_descriptor(struct x86_emulate_ctxt *ctxt, + u16 index, struct desc_struct *desc) +{ + struct desc_ptr dt; + ulong addr; + + ctxt->ops->get_idt(ctxt, &dt); + + if (dt.size < index * 8 + 7) + return emulate_gp(ctxt, index << 3 | 0x2); + + addr = dt.address + index * 8; + return ctxt->ops->read_std(ctxt, addr, desc, sizeof *desc, + &ctxt->exception); +} + static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, u16 selector, struct desc_ptr *dt) { @@ -1227,6 +1244,8 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, seg_desc.type = 3; seg_desc.p = 1; seg_desc.s = 1; + if (ctxt->mode == X86EMUL_MODE_VM86) + seg_desc.dpl = 3; goto load; } @@ -1891,6 +1910,17 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, ss->p = 1; } +static bool vendor_intel(struct x86_emulate_ctxt *ctxt) +{ + u32 eax, ebx, ecx, edx; + + eax = ecx = 0; + return ctxt->ops->get_cpuid(ctxt, &eax, &ebx, &ecx, &edx) + && ebx == X86EMUL_CPUID_VENDOR_GenuineIntel_ebx + && ecx == X86EMUL_CPUID_VENDOR_GenuineIntel_ecx + && edx == X86EMUL_CPUID_VENDOR_GenuineIntel_edx; +} + static bool em_syscall_is_enabled(struct x86_emulate_ctxt *ctxt) { struct x86_emulate_ops *ops = ctxt->ops; @@ -2007,6 +2037,14 @@ static int em_sysenter(struct x86_emulate_ctxt *ctxt) if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); + /* + * Not recognized on AMD in compat mode (but is recognized in legacy + * mode). + */ + if ((ctxt->mode == X86EMUL_MODE_PROT32) && (efer & EFER_LMA) + && !vendor_intel(ctxt)) + return emulate_ud(ctxt); + /* XXX sysenter/sysexit have not been tested in 64bit mode. * Therefore, we inject an #UD. */ @@ -2306,6 +2344,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, 0); ctxt->_eip = tss->eip; ctxt->eflags = tss->eflags | 2; + + /* General purpose registers */ ctxt->regs[VCPU_REGS_RAX] = tss->eax; ctxt->regs[VCPU_REGS_RCX] = tss->ecx; ctxt->regs[VCPU_REGS_RDX] = tss->edx; @@ -2328,6 +2368,24 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); /* + * If we're switching between Protected Mode and VM86, we need to make + * sure to update the mode before loading the segment descriptors so + * that the selectors are interpreted correctly. + * + * Need to get rflags to the vcpu struct immediately because it + * influences the CPL which is checked at least when loading the segment + * descriptors and when pushing an error code to the new kernel stack. + * + * TODO Introduce a separate ctxt->ops->set_cpl callback + */ + if (ctxt->eflags & X86_EFLAGS_VM) + ctxt->mode = X86EMUL_MODE_VM86; + else + ctxt->mode = X86EMUL_MODE_PROT32; + + ctxt->ops->set_rflags(ctxt, ctxt->eflags); + + /* * Now load segment descriptors. If fault happenes at this stage * it is handled in a context of new task */ @@ -2401,7 +2459,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, } static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ops *ops = ctxt->ops; @@ -2423,12 +2481,35 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, /* FIXME: check that next_tss_desc is tss */ - if (reason != TASK_SWITCH_IRET) { - if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt) > next_tss_desc.dpl) - return emulate_gp(ctxt, 0); + /* + * Check privileges. The three cases are task switch caused by... + * + * 1. jmp/call/int to task gate: Check against DPL of the task gate + * 2. Exception/IRQ/iret: No check is performed + * 3. jmp/call to TSS: Check agains DPL of the TSS + */ + if (reason == TASK_SWITCH_GATE) { + if (idt_index != -1) { + /* Software interrupts */ + struct desc_struct task_gate_desc; + int dpl; + + ret = read_interrupt_descriptor(ctxt, idt_index, + &task_gate_desc); + if (ret != X86EMUL_CONTINUE) + return ret; + + dpl = task_gate_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, (idt_index << 3) | 0x2); + } + } else if (reason != TASK_SWITCH_IRET) { + int dpl = next_tss_desc.dpl; + if ((tss_selector & 3) > dpl || ops->cpl(ctxt) > dpl) + return emulate_gp(ctxt, tss_selector); } + desc_limit = desc_limit_scaled(&next_tss_desc); if (!next_tss_desc.p || ((desc_limit < 0x67 && (next_tss_desc.type & 8)) || @@ -2481,7 +2562,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, } int emulator_task_switch(struct x86_emulate_ctxt *ctxt, - u16 tss_selector, int reason, + u16 tss_selector, int idt_index, int reason, bool has_error_code, u32 error_code) { int rc; @@ -2489,7 +2570,7 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, ctxt->_eip = ctxt->eip; ctxt->dst.type = OP_NONE; - rc = emulator_do_task_switch(ctxt, tss_selector, reason, + rc = emulator_do_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (rc == X86EMUL_CONTINUE) @@ -3514,13 +3595,13 @@ static struct opcode twobyte_table[256] = { I(DstMem | SrcReg | ModRM | BitOp | Lock, em_btr), I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, G(BitOp, group8), I(DstMem | SrcReg | ModRM | BitOp | Lock | PageTable, em_btc), I(DstReg | SrcMem | ModRM, em_bsf), I(DstReg | SrcMem | ModRM, em_bsr), - D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), + D(DstReg | SrcMem8 | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xC0 - 0xCF */ D2bv(DstMem | SrcReg | ModRM | Lock), N, D(DstMem | SrcReg | ModRM | Mov), @@ -3602,9 +3683,7 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, switch (d) { case OpReg: - decode_register_operand(ctxt, op, - op == &ctxt->dst && - ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); + decode_register_operand(ctxt, op); break; case OpImmUByte: rc = decode_imm(ctxt, op, 1, false); @@ -3656,6 +3735,9 @@ static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, case OpImm: rc = decode_imm(ctxt, op, imm_size(ctxt), true); break; + case OpMem8: + ctxt->memop.bytes = 1; + goto mem_common; case OpMem16: ctxt->memop.bytes = 2; goto mem_common; diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index b6a7353..81cf4fa 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -307,6 +307,7 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) if (val & 0x10) { s->init4 = val & 1; s->last_irr = 0; + s->irr &= s->elcr; s->imr = 0; s->priority_add = 0; s->special_mask = 0; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 31bfc69..8584322 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -433,7 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_INIT: - if (level) { + if (!trig_mode || level) { result = 1; vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -731,7 +731,7 @@ static void start_apic_timer(struct kvm_lapic *apic) u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; u64 ns = 0; struct kvm_vcpu *vcpu = apic->vcpu; - unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long this_tsc_khz = vcpu->arch.virtual_tsc_khz; unsigned long flags; if (unlikely(!tscdeadline || !this_tsc_khz)) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 224b02c..4cb1642 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -688,9 +688,8 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, { unsigned long idx; - idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); - return &slot->lpage_info[level - 2][idx]; + idx = gfn_to_index(gfn, slot->base_gfn, level); + return &slot->arch.lpage_info[level - 2][idx]; } static void account_shadowed(struct kvm *kvm, gfn_t gfn) @@ -946,7 +945,7 @@ static void pte_list_walk(unsigned long *pte_list, pte_list_walk_fn fn) } } -static unsigned long *__gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level, +static unsigned long *__gfn_to_rmap(gfn_t gfn, int level, struct kvm_memory_slot *slot) { struct kvm_lpage_info *linfo; @@ -966,7 +965,7 @@ static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) struct kvm_memory_slot *slot; slot = gfn_to_memslot(kvm, gfn); - return __gfn_to_rmap(kvm, gfn, level, slot); + return __gfn_to_rmap(gfn, level, slot); } static bool rmap_can_add(struct kvm_vcpu *vcpu) @@ -988,7 +987,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) return pte_list_add(vcpu, spte, rmapp); } -static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) +static u64 *rmap_next(unsigned long *rmapp, u64 *spte) { return pte_list_next(rmapp, spte); } @@ -1018,8 +1017,8 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, u64 *spte; int i, write_protected = 0; - rmapp = __gfn_to_rmap(kvm, gfn, PT_PAGE_TABLE_LEVEL, slot); - spte = rmap_next(kvm, rmapp, NULL); + rmapp = __gfn_to_rmap(gfn, PT_PAGE_TABLE_LEVEL, slot); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); @@ -1027,14 +1026,14 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, mmu_spte_update(spte, *spte & ~PT_WRITABLE_MASK); write_protected = 1; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } /* check for huge page mappings */ for (i = PT_DIRECTORY_LEVEL; i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { - rmapp = __gfn_to_rmap(kvm, gfn, i, slot); - spte = rmap_next(kvm, rmapp, NULL); + rmapp = __gfn_to_rmap(gfn, i, slot); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!(*spte & PT_PRESENT_MASK)); BUG_ON(!is_large_pte(*spte)); @@ -1045,7 +1044,7 @@ int kvm_mmu_rmap_write_protect(struct kvm *kvm, u64 gfn, spte = NULL; write_protected = 1; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } } @@ -1066,7 +1065,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, unsigned long *rmapp, u64 *spte; int need_tlb_flush = 0; - while ((spte = rmap_next(kvm, rmapp, NULL))) { + while ((spte = rmap_next(rmapp, NULL))) { BUG_ON(!(*spte & PT_PRESENT_MASK)); rmap_printk("kvm_rmap_unmap_hva: spte %p %llx\n", spte, *spte); drop_spte(kvm, spte); @@ -1085,14 +1084,14 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, WARN_ON(pte_huge(*ptep)); new_pfn = pte_pfn(*ptep); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { BUG_ON(!is_shadow_present_pte(*spte)); rmap_printk("kvm_set_pte_rmapp: spte %p %llx\n", spte, *spte); need_flush = 1; if (pte_write(*ptep)) { drop_spte(kvm, spte); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); } else { new_spte = *spte &~ (PT64_BASE_ADDR_MASK); new_spte |= (u64)new_pfn << PAGE_SHIFT; @@ -1102,7 +1101,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp, new_spte &= ~shadow_accessed_mask; mmu_spte_clear_track_bits(spte); mmu_spte_set(spte, new_spte); - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } } if (need_flush) @@ -1176,7 +1175,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) return kvm_unmap_rmapp(kvm, rmapp, data); - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { int _young; u64 _spte = *spte; @@ -1186,7 +1185,7 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, young = 1; clear_bit(PT_ACCESSED_SHIFT, (unsigned long *)spte); } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } return young; } @@ -1205,7 +1204,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, if (!shadow_accessed_mask) goto out; - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { u64 _spte = *spte; BUG_ON(!(_spte & PT_PRESENT_MASK)); @@ -1214,7 +1213,7 @@ static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, young = 1; break; } - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } out: return young; @@ -1391,11 +1390,6 @@ struct kvm_mmu_pages { unsigned int nr; }; -#define for_each_unsync_children(bitmap, idx) \ - for (idx = find_first_bit(bitmap, 512); \ - idx < 512; \ - idx = find_next_bit(bitmap, 512, idx+1)) - static int mmu_pages_add(struct kvm_mmu_pages *pvec, struct kvm_mmu_page *sp, int idx) { @@ -1417,7 +1411,7 @@ static int __mmu_unsync_walk(struct kvm_mmu_page *sp, { int i, ret, nr_unsync_leaf = 0; - for_each_unsync_children(sp->unsync_child_bitmap, i) { + for_each_set_bit(i, sp->unsync_child_bitmap, 512) { struct kvm_mmu_page *child; u64 ent = sp->spt[i]; @@ -1803,6 +1797,7 @@ static void drop_large_spte(struct kvm_vcpu *vcpu, u64 *sptep) { if (is_large_pte(*sptep)) { drop_spte(vcpu->kvm, sptep); + --vcpu->kvm->stat.lpages; kvm_flush_remote_tlbs(vcpu->kvm); } } @@ -3190,15 +3185,14 @@ static bool sync_mmio_spte(u64 *sptep, gfn_t gfn, unsigned access, #undef PTTYPE static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, - struct kvm_mmu *context, - int level) + struct kvm_mmu *context) { int maxphyaddr = cpuid_maxphyaddr(vcpu); u64 exb_bit_rsvd = 0; if (!context->nx) exb_bit_rsvd = rsvd_bits(63, 63); - switch (level) { + switch (context->root_level) { case PT32_ROOT_LEVEL: /* no rsvd bits for 2 level 4K page table entries */ context->rsvd_bits_mask[0][1] = 0; @@ -3256,8 +3250,9 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) { context->nx = is_nx(vcpu); + context->root_level = level; - reset_rsvds_bits_mask(vcpu, context, level); + reset_rsvds_bits_mask(vcpu, context); ASSERT(is_pae(vcpu)); context->new_cr3 = paging_new_cr3; @@ -3267,7 +3262,6 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, context->invlpg = paging64_invlpg; context->update_pte = paging64_update_pte; context->free = paging_free; - context->root_level = level; context->shadow_root_level = level; context->root_hpa = INVALID_PAGE; context->direct_map = false; @@ -3284,8 +3278,9 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, struct kvm_mmu *context) { context->nx = false; + context->root_level = PT32_ROOT_LEVEL; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); + reset_rsvds_bits_mask(vcpu, context); context->new_cr3 = paging_new_cr3; context->page_fault = paging32_page_fault; @@ -3294,7 +3289,6 @@ static int paging32_init_context(struct kvm_vcpu *vcpu, context->sync_page = paging32_sync_page; context->invlpg = paging32_invlpg; context->update_pte = paging32_update_pte; - context->root_level = PT32_ROOT_LEVEL; context->shadow_root_level = PT32E_ROOT_LEVEL; context->root_hpa = INVALID_PAGE; context->direct_map = false; @@ -3325,7 +3319,6 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->get_cr3 = get_cr3; context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; - context->nx = is_nx(vcpu); if (!is_paging(vcpu)) { context->nx = false; @@ -3333,19 +3326,19 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->root_level = 0; } else if (is_long_mode(vcpu)) { context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL); - context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT64_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging64_gva_to_gpa; } else if (is_pae(vcpu)) { context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL); - context->gva_to_gpa = paging64_gva_to_gpa; context->root_level = PT32E_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging64_gva_to_gpa; } else { context->nx = false; - reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL); - context->gva_to_gpa = paging32_gva_to_gpa; context->root_level = PT32_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, context); + context->gva_to_gpa = paging32_gva_to_gpa; } return 0; @@ -3408,18 +3401,18 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested; } else if (is_long_mode(vcpu)) { g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL); g_context->root_level = PT64_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else if (is_pae(vcpu)) { g_context->nx = is_nx(vcpu); - reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL); g_context->root_level = PT32E_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging64_gva_to_gpa_nested; } else { g_context->nx = false; - reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL); g_context->root_level = PT32_ROOT_LEVEL; + reset_rsvds_bits_mask(vcpu, g_context); g_context->gva_to_gpa = paging32_gva_to_gpa_nested; } @@ -3555,7 +3548,7 @@ static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, * If we're seeing too many writes to a page, it may no longer be a page table, * or we may be forking, in which case it is better to unmap the page. */ -static bool detect_write_flooding(struct kvm_mmu_page *sp, u64 *spte) +static bool detect_write_flooding(struct kvm_mmu_page *sp) { /* * Skip write-flooding detected for the sp whose level is 1, because @@ -3664,10 +3657,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mask.cr0_wp = mask.cr4_pae = mask.nxe = 1; for_each_gfn_indirect_valid_sp(vcpu->kvm, sp, gfn, node) { - spte = get_written_sptes(sp, gpa, &npte); - if (detect_write_misaligned(sp, gpa, bytes) || - detect_write_flooding(sp, spte)) { + detect_write_flooding(sp)) { zap_page |= !!kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list); ++vcpu->kvm->stat.mmu_flooded; diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index ea7b4fd..715da5a 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -200,13 +200,13 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slot = gfn_to_memslot(kvm, sp->gfn); rmapp = &slot->rmap[sp->gfn - slot->base_gfn]; - spte = rmap_next(kvm, rmapp, NULL); + spte = rmap_next(rmapp, NULL); while (spte) { if (is_writable_pte(*spte)) audit_printk(kvm, "shadow page has writable " "mappings: gfn %llx role %x\n", sp->gfn, sp->role.word); - spte = rmap_next(kvm, rmapp, spte); + spte = rmap_next(rmapp, spte); } } diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 7aad544..a73f0c1 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -33,10 +33,11 @@ static struct kvm_arch_event_perf_mapping { [4] = { 0x2e, 0x41, PERF_COUNT_HW_CACHE_MISSES }, [5] = { 0xc4, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [6] = { 0xc5, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, + [7] = { 0x00, 0x30, PERF_COUNT_HW_REF_CPU_CYCLES }, }; /* mapping between fixed pmc index and arch_events array */ -int fixed_pmc_events[] = {1, 0, 2}; +int fixed_pmc_events[] = {1, 0, 7}; static bool pmc_is_gp(struct kvm_pmc *pmc) { @@ -210,6 +211,9 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) unsigned config, type = PERF_TYPE_RAW; u8 event_select, unit_mask; + if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL) + printk_once("kvm pmu: pin control bit is ignored\n"); + pmc->eventsel = eventsel; stop_counter(pmc); @@ -220,7 +224,7 @@ static void reprogram_gp_counter(struct kvm_pmc *pmc, u64 eventsel) event_select = eventsel & ARCH_PERFMON_EVENTSEL_EVENT; unit_mask = (eventsel & ARCH_PERFMON_EVENTSEL_UMASK) >> 8; - if (!(event_select & (ARCH_PERFMON_EVENTSEL_EDGE | + if (!(eventsel & (ARCH_PERFMON_EVENTSEL_EDGE | ARCH_PERFMON_EVENTSEL_INV | ARCH_PERFMON_EVENTSEL_CMASK))) { config = find_arch_event(&pmc->vcpu->arch.pmu, event_select, @@ -413,7 +417,7 @@ int kvm_pmu_read_pmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data) struct kvm_pmc *counters; u64 ctr; - pmc &= (3u << 30) - 1; + pmc &= ~(3u << 30); if (!fixed && pmc >= pmu->nr_arch_gp_counters) return 1; if (fixed && pmc >= pmu->nr_arch_fixed_counters) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index e385214..e334389 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -111,6 +111,12 @@ struct nested_state { #define MSRPM_OFFSETS 16 static u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly; +/* + * Set osvw_len to higher value when updated Revision Guides + * are published and we know what the new status bits are + */ +static uint64_t osvw_len = 4, osvw_status; + struct vcpu_svm { struct kvm_vcpu vcpu; struct vmcb *vmcb; @@ -177,11 +183,13 @@ static bool npt_enabled = true; #else static bool npt_enabled; #endif -static int npt = 1; +/* allow nested paging (virtualized MMU) for all guests */ +static int npt = true; module_param(npt, int, S_IRUGO); -static int nested = 1; +/* allow nested virtualization in KVM/SVM */ +static int nested = true; module_param(nested, int, S_IRUGO); static void svm_flush_tlb(struct kvm_vcpu *vcpu); @@ -557,6 +565,27 @@ static void svm_init_erratum_383(void) erratum_383_found = true; } +static void svm_init_osvw(struct kvm_vcpu *vcpu) +{ + /* + * Guests should see errata 400 and 415 as fixed (assuming that + * HLT and IO instructions are intercepted). + */ + vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3; + vcpu->arch.osvw.status = osvw_status & ~(6ULL); + + /* + * By increasing VCPU's osvw.length to 3 we are telling the guest that + * all osvw.status bits inside that length, including bit 0 (which is + * reserved for erratum 298), are valid. However, if host processor's + * osvw_len is 0 then osvw_status[0] carries no information. We need to + * be conservative here and therefore we tell the guest that erratum 298 + * is present (because we really don't know). + */ + if (osvw_len == 0 && boot_cpu_data.x86 == 0x10) + vcpu->arch.osvw.status |= 1; +} + static int has_svm(void) { const char *msg; @@ -623,6 +652,36 @@ static int svm_hardware_enable(void *garbage) __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; } + + /* + * Get OSVW bits. + * + * Note that it is possible to have a system with mixed processor + * revisions and therefore different OSVW bits. If bits are not the same + * on different processors then choose the worst case (i.e. if erratum + * is present on one processor and not on another then assume that the + * erratum is present everywhere). + */ + if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) { + uint64_t len, status = 0; + int err; + + len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err); + if (!err) + status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS, + &err); + + if (err) + osvw_status = osvw_len = 0; + else { + if (len < osvw_len) + osvw_len = len; + osvw_status |= status; + osvw_status &= (1ULL << osvw_len) - 1; + } + } else + osvw_status = osvw_len = 0; + svm_init_erratum_383(); amd_pmu_enable_virt(); @@ -910,20 +969,25 @@ static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) return _tsc; } -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { struct vcpu_svm *svm = to_svm(vcpu); u64 ratio; u64 khz; - /* TSC scaling supported? */ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + svm->tsc_ratio = TSC_RATIO_DEFAULT; return; + } - /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ - if (user_tsc_khz == 0) { - vcpu->arch.virtual_tsc_khz = 0; - svm->tsc_ratio = TSC_RATIO_DEFAULT; + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); return; } @@ -938,7 +1002,6 @@ static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) user_tsc_khz); return; } - vcpu->arch.virtual_tsc_khz = user_tsc_khz; svm->tsc_ratio = ratio; } @@ -958,10 +1021,14 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { struct vcpu_svm *svm = to_svm(vcpu); + WARN_ON(adjustment < 0); + if (host) + adjustment = svm_scale_tsc(vcpu, adjustment); + svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; @@ -1191,6 +1258,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) if (kvm_vcpu_is_bsp(&svm->vcpu)) svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; + svm_init_osvw(&svm->vcpu); + return &svm->vcpu; free_page4: @@ -1268,6 +1337,21 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu) wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); } +static void svm_update_cpl(struct kvm_vcpu *vcpu) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int cpl; + + if (!is_protmode(vcpu)) + cpl = 0; + else if (svm->vmcb->save.rflags & X86_EFLAGS_VM) + cpl = 3; + else + cpl = svm->vmcb->save.cs.selector & 0x3; + + svm->vmcb->save.cpl = cpl; +} + static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) { return to_svm(vcpu)->vmcb->save.rflags; @@ -1275,7 +1359,11 @@ static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + unsigned long old_rflags = to_svm(vcpu)->vmcb->save.rflags; + to_svm(vcpu)->vmcb->save.rflags = rflags; + if ((old_rflags ^ rflags) & X86_EFLAGS_VM) + svm_update_cpl(vcpu); } static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) @@ -1543,9 +1631,7 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; } if (seg == VCPU_SREG_CS) - svm->vmcb->save.cpl - = (svm->vmcb->save.cs.attrib - >> SVM_SELECTOR_DPL_SHIFT) & 3; + svm_update_cpl(vcpu); mark_dirty(svm->vmcb, VMCB_SEG); } @@ -2735,7 +2821,10 @@ static int task_switch_interception(struct vcpu_svm *svm) (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) skip_emulated_instruction(&svm->vcpu); - if (kvm_task_switch(&svm->vcpu, tss_selector, reason, + if (int_type != SVM_EXITINTINFO_TYPE_SOFT) + int_vec = -1; + + if (kvm_task_switch(&svm->vcpu, tss_selector, int_vec, reason, has_error_code, error_code) == EMULATE_FAIL) { svm->vcpu.run->exit_reason = KVM_EXIT_INTERNAL_ERROR; svm->vcpu.run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 246490f..280751c 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -70,9 +70,6 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO); static bool __read_mostly vmm_exclusive = 1; module_param(vmm_exclusive, bool, S_IRUGO); -static bool __read_mostly yield_on_hlt = 1; -module_param(yield_on_hlt, bool, S_IRUGO); - static bool __read_mostly fasteoi = 1; module_param(fasteoi, bool, S_IRUGO); @@ -1655,17 +1652,6 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu) vmx_set_interrupt_shadow(vcpu, 0); } -static void vmx_clear_hlt(struct kvm_vcpu *vcpu) -{ - /* Ensure that we clear the HLT state in the VMCS. We don't need to - * explicitly skip the instruction because if the HLT state is set, then - * the instruction is already executing and RIP has already been - * advanced. */ - if (!yield_on_hlt && - vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT) - vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE); -} - /* * KVM wants to inject page-faults which it got to the guest. This function * checks whether in a nested guest, we need to inject them to L1 or L2. @@ -1678,7 +1664,7 @@ static int nested_pf_handled(struct kvm_vcpu *vcpu) struct vmcs12 *vmcs12 = get_vmcs12(vcpu); /* TODO: also check PFEC_MATCH/MASK, not just EB.PF. */ - if (!(vmcs12->exception_bitmap & PF_VECTOR)) + if (!(vmcs12->exception_bitmap & (1u << PF_VECTOR))) return 0; nested_vmx_vmexit(vcpu); @@ -1718,7 +1704,6 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, intr_info |= INTR_TYPE_HARD_EXCEPTION; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); - vmx_clear_hlt(vcpu); } static bool vmx_rdtscp_supported(void) @@ -1817,13 +1802,19 @@ u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) } /* - * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ - * ioctl. In this case the call-back should update internal vmx state to make - * the changes effective. + * Engage any workarounds for mis-matched TSC rates. Currently limited to + * software catchup for faster rates on slower CPUs. */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) { - /* Nothing to do here */ + if (!scale) + return; + + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + } else + WARN(1, "user requested TSC rate below hardware speed\n"); } /* @@ -1850,7 +1841,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) +static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) { u64 offset = vmcs_read64(TSC_OFFSET); vmcs_write64(TSC_OFFSET, offset + adjustment); @@ -2219,6 +2210,9 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) msr = find_msr_entry(vmx, msr_index); if (msr) { msr->data = data; + if (msr - vmx->guest_msrs < vmx->save_nmsrs) + kvm_set_shared_msr(msr->index, msr->data, + msr->mask); break; } ret = kvm_set_msr_common(vcpu, msr_index, data); @@ -2399,7 +2393,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) &_pin_based_exec_control) < 0) return -EIO; - min = + min = CPU_BASED_HLT_EXITING | #ifdef CONFIG_X86_64 CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING | @@ -2414,9 +2408,6 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) CPU_BASED_INVLPG_EXITING | CPU_BASED_RDPMC_EXITING; - if (yield_on_hlt) - min |= CPU_BASED_HLT_EXITING; - opt = CPU_BASED_TPR_SHADOW | CPU_BASED_USE_MSR_BITMAPS | CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; @@ -4003,7 +3994,6 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) } else intr |= INTR_TYPE_EXT_INTR; vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); - vmx_clear_hlt(vcpu); } static void vmx_inject_nmi(struct kvm_vcpu *vcpu) @@ -4035,7 +4025,6 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); - vmx_clear_hlt(vcpu); } static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) @@ -4672,9 +4661,10 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) bool has_error_code = false; u32 error_code = 0; u16 tss_selector; - int reason, type, idt_v; + int reason, type, idt_v, idt_index; idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK); + idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK); type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK); exit_qualification = vmcs_readl(EXIT_QUALIFICATION); @@ -4712,8 +4702,9 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) type != INTR_TYPE_NMI_INTR)) skip_emulated_instruction(vcpu); - if (kvm_task_switch(vcpu, tss_selector, reason, - has_error_code, error_code) == EMULATE_FAIL) { + if (kvm_task_switch(vcpu, tss_selector, + type == INTR_TYPE_SOFT_INTR ? idt_index : -1, reason, + has_error_code, error_code) == EMULATE_FAIL) { vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; vcpu->run->internal.ndata = 0; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 54696b5..4044ce0 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -97,6 +97,10 @@ EXPORT_SYMBOL_GPL(kvm_has_tsc_control); u32 kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ +static u32 tsc_tolerance_ppm = 250; +module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -969,50 +973,51 @@ static inline u64 get_kernel_ns(void) static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); unsigned long max_tsc_khz; -static inline int kvm_tsc_changes_freq(void) +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { - int cpu = get_cpu(); - int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && - cpufreq_quick_get(cpu) != 0; - put_cpu(); - return ret; + return pvclock_scale_delta(nsec, vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); } -u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +static u32 adjust_tsc_khz(u32 khz, s32 ppm) { - if (vcpu->arch.virtual_tsc_khz) - return vcpu->arch.virtual_tsc_khz; - else - return __this_cpu_read(cpu_tsc_khz); + u64 v = (u64)khz * (1000000 + ppm); + do_div(v, 1000000); + return v; } -static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) +static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { - u64 ret; - - WARN_ON(preemptible()); - if (kvm_tsc_changes_freq()) - printk_once(KERN_WARNING - "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * vcpu_tsc_khz(vcpu); - do_div(ret, USEC_PER_SEC); - return ret; -} + u32 thresh_lo, thresh_hi; + int use_scaling = 0; -static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) -{ /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &vcpu->arch.tsc_catchup_shift, - &vcpu->arch.tsc_catchup_mult); + &vcpu->arch.virtual_tsc_shift, + &vcpu->arch.virtual_tsc_mult); + vcpu->arch.virtual_tsc_khz = this_tsc_khz; + + /* + * Compute the variation in TSC rate which is acceptable + * within the range of tolerance and decide if the + * rate being applied is within that bounds of the hardware + * rate. If so, no scaling or compensation need be done. + */ + thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm); + thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm); + if (this_tsc_khz < thresh_lo || this_tsc_khz > thresh_hi) { + pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); + use_scaling = 1; + } + kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { - u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->arch.tsc_catchup_mult, - vcpu->arch.tsc_catchup_shift); - tsc += vcpu->arch.last_tsc_write; + u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.this_tsc_nsec, + vcpu->arch.virtual_tsc_mult, + vcpu->arch.virtual_tsc_shift); + tsc += vcpu->arch.this_tsc_write; return tsc; } @@ -1021,48 +1026,88 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) struct kvm *kvm = vcpu->kvm; u64 offset, ns, elapsed; unsigned long flags; - s64 sdiff; + s64 usdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; - sdiff = data - kvm->arch.last_tsc_write; - if (sdiff < 0) - sdiff = -sdiff; + + /* n.b - signed multiplication and division required */ + usdiff = data - kvm->arch.last_tsc_write; +#ifdef CONFIG_X86_64 + usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz; +#else + /* do_div() only does unsigned */ + asm("idivl %2; xor %%edx, %%edx" + : "=A"(usdiff) + : "A"(usdiff * 1000), "rm"(vcpu->arch.virtual_tsc_khz)); +#endif + do_div(elapsed, 1000); + usdiff -= elapsed; + if (usdiff < 0) + usdiff = -usdiff; /* - * Special case: close write to TSC within 5 seconds of - * another CPU is interpreted as an attempt to synchronize - * The 5 seconds is to accommodate host load / swapping as - * well as any reset of TSC during the boot process. - * - * In that case, for a reliable TSC, we can match TSC offsets, - * or make a best guest using elapsed value. - */ - if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && - elapsed < 5ULL * NSEC_PER_SEC) { + * Special case: TSC write with a small delta (1 second) of virtual + * cycle time against real time is interpreted as an attempt to + * synchronize the CPU. + * + * For a reliable TSC, we can match TSC offsets, and for an unstable + * TSC, we add elapsed time in this computation. We could let the + * compensation code attempt to catch up if we fall behind, but + * it's better to try to match offsets from the beginning. + */ + if (usdiff < USEC_PER_SEC && + vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { if (!check_tsc_unstable()) { - offset = kvm->arch.last_tsc_offset; + offset = kvm->arch.cur_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { u64 delta = nsec_to_cycles(vcpu, elapsed); - offset += delta; + data += delta; + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } - ns = kvm->arch.last_tsc_nsec; + } else { + /* + * We split periods of matched TSC writes into generations. + * For each generation, we track the original measured + * nanosecond time, offset, and write, so if TSCs are in + * sync, we can match exact offset, and if not, we can match + * exact software computaion in compute_guest_tsc() + * + * These values are tracked in kvm->arch.cur_xxx variables. + */ + kvm->arch.cur_tsc_generation++; + kvm->arch.cur_tsc_nsec = ns; + kvm->arch.cur_tsc_write = data; + kvm->arch.cur_tsc_offset = offset; + pr_debug("kvm: new tsc generation %u, clock %llu\n", + kvm->arch.cur_tsc_generation, data); } + + /* + * We also track th most recent recorded KHZ, write and time to + * allow the matching interval to be extended at each write. + */ kvm->arch.last_tsc_nsec = ns; kvm->arch.last_tsc_write = data; - kvm->arch.last_tsc_offset = offset; - kvm_x86_ops->write_tsc_offset(vcpu, offset); - raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); + kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; /* Reset of TSC must disable overshoot protection below */ vcpu->arch.hv_clock.tsc_timestamp = 0; - vcpu->arch.last_tsc_write = data; - vcpu->arch.last_tsc_nsec = ns; + vcpu->arch.last_guest_tsc = data; + + /* Keep track of which generation this VCPU has synchronized to */ + vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; + vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; + vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; + + kvm_x86_ops->write_tsc_offset(vcpu, offset); + raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); } + EXPORT_SYMBOL_GPL(kvm_write_tsc); static int kvm_guest_time_update(struct kvm_vcpu *v) @@ -1078,7 +1123,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); - this_tsc_khz = vcpu_tsc_khz(v); + this_tsc_khz = __get_cpu_var(cpu_tsc_khz); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -1098,7 +1143,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) if (vcpu->tsc_catchup) { u64 tsc = compute_guest_tsc(v, kernel_ns); if (tsc > tsc_timestamp) { - kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp); + adjust_tsc_offset_guest(v, tsc - tsc_timestamp); tsc_timestamp = tsc; } } @@ -1130,7 +1175,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) * observed by the guest and ensure the new system time is greater. */ max_kernel_ns = 0; - if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) { + if (vcpu->hv_clock.tsc_timestamp) { max_kernel_ns = vcpu->last_guest_tsc - vcpu->hv_clock.tsc_timestamp; max_kernel_ns = pvclock_scale_delta(max_kernel_ns, @@ -1504,6 +1549,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) case MSR_K7_HWCR: data &= ~(u64)0x40; /* ignore flush filter disable */ data &= ~(u64)0x100; /* ignore ignne emulation enable */ + data &= ~(u64)0x8; /* ignore TLB cache disable */ if (data != 0) { pr_unimpl(vcpu, "unimplemented HWCR wrmsr: 0x%llx\n", data); @@ -1676,6 +1722,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) */ pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data); break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.length = data; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + vcpu->arch.osvw.status = data; + break; default: if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) return xen_hvm_config(vcpu, data); @@ -1960,6 +2016,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) */ data = 0xbe702111; break; + case MSR_AMD64_OSVW_ID_LENGTH: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.length; + break; + case MSR_AMD64_OSVW_STATUS: + if (!guest_cpuid_has_osvw(vcpu)) + return 1; + data = vcpu->arch.osvw.status; + break; default: if (kvm_pmu_msr(vcpu, msr)) return kvm_pmu_get_msr(vcpu, msr, pdata); @@ -2080,6 +2146,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: case KVM_CAP_GET_TSC_KHZ: + case KVM_CAP_PCI_2_3: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2214,19 +2281,23 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) } kvm_x86_ops->vcpu_load(vcpu, cpu); - if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { - /* Make sure TSC doesn't go backwards */ - s64 tsc_delta; - u64 tsc; - tsc = kvm_x86_ops->read_l1_tsc(vcpu); - tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : - tsc - vcpu->arch.last_guest_tsc; + /* Apply any externally detected TSC adjustments (due to suspend) */ + if (unlikely(vcpu->arch.tsc_offset_adjustment)) { + adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment); + vcpu->arch.tsc_offset_adjustment = 0; + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + } + if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { + s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : + native_read_tsc() - vcpu->arch.last_host_tsc; if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta); + u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + vcpu->arch.last_guest_tsc); + kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); @@ -2243,7 +2314,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + vcpu->arch.last_host_tsc = native_read_tsc(); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -2785,26 +2856,21 @@ long kvm_arch_vcpu_ioctl(struct file *filp, u32 user_tsc_khz; r = -EINVAL; - if (!kvm_has_tsc_control) - break; - user_tsc_khz = (u32)arg; if (user_tsc_khz >= kvm_max_guest_tsc_khz) goto out; - kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + if (user_tsc_khz == 0) + user_tsc_khz = tsc_khz; + + kvm_set_tsc_khz(vcpu, user_tsc_khz); r = 0; goto out; } case KVM_GET_TSC_KHZ: { - r = -EIO; - if (check_tsc_unstable()) - goto out; - - r = vcpu_tsc_khz(vcpu); - + r = vcpu->arch.virtual_tsc_khz; goto out; } default: @@ -2815,6 +2881,11 @@ out: return r; } +int kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) +{ + return VM_FAULT_SIGBUS; +} + static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) { int ret; @@ -2998,6 +3069,8 @@ static void write_protect_slot(struct kvm *kvm, unsigned long *dirty_bitmap, unsigned long nr_dirty_pages) { + spin_lock(&kvm->mmu_lock); + /* Not many dirty pages compared to # of shadow pages. */ if (nr_dirty_pages < kvm->arch.n_used_mmu_pages) { unsigned long gfn_offset; @@ -3005,16 +3078,13 @@ static void write_protect_slot(struct kvm *kvm, for_each_set_bit(gfn_offset, dirty_bitmap, memslot->npages) { unsigned long gfn = memslot->base_gfn + gfn_offset; - spin_lock(&kvm->mmu_lock); kvm_mmu_rmap_write_protect(kvm, gfn, memslot); - spin_unlock(&kvm->mmu_lock); } kvm_flush_remote_tlbs(kvm); - } else { - spin_lock(&kvm->mmu_lock); + } else kvm_mmu_slot_remove_write_access(kvm, memslot->id); - spin_unlock(&kvm->mmu_lock); - } + + spin_unlock(&kvm->mmu_lock); } /* @@ -3133,6 +3203,9 @@ long kvm_arch_vm_ioctl(struct file *filp, r = -EEXIST; if (kvm->arch.vpic) goto create_irqchip_unlock; + r = -EINVAL; + if (atomic_read(&kvm->online_vcpus)) + goto create_irqchip_unlock; r = -ENOMEM; vpic = kvm_create_pic(kvm); if (vpic) { @@ -4063,6 +4136,11 @@ static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) return res; } +static void emulator_set_rflags(struct x86_emulate_ctxt *ctxt, ulong val) +{ + kvm_set_rflags(emul_to_vcpu(ctxt), val); +} + static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) { return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); @@ -4244,6 +4322,7 @@ static struct x86_emulate_ops emulate_ops = { .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, + .set_rflags = emulator_set_rflags, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, @@ -5288,6 +5367,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) profile_hit(KVM_PROFILING, (void *)rip); } + if (unlikely(vcpu->arch.tsc_always_catchup)) + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); kvm_lapic_sync_from_vapic(vcpu); @@ -5587,15 +5668,15 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, return 0; } -int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, - bool has_error_code, u32 error_code) +int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, + int reason, bool has_error_code, u32 error_code) { struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; int ret; init_emulate_ctxt(vcpu); - ret = emulator_task_switch(ctxt, tss_selector, reason, + ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, has_error_code, error_code); if (ret) @@ -5928,13 +6009,88 @@ int kvm_arch_hardware_enable(void *garbage) struct kvm *kvm; struct kvm_vcpu *vcpu; int i; + int ret; + u64 local_tsc; + u64 max_tsc = 0; + bool stable, backwards_tsc = false; kvm_shared_msr_cpu_online(); - list_for_each_entry(kvm, &vm_list, vm_list) - kvm_for_each_vcpu(i, vcpu, kvm) - if (vcpu->cpu == smp_processor_id()) - kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); - return kvm_x86_ops->hardware_enable(garbage); + ret = kvm_x86_ops->hardware_enable(garbage); + if (ret != 0) + return ret; + + local_tsc = native_read_tsc(); + stable = !check_tsc_unstable(); + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + if (!stable && vcpu->cpu == smp_processor_id()) + set_bit(KVM_REQ_CLOCK_UPDATE, &vcpu->requests); + if (stable && vcpu->arch.last_host_tsc > local_tsc) { + backwards_tsc = true; + if (vcpu->arch.last_host_tsc > max_tsc) + max_tsc = vcpu->arch.last_host_tsc; + } + } + } + + /* + * Sometimes, even reliable TSCs go backwards. This happens on + * platforms that reset TSC during suspend or hibernate actions, but + * maintain synchronization. We must compensate. Fortunately, we can + * detect that condition here, which happens early in CPU bringup, + * before any KVM threads can be running. Unfortunately, we can't + * bring the TSCs fully up to date with real time, as we aren't yet far + * enough into CPU bringup that we know how much real time has actually + * elapsed; our helper function, get_kernel_ns() will be using boot + * variables that haven't been updated yet. + * + * So we simply find the maximum observed TSC above, then record the + * adjustment to TSC in each VCPU. When the VCPU later gets loaded, + * the adjustment will be applied. Note that we accumulate + * adjustments, in case multiple suspend cycles happen before some VCPU + * gets a chance to run again. In the event that no KVM threads get a + * chance to run, we will miss the entire elapsed period, as we'll have + * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may + * loose cycle time. This isn't too big a deal, since the loss will be + * uniform across all VCPUs (not to mention the scenario is extremely + * unlikely). It is possible that a second hibernate recovery happens + * much faster than a first, causing the observed TSC here to be + * smaller; this would require additional padding adjustment, which is + * why we set last_host_tsc to the local tsc observed here. + * + * N.B. - this code below runs only on platforms with reliable TSC, + * as that is the only way backwards_tsc is set above. Also note + * that this runs for ALL vcpus, which is not a bug; all VCPUs should + * have the same delta_cyc adjustment applied if backwards_tsc + * is detected. Note further, this adjustment is only done once, + * as we reset last_host_tsc on all VCPUs to stop this from being + * called multiple times (one for each physical CPU bringup). + * + * Platforms with unnreliable TSCs don't have to deal with this, they + * will be compensated by the logic in vcpu_load, which sets the TSC to + * catchup mode. This will catchup all VCPUs to real time, but cannot + * guarantee that they stay in perfect synchronization. + */ + if (backwards_tsc) { + u64 delta_cyc = max_tsc - local_tsc; + list_for_each_entry(kvm, &vm_list, vm_list) { + kvm_for_each_vcpu(i, vcpu, kvm) { + vcpu->arch.tsc_offset_adjustment += delta_cyc; + vcpu->arch.last_host_tsc = local_tsc; + } + + /* + * We have to disable TSC offset matching.. if you were + * booting a VM while issuing an S4 host suspend.... + * you may have some problem. Solving this issue is + * left as an exercise to the reader. + */ + kvm->arch.last_tsc_nsec = 0; + kvm->arch.last_tsc_write = 0; + } + + } + return 0; } void kvm_arch_hardware_disable(void *garbage) @@ -5958,6 +6114,11 @@ void kvm_arch_check_processor_compat(void *rtn) kvm_x86_ops->check_processor_compatibility(rtn); } +bool kvm_vcpu_compatible(struct kvm_vcpu *vcpu) +{ + return irqchip_in_kernel(vcpu->kvm) == (vcpu->arch.apic != NULL); +} + int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) { struct page *page; @@ -5980,7 +6141,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - kvm_init_tsc_catchup(vcpu, max_tsc_khz); + kvm_set_tsc_khz(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) @@ -6032,8 +6193,11 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) free_page((unsigned long)vcpu->arch.pio_data); } -int kvm_arch_init_vm(struct kvm *kvm) +int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) { + if (type) + return -EINVAL; + INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); @@ -6093,6 +6257,65 @@ void kvm_arch_destroy_vm(struct kvm *kvm) put_page(kvm->arch.ept_identity_pagetable); } +void kvm_arch_free_memslot(struct kvm_memory_slot *free, + struct kvm_memory_slot *dont) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + if (!dont || free->arch.lpage_info[i] != dont->arch.lpage_info[i]) { + vfree(free->arch.lpage_info[i]); + free->arch.lpage_info[i] = NULL; + } + } +} + +int kvm_arch_create_memslot(struct kvm_memory_slot *slot, unsigned long npages) +{ + int i; + + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + unsigned long ugfn; + int lpages; + int level = i + 2; + + lpages = gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; + + slot->arch.lpage_info[i] = + vzalloc(lpages * sizeof(*slot->arch.lpage_info[i])); + if (!slot->arch.lpage_info[i]) + goto out_free; + + if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][0].write_count = 1; + if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) + slot->arch.lpage_info[i][lpages - 1].write_count = 1; + ugfn = slot->userspace_addr >> PAGE_SHIFT; + /* + * If the gfn and userspace address are not aligned wrt each + * other, or if explicitly asked to, disable large page + * support for this slot + */ + if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1) || + !kvm_largepages_enabled()) { + unsigned long j; + + for (j = 0; j < lpages; ++j) + slot->arch.lpage_info[i][j].write_count = 1; + } + } + + return 0; + +out_free: + for (i = 0; i < KVM_NR_PAGE_SIZES - 1; ++i) { + vfree(slot->arch.lpage_info[i]); + slot->arch.lpage_info[i] = NULL; + } + return -ENOMEM; +} + int kvm_arch_prepare_memory_region(struct kvm *kvm, struct kvm_memory_slot *memslot, struct kvm_memory_slot old, diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 6cabf65..4f0cec7 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -12,7 +12,6 @@ #include <asm/page_types.h> #include <asm/sections.h> #include <asm/setup.h> -#include <asm/system.h> #include <asm/tlbflush.h> #include <asm/tlb.h> #include <asm/proto.h> diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 8663f6c..575d86f 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -35,7 +35,6 @@ #include <asm/asm.h> #include <asm/bios_ebda.h> #include <asm/processor.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/dma.h> diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 436a030..fc18be0 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -35,7 +35,6 @@ #include <asm/processor.h> #include <asm/bios_ebda.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> diff --git a/arch/x86/mm/kmemcheck/selftest.c b/arch/x86/mm/kmemcheck/selftest.c index 036efbe..aef7140 100644 --- a/arch/x86/mm/kmemcheck/selftest.c +++ b/arch/x86/mm/kmemcheck/selftest.c @@ -1,3 +1,4 @@ +#include <linux/bug.h> #include <linux/kernel.h> #include "opcode.h" diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c index cac7184..a69bcb8 100644 --- a/arch/x86/mm/pgtable_32.c +++ b/arch/x86/mm/pgtable_32.c @@ -10,7 +10,6 @@ #include <linux/spinlock.h> #include <linux/module.h> -#include <asm/system.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> #include <asm/fixmap.h> diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c index 49a5cb5..ed2835e 100644 --- a/arch/x86/pci/acpi.c +++ b/arch/x86/pci/acpi.c @@ -416,7 +416,12 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_pci_root *root) kfree(sd); } else { get_current_resources(device, busnum, domain, &resources); - if (list_empty(&resources)) + + /* + * _CRS with no apertures is normal, so only fall back to + * defaults or native bridge info if we're ignoring _CRS. + */ + if (!pci_use_crs) x86_pci_root_bus_resources(busnum, &resources); bus = pci_create_root_bus(NULL, busnum, &pci_root_ops, sd, &resources); diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 6dd8955..d0e6e40 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -164,11 +164,11 @@ DECLARE_PCI_FIXUP_RESUME(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8367_0, pci_fixup_ */ static void __devinit pci_fixup_transparent_bridge(struct pci_dev *dev) { - if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && - (dev->device & 0xff00) == 0x2400) + if ((dev->device & 0xff00) == 0x2400) dev->transparent = 1; } -DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixup_transparent_bridge); +DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, + PCI_CLASS_BRIDGE_PCI, 8, pci_fixup_transparent_bridge); /* * Fixup for C1 Halt Disconnect problem on nForce2 systems. @@ -322,9 +322,6 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev) struct pci_bus *bus; u16 config; - if ((pdev->class >> 8) != PCI_CLASS_DISPLAY_VGA) - return; - /* Is VGA routed to us? */ bus = pdev->bus; while (bus) { @@ -353,7 +350,8 @@ static void __devinit pci_fixup_video(struct pci_dev *pdev) dev_printk(KERN_DEBUG, &pdev->dev, "Boot video device\n"); } } -DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_video); +DECLARE_PCI_FIXUP_CLASS_FINAL(PCI_ANY_ID, PCI_ANY_ID, + PCI_CLASS_DISPLAY_VGA, 8, pci_fixup_video); static const struct dmi_system_id __devinitconst msi_k8t_dmi_table[] = { diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 91821a1..831971e 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -39,6 +39,87 @@ #include <asm/io_apic.h> +/* + * This list of dynamic mappings is for temporarily maintaining + * original BIOS BAR addresses for possible reinstatement. + */ +struct pcibios_fwaddrmap { + struct list_head list; + struct pci_dev *dev; + resource_size_t fw_addr[DEVICE_COUNT_RESOURCE]; +}; + +static LIST_HEAD(pcibios_fwaddrmappings); +static DEFINE_SPINLOCK(pcibios_fwaddrmap_lock); + +/* Must be called with 'pcibios_fwaddrmap_lock' lock held. */ +static struct pcibios_fwaddrmap *pcibios_fwaddrmap_lookup(struct pci_dev *dev) +{ + struct pcibios_fwaddrmap *map; + + WARN_ON(!spin_is_locked(&pcibios_fwaddrmap_lock)); + + list_for_each_entry(map, &pcibios_fwaddrmappings, list) + if (map->dev == dev) + return map; + + return NULL; +} + +static void +pcibios_save_fw_addr(struct pci_dev *dev, int idx, resource_size_t fw_addr) +{ + unsigned long flags; + struct pcibios_fwaddrmap *map; + + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); + map = pcibios_fwaddrmap_lookup(dev); + if (!map) { + spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); + map = kzalloc(sizeof(*map), GFP_KERNEL); + if (!map) + return; + + map->dev = pci_dev_get(dev); + map->fw_addr[idx] = fw_addr; + INIT_LIST_HEAD(&map->list); + + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); + list_add_tail(&map->list, &pcibios_fwaddrmappings); + } else + map->fw_addr[idx] = fw_addr; + spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); +} + +resource_size_t pcibios_retrieve_fw_addr(struct pci_dev *dev, int idx) +{ + unsigned long flags; + struct pcibios_fwaddrmap *map; + resource_size_t fw_addr = 0; + + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); + map = pcibios_fwaddrmap_lookup(dev); + if (map) + fw_addr = map->fw_addr[idx]; + spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); + + return fw_addr; +} + +static void pcibios_fw_addr_list_del(void) +{ + unsigned long flags; + struct pcibios_fwaddrmap *entry, *next; + + spin_lock_irqsave(&pcibios_fwaddrmap_lock, flags); + list_for_each_entry_safe(entry, next, &pcibios_fwaddrmappings, list) { + list_del(&entry->list); + pci_dev_put(entry->dev); + kfree(entry); + } + spin_unlock_irqrestore(&pcibios_fwaddrmap_lock, flags); +} + static int skip_isa_ioresource_align(struct pci_dev *dev) { @@ -182,7 +263,8 @@ static void __init pcibios_allocate_resources(int pass) idx, r, disabled, pass); if (pci_claim_resource(dev, idx) < 0) { /* We'll assign a new address later */ - dev->fw_addr[idx] = r->start; + pcibios_save_fw_addr(dev, + idx, r->start); r->end -= r->start; r->start = 0; } @@ -228,6 +310,7 @@ static int __init pcibios_assign_resources(void) } pci_assign_unassigned_resources(); + pcibios_fw_addr_list_del(); return 0; } diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c index cb29191..140942f 100644 --- a/arch/x86/pci/mrst.c +++ b/arch/x86/pci/mrst.c @@ -43,6 +43,8 @@ #define PCI_FIXED_BAR_4_SIZE 0x14 #define PCI_FIXED_BAR_5_SIZE 0x1c +static int pci_soc_mode = 0; + /** * fixed_bar_cap - return the offset of the fixed BAR cap if found * @bus: PCI bus @@ -148,7 +150,9 @@ static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg) */ if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE) return 0; - if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0))) + if (bus == 0 && (devfn == PCI_DEVFN(2, 0) + || devfn == PCI_DEVFN(0, 0) + || devfn == PCI_DEVFN(3, 0))) return 1; return 0; /* langwell on others */ } @@ -231,14 +235,43 @@ struct pci_ops pci_mrst_ops = { */ int __init pci_mrst_init(void) { - printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n"); + printk(KERN_INFO "Intel MID platform detected, using MID PCI ops\n"); pci_mmcfg_late_init(); pcibios_enable_irq = mrst_pci_irq_enable; pci_root_ops = pci_mrst_ops; + pci_soc_mode = 1; /* Continue with standard init */ return 1; } +/* Langwell devices are not true pci devices, they are not subject to 10 ms + * d3 to d0 delay required by pci spec. + */ +static void __devinit pci_d3delay_fixup(struct pci_dev *dev) +{ + /* PCI fixups are effectively decided compile time. If we have a dual + SoC/non-SoC kernel we don't want to mangle d3 on non SoC devices */ + if (!pci_soc_mode) + return; + /* true pci devices in lincroft should allow type 1 access, the rest + * are langwell fake pci devices. + */ + if (type1_access_ok(dev->bus->number, dev->devfn, PCI_DEVICE_ID)) + return; + dev->d3_delay = 0; +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_d3delay_fixup); + +static void __devinit mrst_power_off_unused_dev(struct pci_dev *dev) +{ + pci_set_power_state(dev, PCI_D3cold); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0801, mrst_power_off_unused_dev); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0809, mrst_power_off_unused_dev); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x080C, mrst_power_off_unused_dev); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0812, mrst_power_off_unused_dev); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0815, mrst_power_off_unused_dev); + /* * Langwell devices reside at fixed offsets, don't try to move them. */ @@ -248,6 +281,9 @@ static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev) u32 size; int i; + if (!pci_soc_mode) + return; + /* Must have extended configuration space */ if (dev->cfg_size < PCIE_CAP_OFFSET + 4) return; diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c index d99346e..7415aa9 100644 --- a/arch/x86/pci/xen.c +++ b/arch/x86/pci/xen.c @@ -324,6 +324,32 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) out: return ret; } + +static void xen_initdom_restore_msi_irqs(struct pci_dev *dev, int irq) +{ + int ret = 0; + + if (pci_seg_supported) { + struct physdev_pci_device restore_ext; + + restore_ext.seg = pci_domain_nr(dev->bus); + restore_ext.bus = dev->bus->number; + restore_ext.devfn = dev->devfn; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi_ext, + &restore_ext); + if (ret == -ENOSYS) + pci_seg_supported = false; + WARN(ret && ret != -ENOSYS, "restore_msi_ext -> %d\n", ret); + } + if (!pci_seg_supported) { + struct physdev_restore_msi restore; + + restore.bus = dev->bus->number; + restore.devfn = dev->devfn; + ret = HYPERVISOR_physdev_op(PHYSDEVOP_restore_msi, &restore); + WARN(ret && ret != -ENOSYS, "restore_msi -> %d\n", ret); + } +} #endif static void xen_teardown_msi_irqs(struct pci_dev *dev) @@ -446,6 +472,7 @@ int __init pci_xen_initial_domain(void) #ifdef CONFIG_PCI_MSI x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs; x86_msi.teardown_msi_irq = xen_teardown_msi_irq; + x86_msi.restore_msi_irqs = xen_initdom_restore_msi_irqs; #endif xen_setup_acpi_sci(); __acpi_register_gsi = acpi_register_gsi_xen; diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts index e70be38..ce874f8 100644 --- a/arch/x86/platform/ce4100/falconfalls.dts +++ b/arch/x86/platform/ce4100/falconfalls.dts @@ -208,16 +208,19 @@ interrupts = <14 1>; }; - gpio@b,1 { + pcigpio: gpio@b,1 { + #gpio-cells = <2>; + #interrupt-cells = <2>; compatible = "pci8086,2e67.2", "pci8086,2e67", "pciclassff0000", "pciclassff00"; - #gpio-cells = <2>; reg = <0x15900 0x0 0x0 0x0 0x0>; interrupts = <15 1>; + interrupt-controller; gpio-controller; + intel,muxctl = <0>; }; i2c-controller@b,2 { diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile index 246b788..5b51194 100644 --- a/arch/x86/platform/geode/Makefile +++ b/arch/x86/platform/geode/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_ALIX) += alix.o obj-$(CONFIG_NET5501) += net5501.o +obj-$(CONFIG_GEOS) += geos.o diff --git a/arch/x86/platform/geode/geos.c b/arch/x86/platform/geode/geos.c new file mode 100644 index 0000000..c2e6d53 --- /dev/null +++ b/arch/x86/platform/geode/geos.c @@ -0,0 +1,128 @@ +/* + * System Specific setup for Traverse Technologies GEOS. + * At the moment this means setup of GPIO control of LEDs. + * + * Copyright (C) 2008 Constantin Baranov <const@mimas.ru> + * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com> + * and Philip Prindeville <philipp@redfish-solutions.com> + * + * TODO: There are large similarities with leds-net5501.c + * by Alessandro Zummo <a.zummo@towertech.it> + * In the future leds-net5501.c should be migrated over to platform + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/leds.h> +#include <linux/platform_device.h> +#include <linux/gpio.h> +#include <linux/input.h> +#include <linux/gpio_keys.h> +#include <linux/dmi.h> + +#include <asm/geode.h> + +static struct gpio_keys_button geos_gpio_buttons[] = { + { + .code = KEY_RESTART, + .gpio = 3, + .active_low = 1, + .desc = "Reset button", + .type = EV_KEY, + .wakeup = 0, + .debounce_interval = 100, + .can_disable = 0, + } +}; +static struct gpio_keys_platform_data geos_buttons_data = { + .buttons = geos_gpio_buttons, + .nbuttons = ARRAY_SIZE(geos_gpio_buttons), + .poll_interval = 20, +}; + +static struct platform_device geos_buttons_dev = { + .name = "gpio-keys-polled", + .id = 1, + .dev = { + .platform_data = &geos_buttons_data, + } +}; + +static struct gpio_led geos_leds[] = { + { + .name = "geos:1", + .gpio = 6, + .default_trigger = "default-on", + .active_low = 1, + }, + { + .name = "geos:2", + .gpio = 25, + .default_trigger = "default-off", + .active_low = 1, + }, + { + .name = "geos:3", + .gpio = 27, + .default_trigger = "default-off", + .active_low = 1, + }, +}; + +static struct gpio_led_platform_data geos_leds_data = { + .num_leds = ARRAY_SIZE(geos_leds), + .leds = geos_leds, +}; + +static struct platform_device geos_leds_dev = { + .name = "leds-gpio", + .id = -1, + .dev.platform_data = &geos_leds_data, +}; + +static struct __initdata platform_device *geos_devs[] = { + &geos_buttons_dev, + &geos_leds_dev, +}; + +static void __init register_geos(void) +{ + /* Setup LED control through leds-gpio driver */ + platform_add_devices(geos_devs, ARRAY_SIZE(geos_devs)); +} + +static int __init geos_init(void) +{ + const char *vendor, *product; + + if (!is_geode()) + return 0; + + vendor = dmi_get_system_info(DMI_SYS_VENDOR); + if (!vendor || strcmp(vendor, "Traverse Technologies")) + return 0; + + product = dmi_get_system_info(DMI_PRODUCT_NAME); + if (!product || strcmp(product, "Geos")) + return 0; + + printk(KERN_INFO "%s: system is recognized as \"%s %s\"\n", + KBUILD_MODNAME, vendor, product); + + register_geos(); + + return 0; +} + +module_init(geos_init); + +MODULE_AUTHOR("Philip Prindeville <philipp@redfish-solutions.com>"); +MODULE_DESCRIPTION("Traverse Technologies Geos System Setup"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index 721e652..e0a3723 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -28,6 +28,8 @@ #include <linux/module.h> #include <linux/notifier.h> #include <linux/mfd/intel_msic.h> +#include <linux/gpio.h> +#include <linux/i2c/tc35876x.h> #include <asm/setup.h> #include <asm/mpspec_def.h> @@ -675,6 +677,19 @@ static void *msic_thermal_platform_data(void *info) return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_THERMAL); } +/* tc35876x DSI-LVDS bridge chip and panel platform data */ +static void *tc35876x_platform_data(void *data) +{ + static struct tc35876x_platform_data pdata; + + /* gpio pins set to -1 will not be used by the driver */ + pdata.gpio_bridge_reset = get_gpio_by_name("LCMB_RXEN"); + pdata.gpio_panel_bl_en = get_gpio_by_name("6S6P_BL_EN"); + pdata.gpio_panel_vadd = get_gpio_by_name("EN_VREG_LCD_V3P3"); + + return &pdata; +} + static const struct devs_id __initconst device_ids[] = { {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data}, {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, @@ -687,6 +702,7 @@ static const struct devs_id __initconst device_ids[] = { {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, {"mpu3050", SFI_DEV_TYPE_I2C, 1, &mpu3050_platform_data}, + {"i2c_disp_brig", SFI_DEV_TYPE_I2C, 0, &tc35876x_platform_data}, /* MSIC subdevices */ {"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data}, diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 4889655..4793683 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -115,7 +115,7 @@ static void __save_processor_state(struct saved_context *ctxt) void save_processor_state(void) { __save_processor_state(&saved_context); - save_sched_clock_state(); + x86_platform.save_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(save_processor_state); @@ -231,8 +231,8 @@ static void __restore_processor_state(struct saved_context *ctxt) /* Needed by apm.c */ void restore_processor_state(void) { + x86_platform.restore_sched_clock_state(); __restore_processor_state(&saved_context); - restore_sched_clock_state(); } #ifdef CONFIG_X86_32 EXPORT_SYMBOL(restore_processor_state); diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c index 3769079..74202c1 100644 --- a/arch/x86/power/hibernate_32.c +++ b/arch/x86/power/hibernate_32.c @@ -10,7 +10,6 @@ #include <linux/suspend.h> #include <linux/bootmem.h> -#include <asm/system.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/mmzone.h> diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig index b2b54d2..9926e11 100644 --- a/arch/x86/um/Kconfig +++ b/arch/x86/um/Kconfig @@ -15,8 +15,8 @@ config UML_X86 select GENERIC_FIND_FIRST_BIT config 64BIT - bool - default SUBARCH = "x86_64" + bool "64-bit kernel" if SUBARCH = "x86" + default SUBARCH != "i386" config X86_32 def_bool !64BIT diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h index 2c32df6..04f82e0 100644 --- a/arch/x86/um/asm/processor.h +++ b/arch/x86/um/asm/processor.h @@ -17,6 +17,16 @@ #define ARCH_IS_STACKGROW(address) \ (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(¤t->thread.regs.regs)) +#include <asm/user.h> + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +#define cpu_relax() rep_nop() + #include <asm/processor-generic.h> #endif diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h index 018f732..6c6689e 100644 --- a/arch/x86/um/asm/processor_32.h +++ b/arch/x86/um/asm/processor_32.h @@ -45,16 +45,6 @@ static inline void arch_copy_thread(struct arch_thread *from, memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array)); } -#include <asm/user.h> - -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop": : :"memory"); -} - -#define cpu_relax() rep_nop() - /* * Default implementation of macro that returns current * instruction pointer ("program counter"). Stolen diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h index 61de92d..4b02a84 100644 --- a/arch/x86/um/asm/processor_64.h +++ b/arch/x86/um/asm/processor_64.h @@ -14,14 +14,6 @@ struct arch_thread { struct faultinfo faultinfo; }; -/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ -static inline void rep_nop(void) -{ - __asm__ __volatile__("rep;nop": : :"memory"); -} - -#define cpu_relax() rep_nop() - #define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \ .debugregs_seq = 0, \ .fs = 0, \ @@ -37,8 +29,6 @@ static inline void arch_copy_thread(struct arch_thread *from, to->fs = from->fs; } -#include <asm/user.h> - #define current_text_addr() \ ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; }) diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c index a1fba5f..17d88cf 100644 --- a/arch/x86/um/bugs_32.c +++ b/arch/x86/um/bugs_32.c @@ -13,8 +13,6 @@ static int host_has_cmov = 1; static jmp_buf cmov_test_return; -#define TASK_PID(task) *((int *) &(((char *) (task))[HOST_TASK_PID])) - static void cmov_sigill_test_handler(int sig) { host_has_cmov = 0; @@ -51,7 +49,7 @@ void arch_examine_signal(int sig, struct uml_pt_regs *regs) * This is testing for a cmov (0x0f 0x4x) instruction causing a * SIGILL in init. */ - if ((sig != SIGILL) || (TASK_PID(get_current()) != 1)) + if ((sig != SIGILL) || (get_current_pid() != 1)) return; if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) { diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c index 639900a..f40281e 100644 --- a/arch/x86/um/mem_32.c +++ b/arch/x86/um/mem_32.c @@ -23,14 +23,6 @@ static int __init gate_vma_init(void) gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; - return 0; } __initcall(gate_vma_init); diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c index 91f4ec9..af91901 100644 --- a/arch/x86/um/vdso/vma.c +++ b/arch/x86/um/vdso/vma.c @@ -64,8 +64,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdsop); up_write(&mm->mmap_sem); diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index 6bc0e72..885eff4 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -70,100 +70,98 @@ notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) return ret; } +notrace static long vdso_fallback_gtod(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + return ret; +} + + notrace static inline long vgetns(void) { long v; cycles_t cycles; if (gtod->clock.vclock_mode == VCLOCK_TSC) cycles = vread_tsc(); - else + else if (gtod->clock.vclock_mode == VCLOCK_HPET) cycles = vread_hpet(); + else + return 0; v = (cycles - gtod->clock.cycle_last) & gtod->clock.mask; return (v * gtod->clock.mult) >> gtod->clock.shift; } -notrace static noinline int do_realtime(struct timespec *ts) +/* Code size doesn't matter (vdso is 4k anyway) and this is faster. */ +notrace static int __always_inline do_realtime(struct timespec *ts) { unsigned long seq, ns; + int mode; + do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; ts->tv_sec = gtod->wall_time_sec; ts->tv_nsec = gtod->wall_time_nsec; ns = vgetns(); - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + timespec_add_ns(ts, ns); - return 0; + return mode; } -notrace static noinline int do_monotonic(struct timespec *ts) +notrace static int do_monotonic(struct timespec *ts) { - unsigned long seq, ns, secs; + unsigned long seq, ns; + int mode; + do { - seq = read_seqbegin(>od->lock); - secs = gtod->wall_time_sec; - ns = gtod->wall_time_nsec + vgetns(); - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); - - /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec - * are all guaranteed to be nonnegative. - */ - while (ns >= NSEC_PER_SEC) { - ns -= NSEC_PER_SEC; - ++secs; - } - ts->tv_sec = secs; - ts->tv_nsec = ns; + seq = read_seqcount_begin(>od->seq); + mode = gtod->clock.vclock_mode; + ts->tv_sec = gtod->monotonic_time_sec; + ts->tv_nsec = gtod->monotonic_time_nsec; + ns = vgetns(); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); + timespec_add_ns(ts, ns); - return 0; + return mode; } -notrace static noinline int do_realtime_coarse(struct timespec *ts) +notrace static int do_realtime_coarse(struct timespec *ts) { unsigned long seq; do { - seq = read_seqbegin(>od->lock); + seq = read_seqcount_begin(>od->seq); ts->tv_sec = gtod->wall_time_coarse.tv_sec; ts->tv_nsec = gtod->wall_time_coarse.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); + } while (unlikely(read_seqcount_retry(>od->seq, seq))); return 0; } -notrace static noinline int do_monotonic_coarse(struct timespec *ts) +notrace static int do_monotonic_coarse(struct timespec *ts) { - unsigned long seq, ns, secs; + unsigned long seq; do { - seq = read_seqbegin(>od->lock); - secs = gtod->wall_time_coarse.tv_sec; - ns = gtod->wall_time_coarse.tv_nsec; - secs += gtod->wall_to_monotonic.tv_sec; - ns += gtod->wall_to_monotonic.tv_nsec; - } while (unlikely(read_seqretry(>od->lock, seq))); - - /* wall_time_nsec and wall_to_monotonic.tv_nsec are - * guaranteed to be between 0 and NSEC_PER_SEC. - */ - if (ns >= NSEC_PER_SEC) { - ns -= NSEC_PER_SEC; - ++secs; - } - ts->tv_sec = secs; - ts->tv_nsec = ns; + seq = read_seqcount_begin(>od->seq); + ts->tv_sec = gtod->monotonic_time_coarse.tv_sec; + ts->tv_nsec = gtod->monotonic_time_coarse.tv_nsec; + } while (unlikely(read_seqcount_retry(>od->seq, seq))); return 0; } notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) { + int ret = VCLOCK_NONE; + switch (clock) { case CLOCK_REALTIME: - if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) - return do_realtime(ts); + ret = do_realtime(ts); break; case CLOCK_MONOTONIC: - if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) - return do_monotonic(ts); + ret = do_monotonic(ts); break; case CLOCK_REALTIME_COARSE: return do_realtime_coarse(ts); @@ -171,32 +169,33 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) return do_monotonic_coarse(ts); } - return vdso_fallback_gettime(clock, ts); + if (ret == VCLOCK_NONE) + return vdso_fallback_gettime(clock, ts); + return 0; } int clock_gettime(clockid_t, struct timespec *) __attribute__((weak, alias("__vdso_clock_gettime"))); notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) { - long ret; - if (likely(gtod->clock.vclock_mode != VCLOCK_NONE)) { - if (likely(tv != NULL)) { - BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != - offsetof(struct timespec, tv_nsec) || - sizeof(*tv) != sizeof(struct timespec)); - do_realtime((struct timespec *)tv); - tv->tv_usec /= 1000; - } - if (unlikely(tz != NULL)) { - /* Avoid memcpy. Some old compilers fail to inline it */ - tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest; - tz->tz_dsttime = gtod->sys_tz.tz_dsttime; - } - return 0; + long ret = VCLOCK_NONE; + + if (likely(tv != NULL)) { + BUILD_BUG_ON(offsetof(struct timeval, tv_usec) != + offsetof(struct timespec, tv_nsec) || + sizeof(*tv) != sizeof(struct timespec)); + ret = do_realtime((struct timespec *)tv); + tv->tv_usec /= 1000; } - asm("syscall" : "=a" (ret) : - "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); - return ret; + if (unlikely(tz != NULL)) { + /* Avoid memcpy. Some old compilers fail to inline it */ + tz->tz_minuteswest = gtod->sys_tz.tz_minuteswest; + tz->tz_dsttime = gtod->sys_tz.tz_dsttime; + } + + if (ret == VCLOCK_NONE) + return vdso_fallback_gtod(tv, tz); + return 0; } int gettimeofday(struct timeval *, struct timezone *) __attribute__((weak, alias("__vdso_gettimeofday"))); diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c index 468d591..a944020 100644 --- a/arch/x86/vdso/vdso32-setup.c +++ b/arch/x86/vdso/vdso32-setup.c @@ -250,13 +250,7 @@ static int __init gate_vma_init(void) gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; gate_vma.vm_page_prot = __P101; - /* - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully interpretable later - * without matching up the same kernel and hardware config to see - * what PC values meant. - */ - gate_vma.vm_flags |= VM_ALWAYSDUMP; + return 0; } @@ -343,17 +337,10 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) if (compat_uses_vma || !compat) { /* * MAYWRITE to allow gdb to COW and set breakpoints - * - * Make sure the vDSO gets into every core dump. - * Dumping its contents makes post-mortem fully - * interpretable later without matching up the same - * kernel and hardware config to see what PC values - * meant. */ ret = install_special_mapping(mm, addr, PAGE_SIZE, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso32_pages); if (ret) diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 153407c..17e1827 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -124,8 +124,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) ret = install_special_mapping(mm, addr, vdso_size, VM_READ|VM_EXEC| - VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| - VM_ALWAYSDUMP, + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, vdso_pages); if (ret) { current->mm->context.vdso = NULL; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 4172af8..b132ade 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -62,6 +62,15 @@ #include <asm/reboot.h> #include <asm/stackprotector.h> #include <asm/hypervisor.h> +#include <asm/mwait.h> + +#ifdef CONFIG_ACPI +#include <linux/acpi.h> +#include <asm/acpi.h> +#include <acpi/pdc_intel.h> +#include <acpi/processor.h> +#include <xen/interface/platform.h> +#endif #include "xen-ops.h" #include "mmu.h" @@ -200,13 +209,17 @@ static void __init xen_banner(void) static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; +static __read_mostly unsigned int cpuid_leaf1_ecx_set_mask; +static __read_mostly unsigned int cpuid_leaf5_ecx_val; +static __read_mostly unsigned int cpuid_leaf5_edx_val; + static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { unsigned maskebx = ~0; unsigned maskecx = ~0; unsigned maskedx = ~0; - + unsigned setecx = 0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. @@ -214,9 +227,18 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, switch (*ax) { case 1: maskecx = cpuid_leaf1_ecx_mask; + setecx = cpuid_leaf1_ecx_set_mask; maskedx = cpuid_leaf1_edx_mask; break; + case CPUID_MWAIT_LEAF: + /* Synthesize the values.. */ + *ax = 0; + *bx = 0; + *cx = cpuid_leaf5_ecx_val; + *dx = cpuid_leaf5_edx_val; + return; + case 0xb: /* Suppress extended topology stuff */ maskebx = 0; @@ -232,9 +254,75 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, *bx &= maskebx; *cx &= maskecx; + *cx |= setecx; *dx &= maskedx; + } +static bool __init xen_check_mwait(void) +{ +#ifdef CONFIG_ACPI + struct xen_platform_op op = { + .cmd = XENPF_set_processor_pminfo, + .u.set_pminfo.id = -1, + .u.set_pminfo.type = XEN_PM_PDC, + }; + uint32_t buf[3]; + unsigned int ax, bx, cx, dx; + unsigned int mwait_mask; + + /* We need to determine whether it is OK to expose the MWAIT + * capability to the kernel to harvest deeper than C3 states from ACPI + * _CST using the processor_harvest_xen.c module. For this to work, we + * need to gather the MWAIT_LEAF values (which the cstate.c code + * checks against). The hypervisor won't expose the MWAIT flag because + * it would break backwards compatibility; so we will find out directly + * from the hardware and hypercall. + */ + if (!xen_initial_domain()) + return false; + + ax = 1; + cx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + mwait_mask = (1 << (X86_FEATURE_EST % 32)) | + (1 << (X86_FEATURE_MWAIT % 32)); + + if ((cx & mwait_mask) != mwait_mask) + return false; + + /* We need to emulate the MWAIT_LEAF and for that we need both + * ecx and edx. The hypercall provides only partial information. + */ + + ax = CPUID_MWAIT_LEAF; + bx = 0; + cx = 0; + dx = 0; + + native_cpuid(&ax, &bx, &cx, &dx); + + /* Ask the Hypervisor whether to clear ACPI_PDC_C_C2C3_FFH. If so, + * don't expose MWAIT_LEAF and let ACPI pick the IOPORT version of C3. + */ + buf[0] = ACPI_PDC_REVISION_ID; + buf[1] = 1; + buf[2] = (ACPI_PDC_C_CAPABILITY_SMP | ACPI_PDC_EST_CAPABILITY_SWSMP); + + set_xen_guest_handle(op.u.set_pminfo.pdc, buf); + + if ((HYPERVISOR_dom0_op(&op) == 0) && + (buf[2] & (ACPI_PDC_C_C1_FFH | ACPI_PDC_C_C2C3_FFH))) { + cpuid_leaf5_ecx_val = cx; + cpuid_leaf5_edx_val = dx; + } + return true; +#else + return false; +#endif +} static void __init xen_init_cpuid_mask(void) { unsigned int ax, bx, cx, dx; @@ -261,6 +349,9 @@ static void __init xen_init_cpuid_mask(void) /* Xen will set CR4.OSXSAVE if supported and not disabled by force */ if ((cx & xsave_mask) != xsave_mask) cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */ + + if (xen_check_mwait()) + cpuid_leaf1_ecx_set_mask = (1 << (X86_FEATURE_MWAIT % 32)); } static void xen_set_debugreg(int reg, unsigned long val) @@ -777,11 +868,11 @@ static DEFINE_PER_CPU(unsigned long, xen_cr0_value); static unsigned long xen_read_cr0(void) { - unsigned long cr0 = percpu_read(xen_cr0_value); + unsigned long cr0 = this_cpu_read(xen_cr0_value); if (unlikely(cr0 == 0)) { cr0 = native_read_cr0(); - percpu_write(xen_cr0_value, cr0); + this_cpu_write(xen_cr0_value, cr0); } return cr0; @@ -791,7 +882,7 @@ static void xen_write_cr0(unsigned long cr0) { struct multicall_space mcs; - percpu_write(xen_cr0_value, cr0); + this_cpu_write(xen_cr0_value, cr0); /* Only pay attention to cr0.TS; everything else is ignored. */ diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c index 8bbb465..1573376 100644 --- a/arch/x86/xen/irq.c +++ b/arch/x86/xen/irq.c @@ -26,7 +26,7 @@ static unsigned long xen_save_fl(void) struct vcpu_info *vcpu; unsigned long flags; - vcpu = percpu_read(xen_vcpu); + vcpu = this_cpu_read(xen_vcpu); /* flag has opposite sense of mask */ flags = !vcpu->evtchn_upcall_mask; @@ -50,7 +50,7 @@ static void xen_restore_fl(unsigned long flags) make sure we're don't switch CPUs between getting the vcpu pointer and updating the mask. */ preempt_disable(); - vcpu = percpu_read(xen_vcpu); + vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = flags; preempt_enable_no_resched(); @@ -72,7 +72,7 @@ static void xen_irq_disable(void) make sure we're don't switch CPUs between getting the vcpu pointer and updating the mask. */ preempt_disable(); - percpu_read(xen_vcpu)->evtchn_upcall_mask = 1; + this_cpu_read(xen_vcpu)->evtchn_upcall_mask = 1; preempt_enable_no_resched(); } PV_CALLEE_SAVE_REGS_THUNK(xen_irq_disable); @@ -86,7 +86,7 @@ static void xen_irq_enable(void) the caller is confused and is trying to re-enable interrupts on an indeterminate processor. */ - vcpu = percpu_read(xen_vcpu); + vcpu = this_cpu_read(xen_vcpu); vcpu->evtchn_upcall_mask = 0; /* Doesn't matter if we get preempted here, because any diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 95c1cf6..988828b 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1071,14 +1071,14 @@ static void drop_other_mm_ref(void *info) struct mm_struct *mm = info; struct mm_struct *active_mm; - active_mm = percpu_read(cpu_tlbstate.active_mm); + active_mm = this_cpu_read(cpu_tlbstate.active_mm); - if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK) + if (active_mm == mm && this_cpu_read(cpu_tlbstate.state) != TLBSTATE_OK) leave_mm(smp_processor_id()); /* If this cpu still has a stale cr3 reference, then make sure it has been flushed. */ - if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) + if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) load_cr3(swapper_pg_dir); } @@ -1185,17 +1185,17 @@ static void __init xen_pagetable_setup_done(pgd_t *base) static void xen_write_cr2(unsigned long cr2) { - percpu_read(xen_vcpu)->arch.cr2 = cr2; + this_cpu_read(xen_vcpu)->arch.cr2 = cr2; } static unsigned long xen_read_cr2(void) { - return percpu_read(xen_vcpu)->arch.cr2; + return this_cpu_read(xen_vcpu)->arch.cr2; } unsigned long xen_read_cr2_direct(void) { - return percpu_read(xen_vcpu_info.arch.cr2); + return this_cpu_read(xen_vcpu_info.arch.cr2); } static void xen_flush_tlb(void) @@ -1278,12 +1278,12 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, static unsigned long xen_read_cr3(void) { - return percpu_read(xen_cr3); + return this_cpu_read(xen_cr3); } static void set_current_cr3(void *v) { - percpu_write(xen_current_cr3, (unsigned long)v); + this_cpu_write(xen_current_cr3, (unsigned long)v); } static void __xen_write_cr3(bool kernel, unsigned long cr3) @@ -1306,7 +1306,7 @@ static void __xen_write_cr3(bool kernel, unsigned long cr3) xen_extend_mmuext_op(&op); if (kernel) { - percpu_write(xen_cr3, cr3); + this_cpu_write(xen_cr3, cr3); /* Update xen_current_cr3 once the batch has actually been submitted. */ @@ -1322,7 +1322,7 @@ static void xen_write_cr3(unsigned long cr3) /* Update while interrupts are disabled, so its atomic with respect to ipis */ - percpu_write(xen_cr3, cr3); + this_cpu_write(xen_cr3, cr3); __xen_write_cr3(true, cr3); diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h index dee79b7..9c2e74f 100644 --- a/arch/x86/xen/multicalls.h +++ b/arch/x86/xen/multicalls.h @@ -47,7 +47,7 @@ static inline void xen_mc_issue(unsigned mode) xen_mc_flush(); /* restore flags saved in xen_mc_batch */ - local_irq_restore(percpu_read(xen_mc_irq_flags)); + local_irq_restore(this_cpu_read(xen_mc_irq_flags)); } /* Set up a callback to be called when the current batch is flushed */ diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index e03c636..1ba8dff 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -10,6 +10,7 @@ #include <linux/pm.h> #include <linux/memblock.h> #include <linux/cpuidle.h> +#include <linux/cpufreq.h> #include <asm/elf.h> #include <asm/vdso.h> @@ -420,7 +421,7 @@ void __init xen_arch_setup(void) boot_cpu_data.hlt_works_ok = 1; #endif disable_cpuidle(); - boot_option_idle_override = IDLE_HALT; + disable_cpufreq(); WARN_ON(set_pm_idle_to_default()); fiddle_vdso(); } diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 501d4e0..02900e8 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -75,8 +75,14 @@ static void __cpuinit cpu_bringup(void) xen_setup_cpu_clockevents(); + notify_cpu_starting(cpu); + + ipi_call_lock(); set_cpu_online(cpu, true); - percpu_write(cpu_state, CPU_ONLINE); + ipi_call_unlock(); + + this_cpu_write(cpu_state, CPU_ONLINE); + wmb(); /* We can take interrupts now: we're officially "up". */ |