diff options
Diffstat (limited to 'arch/x86')
94 files changed, 4156 insertions, 2251 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild index 0e10323..0e9dec6 100644 --- a/arch/x86/Kbuild +++ b/arch/x86/Kbuild @@ -15,3 +15,4 @@ obj-y += vdso/ obj-$(CONFIG_IA32_EMULATION) += ia32/ obj-y += platform/ +obj-y += net/ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 4168e5d..da34972 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -17,8 +17,6 @@ config X86_64 config X86 def_bool y select HAVE_AOUT if X86_32 - select HAVE_READQ - select HAVE_WRITEQ select HAVE_UNSTABLE_SCHED_CLOCK select HAVE_IDE select HAVE_OPROFILE @@ -66,12 +64,12 @@ config X86 select HAVE_GENERIC_HARDIRQS select HAVE_SPARSE_IRQ select GENERIC_FIND_FIRST_BIT - select GENERIC_FIND_NEXT_BIT select GENERIC_IRQ_PROBE select GENERIC_PENDING_IRQ if SMP select GENERIC_IRQ_SHOW select IRQ_FORCED_THREADING select USE_GENERIC_SMP_HELPERS if SMP + select HAVE_BPF_JIT if (X86_64 && NET) config INSTRUCTION_DECODER def_bool (KPROBES || PERF_EVENTS) @@ -916,6 +914,7 @@ config TOSHIBA config I8K tristate "Dell laptop support" + select HWMON ---help--- This adds a driver to safely access the System Management Mode of the CPU on the Dell Inspiron 8000. The System Management Mode diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 615e188..c0f8a5c 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -66,26 +66,6 @@ config DEBUG_STACKOVERFLOW This option will cause messages to be printed if free stack space drops below a certain limit. -config DEBUG_STACK_USAGE - bool "Stack utilization instrumentation" - depends on DEBUG_KERNEL - ---help--- - Enables the display of the minimum amount of free stack which each - task has ever had available in the sysrq-T and sysrq-P debug output. - - This option will slow down process creation somewhat. - -config DEBUG_PER_CPU_MAPS - bool "Debug access to per_cpu maps" - depends on DEBUG_KERNEL - depends on SMP - ---help--- - Say Y to verify that the per_cpu map being accessed has - been setup. Adds a fair amount of code to kernel memory - and decreases performance. - - Say N if unsure. - config X86_PTDUMP bool "Export kernel pagetable layout to userspace via debugfs" depends on DEBUG_KERNEL diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 6f98726..2bf18059f 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index ee01a9d..22a0dc8 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y CONFIG_AUDIT=y CONFIG_LOG_BUF_SHIFT=18 CONFIG_CGROUPS=y -CONFIG_CGROUP_NS=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y CONFIG_CGROUP_CPUACCT=y diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index 1a58ad8..c04f1b7 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -2,8 +2,6 @@ # Arch-specific CryptoAPI modules. # -obj-$(CONFIG_CRYPTO_FPU) += fpu.o - obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o @@ -24,6 +22,6 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o -aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o +aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index 2577613..feee8ff 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -94,6 +94,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); + +int crypto_fpu_init(void); +void crypto_fpu_exit(void); + #ifdef CONFIG_X86_64 asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in, unsigned int len, u8 *iv); @@ -1257,6 +1261,8 @@ static int __init aesni_init(void) return -ENODEV; } + if ((err = crypto_fpu_init())) + goto fpu_err; if ((err = crypto_register_alg(&aesni_alg))) goto aes_err; if ((err = crypto_register_alg(&__aesni_alg))) @@ -1334,6 +1340,7 @@ blk_ecb_err: __aes_err: crypto_unregister_alg(&aesni_alg); aes_err: +fpu_err: return err; } @@ -1363,6 +1370,8 @@ static void __exit aesni_exit(void) crypto_unregister_alg(&blk_ecb_alg); crypto_unregister_alg(&__aesni_alg); crypto_unregister_alg(&aesni_alg); + + crypto_fpu_exit(); } module_init(aesni_init); diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c index 1a8f864..98d7a18 100644 --- a/arch/x86/crypto/fpu.c +++ b/arch/x86/crypto/fpu.c @@ -150,18 +150,12 @@ static struct crypto_template crypto_fpu_tmpl = { .module = THIS_MODULE, }; -static int __init crypto_fpu_module_init(void) +int __init crypto_fpu_init(void) { return crypto_register_template(&crypto_fpu_tmpl); } -static void __exit crypto_fpu_module_exit(void) +void __exit crypto_fpu_exit(void) { crypto_unregister_template(&crypto_fpu_tmpl); } - -module_init(crypto_fpu_module_init); -module_exit(crypto_fpu_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_DESCRIPTION("FPU block cipher wrapper"); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 849a9d2..95f5826 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -848,4 +848,5 @@ ia32_sys_call_table: .quad compat_sys_open_by_handle_at .quad compat_sys_clock_adjtime .quad sys_syncfs + .quad compat_sys_sendmmsg /* 345 */ ia32_syscall_end: diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index a0c46f0..4a0b7c7 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -381,6 +381,26 @@ struct apic { extern struct apic *apic; /* + * APIC drivers are probed based on how they are listed in the .apicdrivers + * section. So the order is important and enforced by the ordering + * of different apic driver files in the Makefile. + * + * For the files having two apic drivers, we use apic_drivers() + * to enforce the order with in them. + */ +#define apic_driver(sym) \ + static struct apic *__apicdrivers_##sym __used \ + __aligned(sizeof(struct apic *)) \ + __section(.apicdrivers) = { &sym } + +#define apic_drivers(sym1, sym2) \ + static struct apic *__apicdrivers_##sym1##sym2[2] __used \ + __aligned(sizeof(struct apic *)) \ + __section(.apicdrivers) = { &sym1, &sym2 } + +extern struct apic *__apicdrivers[], *__apicdrivers_end[]; + +/* * APIC functionality to boot other CPUs - only used on SMP: */ #ifdef CONFIG_SMP @@ -458,15 +478,10 @@ static inline unsigned default_get_apic_id(unsigned long x) #define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 #ifdef CONFIG_X86_64 -extern struct apic apic_flat; -extern struct apic apic_physflat; -extern struct apic apic_x2apic_cluster; -extern struct apic apic_x2apic_phys; extern int default_acpi_madt_oem_check(char *, char *); extern void apic_send_IPI_self(int vector); -extern struct apic apic_x2apic_uv_x; DECLARE_PER_CPU(int, x2apic_extra_bits); extern int default_cpu_present_to_apicid(int mps_cpu); @@ -480,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert) return; } -extern void generic_bigsmp_probe(void); +extern struct apic *generic_bigsmp_probe(void); #ifdef CONFIG_X86_LOCAL_APIC @@ -516,8 +531,6 @@ extern struct apic apic_noop; #ifdef CONFIG_X86_32 -extern struct apic apic_default; - static inline int noop_x86_32_early_logical_apicid(int cpu) { return BAD_APICID; diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index 0722730..d02804d 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -38,7 +38,6 @@ #include <linux/string.h> #include <linux/compiler.h> -#include <asm-generic/int-ll64.h> #include <asm/page.h> #include <xen/xen.h> @@ -87,27 +86,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", ) build_mmio_read(readq, "q", unsigned long, "=r", :"memory") build_mmio_write(writeq, "q", unsigned long, "r", :"memory") -#else - -static inline __u64 readq(const volatile void __iomem *addr) -{ - const volatile u32 __iomem *p = addr; - u32 low, high; - - low = readl(p); - high = readl(p + 1); - - return low + ((u64)high << 32); -} - -static inline void writeq(__u64 val, volatile void __iomem *addr) -{ - writel(val, addr); - writel(val >> 32, addr+4); -} - -#endif - #define readq_relaxed(a) readq(a) #define __raw_readq(a) readq(a) @@ -117,6 +95,8 @@ static inline void writeq(__u64 val, volatile void __iomem *addr) #define readq readq #define writeq writeq +#endif + /** * virt_to_phys - map virtual addresses to physical * @address: address to remap diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h index a97a240..690d1cc 100644 --- a/arch/x86/include/asm/io_apic.h +++ b/arch/x86/include/asm/io_apic.h @@ -105,12 +105,12 @@ struct IR_IO_APIC_route_entry { * # of IO-APICs and # of IRQ routing registers */ extern int nr_ioapics; -extern int nr_ioapic_registers[MAX_IO_APICS]; -#define MP_MAX_IOAPIC_PIN 127 +extern int mpc_ioapic_id(int ioapic); +extern unsigned int mpc_ioapic_addr(int ioapic); +extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic); -/* I/O APIC entries */ -extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; +#define MP_MAX_IOAPIC_PIN 127 /* # of MP IRQ source entries */ extern int mp_irq_entries; @@ -152,11 +152,9 @@ extern void ioapic_insert_resources(void); int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); -extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); -extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); -extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); -extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); +extern int save_ioapic_entries(void); +extern void mask_ioapic_entries(void); +extern int restore_ioapic_entries(void); extern int get_nr_irqs_gsi(void); @@ -192,19 +190,13 @@ struct io_apic_irq_attr; static inline int io_apic_set_pci_routing(struct device *dev, int irq, struct io_apic_irq_attr *irq_attr) { return 0; } -static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void) -{ - return NULL; -} - -static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { } -static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent) +static inline int save_ioapic_entries(void) { return -ENOMEM; } -static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { } -static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent) +static inline void mask_ioapic_entries(void) { } +static inline int restore_ioapic_entries(void) { return -ENOMEM; } diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h index 396f5b5..77e95f5 100644 --- a/arch/x86/include/asm/kgdb.h +++ b/arch/x86/include/asm/kgdb.h @@ -77,6 +77,7 @@ static inline void arch_kgdb_breakpoint(void) } #define BREAK_INSTR_SIZE 1 #define CACHE_FLUSH_IS_SAFE 1 +#define GDB_ADJUSTS_BREAK_OFFSET extern int kgdb_ll_trap(int cmd, const char *str, struct pt_regs *regs, long err, int trap, int sig); diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 0f52135..0049211 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -14,6 +14,8 @@ #include <asm/desc_defs.h> struct x86_emulate_ctxt; +enum x86_intercept; +enum x86_intercept_stage; struct x86_exception { u8 vector; @@ -24,6 +26,24 @@ struct x86_exception { }; /* + * This struct is used to carry enough information from the instruction + * decoder to main KVM so that a decision can be made whether the + * instruction needs to be intercepted or not. + */ +struct x86_instruction_info { + u8 intercept; /* which intercept */ + u8 rep_prefix; /* rep prefix? */ + u8 modrm_mod; /* mod part of modrm */ + u8 modrm_reg; /* index of register used */ + u8 modrm_rm; /* rm part of modrm */ + u64 src_val; /* value of source operand */ + u8 src_bytes; /* size of source operand */ + u8 dst_bytes; /* size of destination operand */ + u8 ad_bytes; /* size of src/dst address */ + u64 next_rip; /* rip following the instruction */ +}; + +/* * x86_emulate_ops: * * These operations represent the instruction emulator's interface to memory. @@ -62,6 +82,7 @@ struct x86_exception { #define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ #define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ #define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ +#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */ struct x86_emulate_ops { /* @@ -71,8 +92,9 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*read_std)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, + unsigned int bytes, struct x86_exception *fault); /* @@ -82,8 +104,8 @@ struct x86_emulate_ops { * @val: [OUT] Value write to memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_std)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*write_std)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* * fetch: Read bytes of standard (non-emulated/special) memory. @@ -92,8 +114,8 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*fetch)(unsigned long addr, void *val, - unsigned int bytes, struct kvm_vcpu *vcpu, + int (*fetch)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, struct x86_exception *fault); /* @@ -102,11 +124,9 @@ struct x86_emulate_ops { * @val: [OUT] Value read from memory, zero-extended to 'u_long'. * @bytes: [IN ] Number of bytes to read from memory. */ - int (*read_emulated)(unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + int (*read_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, + struct x86_exception *fault); /* * write_emulated: Write bytes to emulated/special memory area. @@ -115,11 +135,10 @@ struct x86_emulate_ops { * required). * @bytes: [IN ] Number of bytes to write to memory. */ - int (*write_emulated)(unsigned long addr, - const void *val, + int (*write_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); + struct x86_exception *fault); /* * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an @@ -129,40 +148,54 @@ struct x86_emulate_ops { * @new: [IN ] Value to write to @addr. * @bytes: [IN ] Number of bytes to access using CMPXCHG. */ - int (*cmpxchg_emulated)(unsigned long addr, + int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *fault, - struct kvm_vcpu *vcpu); - - int (*pio_in_emulated)(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu); - - int (*pio_out_emulated)(int size, unsigned short port, const void *val, - unsigned int count, struct kvm_vcpu *vcpu); - - bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, - int seg, struct kvm_vcpu *vcpu); - void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3, - int seg, struct kvm_vcpu *vcpu); - u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); - void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); - unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); - void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); - void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); - ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); - int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); - int (*cpl)(struct kvm_vcpu *vcpu); - int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); - int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); - int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); - int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); + struct x86_exception *fault); + void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr); + + int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count); + + int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, const void *val, + unsigned int count); + + bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector, + struct desc_struct *desc, u32 *base3, int seg); + void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector, + struct desc_struct *desc, u32 base3, int seg); + unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt, + int seg); + void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt); + ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr); + int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val); + int (*cpl)(struct x86_emulate_ctxt *ctxt); + int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest); + int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value); + int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data); + int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata); + void (*halt)(struct x86_emulate_ctxt *ctxt); + void (*wbinvd)(struct x86_emulate_ctxt *ctxt); + int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt); + void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */ + void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */ + int (*intercept)(struct x86_emulate_ctxt *ctxt, + struct x86_instruction_info *info, + enum x86_intercept_stage stage); }; +typedef u32 __attribute__((vector_size(16))) sse128_t; + /* Type, address-of, and value of an instruction's operand. */ struct operand { - enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; + enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type; unsigned int bytes; union { unsigned long orig_val; @@ -174,11 +207,13 @@ struct operand { ulong ea; unsigned seg; } mem; + unsigned xmm; } addr; union { unsigned long val; u64 val64; char valptr[sizeof(unsigned long) + 2]; + sse128_t vec_val; }; }; @@ -197,6 +232,7 @@ struct read_cache { struct decode_cache { u8 twobyte; u8 b; + u8 intercept; u8 lock_prefix; u8 rep_prefix; u8 op_bytes; @@ -209,6 +245,7 @@ struct decode_cache { u8 seg_override; unsigned int d; int (*execute)(struct x86_emulate_ctxt *ctxt); + int (*check_perm)(struct x86_emulate_ctxt *ctxt); unsigned long regs[NR_VCPU_REGS]; unsigned long eip; /* modrm */ @@ -227,17 +264,15 @@ struct x86_emulate_ctxt { struct x86_emulate_ops *ops; /* Register state before/after emulation. */ - struct kvm_vcpu *vcpu; - unsigned long eflags; unsigned long eip; /* eip before instruction emulation */ /* Emulated execution mode, represented by an X86EMUL_MODE value. */ int mode; - u32 cs_base; /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; + bool guest_mode; /* guest running a nested guest */ bool perm_ok; /* do not check permissions if true */ bool only_vendor_specific_insn; @@ -249,8 +284,8 @@ struct x86_emulate_ctxt { }; /* Repeat String Operation Prefix */ -#define REPE_PREFIX 1 -#define REPNE_PREFIX 2 +#define REPE_PREFIX 0xf3 +#define REPNE_PREFIX 0xf2 /* Execution mode, passed to the emulator. */ #define X86EMUL_MODE_REAL 0 /* Real mode. */ @@ -259,6 +294,69 @@ struct x86_emulate_ctxt { #define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ #define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ +/* any protected mode */ +#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \ + X86EMUL_MODE_PROT64) + +enum x86_intercept_stage { + X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */ + X86_ICPT_PRE_EXCEPT, + X86_ICPT_POST_EXCEPT, + X86_ICPT_POST_MEMACCESS, +}; + +enum x86_intercept { + x86_intercept_none, + x86_intercept_cr_read, + x86_intercept_cr_write, + x86_intercept_clts, + x86_intercept_lmsw, + x86_intercept_smsw, + x86_intercept_dr_read, + x86_intercept_dr_write, + x86_intercept_lidt, + x86_intercept_sidt, + x86_intercept_lgdt, + x86_intercept_sgdt, + x86_intercept_lldt, + x86_intercept_sldt, + x86_intercept_ltr, + x86_intercept_str, + x86_intercept_rdtsc, + x86_intercept_rdpmc, + x86_intercept_pushf, + x86_intercept_popf, + x86_intercept_cpuid, + x86_intercept_rsm, + x86_intercept_iret, + x86_intercept_intn, + x86_intercept_invd, + x86_intercept_pause, + x86_intercept_hlt, + x86_intercept_invlpg, + x86_intercept_invlpga, + x86_intercept_vmrun, + x86_intercept_vmload, + x86_intercept_vmsave, + x86_intercept_vmmcall, + x86_intercept_stgi, + x86_intercept_clgi, + x86_intercept_skinit, + x86_intercept_rdtscp, + x86_intercept_icebp, + x86_intercept_wbinvd, + x86_intercept_monitor, + x86_intercept_mwait, + x86_intercept_rdmsr, + x86_intercept_wrmsr, + x86_intercept_in, + x86_intercept_ins, + x86_intercept_out, + x86_intercept_outs, + + nr_x86_intercepts +}; + /* Host execution mode. */ #if defined(CONFIG_X86_32) #define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 @@ -270,6 +368,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len); #define EMULATION_FAILED -1 #define EMULATION_OK 0 #define EMULATION_RESTART 1 +#define EMULATION_INTERCEPTED 2 int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); int emulator_task_switch(struct x86_emulate_ctxt *ctxt, u16 tss_selector, int reason, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index c8af099..d2ac8e2 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -30,14 +30,30 @@ #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 +#define KVM_MMIO_SIZE 16 #define KVM_PIO_PAGE_OFFSET 1 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2 +#define CR0_RESERVED_BITS \ + (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ + | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ + | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) + #define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) #define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) #define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 0xFFFFFF0000000000ULL) +#define CR4_RESERVED_BITS \ + (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ + | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ + | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ + | X86_CR4_OSXSAVE \ + | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) + +#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) + + #define INVALID_PAGE (~(hpa_t)0) #define VALID_PAGE(x) ((x) != INVALID_PAGE) @@ -118,6 +134,9 @@ enum kvm_reg { enum kvm_reg_ex { VCPU_EXREG_PDPTR = NR_VCPU_REGS, VCPU_EXREG_CR3, + VCPU_EXREG_RFLAGS, + VCPU_EXREG_CPL, + VCPU_EXREG_SEGMENTS, }; enum { @@ -256,7 +275,7 @@ struct kvm_mmu { struct kvm_mmu_page *sp); void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte, unsigned long mmu_seq); + u64 *spte, const void *pte); hpa_t root_hpa; int root_level; int shadow_root_level; @@ -340,7 +359,6 @@ struct kvm_vcpu_arch { struct fpu guest_fpu; u64 xcr0; - gva_t mmio_fault_cr2; struct kvm_pio_request pio; void *pio_data; @@ -367,18 +385,22 @@ struct kvm_vcpu_arch { /* emulate context */ struct x86_emulate_ctxt emulate_ctxt; + bool emulate_regs_need_sync_to_vcpu; + bool emulate_regs_need_sync_from_vcpu; gpa_t time; struct pvclock_vcpu_time_info hv_clock; unsigned int hw_tsc_khz; unsigned int time_offset; struct page *time_page; - u64 last_host_tsc; u64 last_guest_tsc; u64 last_kernel_ns; u64 last_tsc_nsec; u64 last_tsc_write; + u32 virtual_tsc_khz; bool tsc_catchup; + u32 tsc_catchup_mult; + s8 tsc_catchup_shift; bool nmi_pending; bool nmi_injected; @@ -448,9 +470,6 @@ struct kvm_arch { u64 last_tsc_nsec; u64 last_tsc_offset; u64 last_tsc_write; - u32 virtual_tsc_khz; - u32 virtual_tsc_mult; - s8 virtual_tsc_shift; struct kvm_xen_hvm_config xen_hvm_config; @@ -502,6 +521,8 @@ struct kvm_vcpu_stat { u32 nmi_injections; }; +struct x86_instruction_info; + struct kvm_x86_ops { int (*cpu_has_kvm_support)(void); /* __init */ int (*disabled_by_bios)(void); /* __init */ @@ -586,9 +607,17 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); + void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); + u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); + void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); + + int (*check_intercept)(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage); + const struct trace_print_flags *exit_reasons_str; }; @@ -627,6 +656,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; +/* control of guest tsc rate supported? */ +extern bool kvm_has_tsc_control; +/* minimum supported tsc_khz for guests */ +extern u32 kvm_min_guest_tsc_khz; +/* maximum supported tsc_khz for guests */ +extern u32 kvm_max_guest_tsc_khz; + enum emulation_result { EMULATE_DONE, /* no further processing */ EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ @@ -645,9 +681,6 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu, return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); } -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); -void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); - void kvm_enable_efer_bits(u64); int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); @@ -657,8 +690,6 @@ struct x86_emulate_ctxt; int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); int kvm_emulate_halt(struct kvm_vcpu *vcpu); -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address); -int emulate_clts(struct kvm_vcpu *vcpu); int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); @@ -721,8 +752,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu); - int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, void *insn, int insn_len); void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h index 12d55e7..4814297 100644 --- a/arch/x86/include/asm/linkage.h +++ b/arch/x86/include/asm/linkage.h @@ -8,11 +8,6 @@ #ifdef CONFIG_X86_32 #define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) -/* - * For 32-bit UML - mark functions implemented in assembly that use - * regparm input parameters: - */ -#define asmregparm __attribute__((regparm(3))) /* * Make sure the compiler doesn't do anything stupid with the diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 3cce714..485b4f1 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -118,6 +118,7 @@ complete list. */ #define MSR_AMD64_PATCH_LEVEL 0x0000008b +#define MSR_AMD64_TSC_RATIO 0xc0000104 #define MSR_AMD64_NB_CFG 0xc001001f #define MSR_AMD64_PATCH_LOADER 0xc0010020 #define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h index 6761292..d498943 100644 --- a/arch/x86/include/asm/pci.h +++ b/arch/x86/include/asm/pci.h @@ -135,8 +135,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev); #include "pci_64.h" #endif -void dma32_reserve_bootmem(void); - /* implement the pci_ DMA API in terms of the generic device dma_ one */ #include <asm-generic/pci-dma-compat.h> diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index 53278b0..a0a9779 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -509,6 +509,11 @@ do { \ * it in software. The address used in the cmpxchg16 instruction must be * aligned to a 16 byte boundary. */ +#ifdef CONFIG_SMP +#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3 +#else +#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2 +#endif #define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ ({ \ char __ret; \ @@ -517,7 +522,7 @@ do { \ typeof(o2) __o2 = o2; \ typeof(o2) __n2 = n2; \ typeof(o2) __dummy; \ - alternative_io("call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP4, \ + alternative_io(CMPXCHG16B_EMU_CALL, \ "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ X86_FEATURE_CX16, \ ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 1babf8a..94e7618 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -136,6 +136,7 @@ struct cpuinfo_x86; struct task_struct; extern unsigned long profile_pc(struct pt_regs *regs); +#define profile_pc profile_pc extern unsigned long convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); @@ -202,20 +203,11 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs) #endif } -static inline unsigned long instruction_pointer(struct pt_regs *regs) -{ - return regs->ip; -} - -static inline unsigned long frame_pointer(struct pt_regs *regs) -{ - return regs->bp; -} +#define GET_IP(regs) ((regs)->ip) +#define GET_FP(regs) ((regs)->bp) +#define GET_USP(regs) ((regs)->sp) -static inline unsigned long user_stack_pointer(struct pt_regs *regs) -{ - return regs->sp; -} +#include <asm-generic/ptrace.h> /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 83e2efd..9db5583 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern unsigned long native_calibrate_tsc(void); +#ifdef CONFIG_X86_64 +extern cycles_t vread_tsc(void); +#endif + /* * Boot-time check whether the TSCs are synchronized across * all CPUs/cores: diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 99f0ad7..99ddd14 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -6,7 +6,6 @@ #include <linux/errno.h> #include <linux/compiler.h> #include <linux/thread_info.h> -#include <linux/prefetch.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h index 088d09f..566e803 100644 --- a/arch/x86/include/asm/uaccess_32.h +++ b/arch/x86/include/asm/uaccess_32.h @@ -6,7 +6,6 @@ */ #include <linux/errno.h> #include <linux/thread_info.h> -#include <linux/prefetch.h> #include <linux/string.h> #include <asm/asm.h> #include <asm/page.h> diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h index 316708d..1c66d30 100644 --- a/arch/x86/include/asm/uaccess_64.h +++ b/arch/x86/include/asm/uaccess_64.h @@ -6,7 +6,6 @@ */ #include <linux/compiler.h> #include <linux/errno.h> -#include <linux/prefetch.h> #include <linux/lockdep.h> #include <asm/alternative.h> #include <asm/cpufeature.h> diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index a755ef5..fb6a625 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -350,10 +350,11 @@ #define __NR_open_by_handle_at 342 #define __NR_clock_adjtime 343 #define __NR_syncfs 344 +#define __NR_sendmmsg 345 #ifdef __KERNEL__ -#define NR_syscalls 345 +#define NR_syscalls 346 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 160fa76..79f90eb 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -677,6 +677,8 @@ __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at) __SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) #define __NR_syncfs 306 __SYSCALL(__NR_syncfs, sys_syncfs) +#define __NR_sendmmsg 307 +__SYSCALL(__NR_sendmmsg, sys_sendmmsg) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h index 9064052..bb05228 100644 --- a/arch/x86/include/asm/vdso.h +++ b/arch/x86/include/asm/vdso.h @@ -1,20 +1,6 @@ #ifndef _ASM_X86_VDSO_H #define _ASM_X86_VDSO_H -#ifdef CONFIG_X86_64 -extern const char VDSO64_PRELINK[]; - -/* - * Given a pointer to the vDSO image, find the pointer to VDSO64_name - * as that symbol is defined in the vDSO sources or linker script. - */ -#define VDSO64_SYMBOL(base, name) \ -({ \ - extern const char VDSO64_##name[]; \ - (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \ -}) -#endif - #if defined CONFIG_X86_32 || defined CONFIG_COMPAT extern const char VDSO32_PRELINK[]; diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h index 3d61e20..646b4c1 100644 --- a/arch/x86/include/asm/vgtod.h +++ b/arch/x86/include/asm/vgtod.h @@ -23,8 +23,6 @@ struct vsyscall_gtod_data { struct timespec wall_to_monotonic; struct timespec wall_time_coarse; }; -extern struct vsyscall_gtod_data __vsyscall_gtod_data -__section_vsyscall_gtod_data; extern struct vsyscall_gtod_data vsyscall_gtod_data; #endif /* _ASM_X86_VGTOD_H */ diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h index d0983d2..d555973 100644 --- a/arch/x86/include/asm/vsyscall.h +++ b/arch/x86/include/asm/vsyscall.h @@ -16,27 +16,19 @@ enum vsyscall_num { #ifdef __KERNEL__ #include <linux/seqlock.h> -#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16))) -#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16))) - /* Definitions for CONFIG_GENERIC_TIME definitions */ -#define __section_vsyscall_gtod_data __attribute__ \ - ((unused, __section__ (".vsyscall_gtod_data"),aligned(16))) -#define __section_vsyscall_clock __attribute__ \ - ((unused, __section__ (".vsyscall_clock"),aligned(16))) #define __vsyscall_fn \ __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace #define VGETCPU_RDTSCP 1 #define VGETCPU_LSL 2 -extern int __vgetcpu_mode; -extern volatile unsigned long __jiffies; - /* kernel space (writeable) */ extern int vgetcpu_mode; extern struct timezone sys_tz; +#include <asm/vvar.h> + extern void map_vsyscall(void); #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h new file mode 100644 index 0000000..341b355 --- /dev/null +++ b/arch/x86/include/asm/vvar.h @@ -0,0 +1,52 @@ +/* + * vvar.h: Shared vDSO/kernel variable declarations + * Copyright (c) 2011 Andy Lutomirski + * Subject to the GNU General Public License, version 2 + * + * A handful of variables are accessible (read-only) from userspace + * code in the vsyscall page and the vdso. They are declared here. + * Some other file must define them with DEFINE_VVAR. + * + * In normal kernel code, they are used like any other variable. + * In user code, they are accessed through the VVAR macro. + * + * Each of these variables lives in the vsyscall page, and each + * one needs a unique offset within the little piece of the page + * reserved for vvars. Specify that offset in DECLARE_VVAR. + * (There are 896 bytes available. If you mess up, the linker will + * catch it.) + */ + +/* Offset of vars within vsyscall page */ +#define VSYSCALL_VARS_OFFSET (3072 + 128) + +#if defined(__VVAR_KERNEL_LDS) + +/* The kernel linker script defines its own magic to put vvars in the + * right place. + */ +#define DECLARE_VVAR(offset, type, name) \ + EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset) + +#else + +#define DECLARE_VVAR(offset, type, name) \ + static type const * const vvaraddr_ ## name = \ + (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset)); + +#define DEFINE_VVAR(type, name) \ + type __vvar_ ## name \ + __attribute__((section(".vsyscall_var_" #name), aligned(16))) + +#define VVAR(name) (*vvaraddr_ ## name) + +#endif + +/* DECLARE_VVAR(offset, type, name) */ + +DECLARE_VVAR(0, volatile unsigned long, jiffies) +DECLARE_VVAR(8, int, vgetcpu_mode) +DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data) + +#undef DECLARE_VVAR +#undef VSYSCALL_VARS_OFFSET diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h new file mode 100644 index 0000000..6bf5b8e --- /dev/null +++ b/arch/x86/include/asm/x2apic.h @@ -0,0 +1,62 @@ +/* + * Common bits for X2APIC cluster/physical modes. + */ + +#ifndef _ASM_X86_X2APIC_H +#define _ASM_X86_X2APIC_H + +#include <asm/apic.h> +#include <asm/ipi.h> +#include <linux/cpumask.h> + +/* + * Need to use more than cpu 0, because we need more vectors + * when MSI-X are used. + */ +static const struct cpumask *x2apic_target_cpus(void) +{ + return cpu_online_mask; +} + +static int x2apic_apic_id_registered(void) +{ + return 1; +} + +/* + * For now each logical cpu is in its own vector allocation domain. + */ +static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +static void +__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +{ + unsigned long cfg = __prepare_ICR(0, vector, dest); + native_x2apic_icr_write(cfg, apicid); +} + +static unsigned int x2apic_get_apic_id(unsigned long id) +{ + return id; +} + +static unsigned long x2apic_set_apic_id(unsigned int id) +{ + return id; +} + +static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +{ + return initial_apicid >> index_msb; +} + +static void x2apic_send_IPI_self(int vector) +{ + apic_write(APIC_SELF_IPI, vector); +} + +#endif /* _ASM_X86_X2APIC_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 8508bfe..d240ea9 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -447,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg) return _hypercall2(unsigned long, hvm_op, op, arg); } +static inline int +HYPERVISOR_tmem_op( + struct tmem_op *op) +{ + return _hypercall1(int, tmem_op, op); +} + static inline void MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) { diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 2508064..f5abe3a 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -8,7 +8,6 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities -CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_pvclock.o = -pg @@ -24,13 +23,16 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) CFLAGS_hpet.o := $(nostackp) -CFLAGS_tsc.o := $(nostackp) +CFLAGS_vread_tsc_64.o := $(nostackp) CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n GCOV_PROFILE_tsc.o := n GCOV_PROFILE_paravirt.o := n +# vread_tsc_64 is hot and should be fully optimized: +CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls + obj-y := process_$(BITS).o signal.o entry_$(BITS).o obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o obj-y += time.o ioport.o ldt.o dumpstack.o @@ -39,7 +41,7 @@ obj-$(CONFIG_IRQ_WORK) += irq_work.o obj-y += probe_roms.o obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o -obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o +obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o obj-y += bootflag.o e820.o obj-y += pci-dma.o quirks.o topology.o kdebugfs.o obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 9a966c5..4558f0d 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -970,7 +970,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) mp_irq.irqflag = (trigger << 2) | polarity; mp_irq.srcbus = MP_ISA_BUS; mp_irq.srcbusirq = bus_irq; /* IRQ */ - mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ + mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */ mp_irq.dstirq = pin; /* INTIN# */ mp_save_irq(&mp_irq); @@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void) if (ioapic < 0) continue; pin = mp_find_ioapic_pin(ioapic, gsi); - dstapic = mp_ioapics[ioapic].apicid; + dstapic = mpc_ioapic_id(ioapic); for (idx = 0; idx < mp_irq_entries; idx++) { struct mpc_intsrc *irq = mp_irqs + idx; @@ -1082,7 +1082,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger, mp_irq.srcbus = number; mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); ioapic = mp_find_ioapic(gsi); - mp_irq.dstapic = mp_ioapics[ioapic].apicid; + mp_irq.dstapic = mpc_ioapic_id(ioapic); mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); mp_save_irq(&mp_irq); @@ -1113,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " - "%d-%d\n", mp_ioapics[ioapic].apicid, + "%d-%d\n", mpc_ioapic_id(ioapic), ioapic_pin); return gsi; } diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index 873e7e1..cd8cbeb 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -1538,13 +1538,11 @@ static void do_detach(struct device *dev) { struct iommu_dev_data *dev_data; struct amd_iommu *iommu; - struct pci_dev *pdev; u16 devid; devid = get_device_id(dev); iommu = amd_iommu_rlookup_table[devid]; dev_data = get_dev_data(dev); - pdev = to_pci_dev(dev); /* decrease reference counters */ dev_data->domain->dev_iommu[iommu->index] -= 1; @@ -1703,10 +1701,9 @@ static struct protection_domain *domain_for_device(struct device *dev) struct protection_domain *dom; struct iommu_dev_data *dev_data, *alias_data; unsigned long flags; - u16 devid, alias; + u16 devid; devid = get_device_id(dev); - alias = amd_iommu_alias_table[devid]; dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); if (!alias_data) diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 3966b56..767fd04 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,20 +2,25 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o obj-y += hw_nmi.o obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) -obj-y += apic_flat_64.o -obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o -obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +# APIC probe will depend on the listing order here obj-$(CONFIG_X86_UV) += x2apic_uv_x.o +obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o +obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o +obj-y += apic_flat_64.o endif -obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o +# APIC probe will depend on the listing order here obj-$(CONFIG_X86_NUMAQ) += numaq_32.o -obj-$(CONFIG_X86_ES7000) += es7000_32.o obj-$(CONFIG_X86_SUMMIT) += summit_32.o +obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o +obj-$(CONFIG_X86_ES7000) += es7000_32.o + +# For 32bit, probe_32 need to be listed last +obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f92a8e5..b961af8 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1461,7 +1461,6 @@ int __init enable_IR(void) void __init enable_IR_x2apic(void) { unsigned long flags; - struct IO_APIC_route_entry **ioapic_entries; int ret, x2apic_enabled = 0; int dmar_table_init_ret; @@ -1469,13 +1468,7 @@ void __init enable_IR_x2apic(void) if (dmar_table_init_ret && !x2apic_supported()) return; - ioapic_entries = alloc_ioapic_entries(); - if (!ioapic_entries) { - pr_err("Allocate ioapic_entries failed\n"); - goto out; - } - - ret = save_IO_APIC_setup(ioapic_entries); + ret = save_ioapic_entries(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); goto out; @@ -1483,7 +1476,7 @@ void __init enable_IR_x2apic(void) local_irq_save(flags); legacy_pic->mask_all(); - mask_IO_APIC_setup(ioapic_entries); + mask_ioapic_entries(); if (dmar_table_init_ret) ret = 0; @@ -1514,14 +1507,11 @@ void __init enable_IR_x2apic(void) nox2apic: if (!ret) /* IR enabling failed */ - restore_IO_APIC_setup(ioapic_entries); + restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); out: - if (ioapic_entries) - free_ioapic_entries(ioapic_entries); - if (x2apic_enabled) return; @@ -2095,28 +2085,20 @@ static void lapic_resume(void) { unsigned int l, h; unsigned long flags; - int maxlvt, ret; - struct IO_APIC_route_entry **ioapic_entries = NULL; + int maxlvt; if (!apic_pm_state.active) return; local_irq_save(flags); if (intr_remapping_enabled) { - ioapic_entries = alloc_ioapic_entries(); - if (!ioapic_entries) { - WARN(1, "Alloc ioapic_entries in lapic resume failed."); - goto restore; - } - - ret = save_IO_APIC_setup(ioapic_entries); - if (ret) { - WARN(1, "Saving IO-APIC state failed: %d\n", ret); - free_ioapic_entries(ioapic_entries); - goto restore; - } - - mask_IO_APIC_setup(ioapic_entries); + /* + * IO-APIC and PIC have their own resume routines. + * We just mask them here to make sure the interrupt + * subsystem is completely quiet while we enable x2apic + * and interrupt-remapping. + */ + mask_ioapic_entries(); legacy_pic->mask_all(); } @@ -2159,13 +2141,9 @@ static void lapic_resume(void) apic_write(APIC_ESR, 0); apic_read(APIC_ESR); - if (intr_remapping_enabled) { + if (intr_remapping_enabled) reenable_intr_remapping(x2apic_mode); - legacy_pic->restore_mask(); - restore_IO_APIC_setup(ioapic_entries); - free_ioapic_entries(ioapic_entries); - } -restore: + local_irq_restore(flags); } diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 5652d31..f7a41e4 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -16,6 +16,7 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/hardirq.h> +#include <linux/module.h> #include <asm/smp.h> #include <asm/apic.h> #include <asm/ipi.h> @@ -24,6 +25,12 @@ #include <acpi/acpi_bus.h> #endif +static struct apic apic_physflat; +static struct apic apic_flat; + +struct apic __read_mostly *apic = &apic_flat; +EXPORT_SYMBOL_GPL(apic); + static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return 1; @@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } -struct apic apic_flat = { +static struct apic apic_flat = { .name = "flat", .probe = NULL, .acpi_madt_oem_check = flat_acpi_madt_oem_check, @@ -312,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_apicid, cpu); } -struct apic apic_physflat = { +static int physflat_probe(void) +{ + if (apic == &apic_physflat || num_possible_cpus() > 8) + return 1; + + return 0; +} + +static struct apic apic_physflat = { .name = "physical flat", - .probe = NULL, + .probe = physflat_probe, .acpi_madt_oem_check = physflat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, @@ -369,3 +384,8 @@ struct apic apic_physflat = { .wait_icr_idle = native_apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, }; + +/* + * We need to check for physflat first, so this order is important. + */ +apic_drivers(apic_physflat, apic_flat); diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c index d84ac5a5..efd737e 100644 --- a/arch/x86/kernel/apic/bigsmp_32.c +++ b/arch/x86/kernel/apic/bigsmp_32.c @@ -193,7 +193,7 @@ static int probe_bigsmp(void) return dmi_bigsmp; } -struct apic apic_bigsmp = { +static struct apic apic_bigsmp = { .name = "bigsmp", .probe = probe_bigsmp, @@ -254,3 +254,13 @@ struct apic apic_bigsmp = { .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, }; + +struct apic * __init generic_bigsmp_probe(void) +{ + if (probe_bigsmp()) + return &apic_bigsmp; + + return NULL; +} + +apic_driver(apic_bigsmp); diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 70533de..9536b3f 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -620,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem, } /* We've been warned by a false positive warning.Use __refdata to keep calm. */ -struct apic __refdata apic_es7000_cluster = { +static struct apic __refdata apic_es7000_cluster = { .name = "es7000", .probe = probe_es7000, @@ -685,7 +685,7 @@ struct apic __refdata apic_es7000_cluster = { .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; -struct apic __refdata apic_es7000 = { +static struct apic __refdata apic_es7000 = { .name = "es7000", .probe = probe_es7000, @@ -747,3 +747,9 @@ struct apic __refdata apic_es7000 = { .x86_32_early_logical_apicid = es7000_early_logical_apicid, }; + +/* + * Need to check for es7000 followed by es7000_cluster, so this order + * in apic_drivers is important. + */ +apic_drivers(apic_es7000, apic_es7000_cluster); diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index 5260fe9..d5e57db0 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -19,9 +19,9 @@ #include <linux/delay.h> #ifdef CONFIG_HARDLOCKUP_DETECTOR -u64 hw_nmi_get_sample_period(void) +u64 hw_nmi_get_sample_period(int watchdog_thresh) { - return (u64)(cpu_khz) * 1000 * 60; + return (u64)(cpu_khz) * 1000 * watchdog_thresh; } #endif diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 45fd33d..e529339 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -76,17 +76,40 @@ int sis_apic_bug = -1; static DEFINE_RAW_SPINLOCK(ioapic_lock); static DEFINE_RAW_SPINLOCK(vector_lock); -/* - * # of IRQ routing registers - */ -int nr_ioapic_registers[MAX_IO_APICS]; +static struct ioapic { + /* + * # of IRQ routing registers + */ + int nr_registers; + /* + * Saved state during suspend/resume, or while enabling intr-remap. + */ + struct IO_APIC_route_entry *saved_registers; + /* I/O APIC config */ + struct mpc_ioapic mp_config; + /* IO APIC gsi routing info */ + struct mp_ioapic_gsi gsi_config; + DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); +} ioapics[MAX_IO_APICS]; -/* I/O APIC entries */ -struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; -int nr_ioapics; +#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver + +int mpc_ioapic_id(int id) +{ + return ioapics[id].mp_config.apicid; +} -/* IO APIC gsi routing info */ -struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; +unsigned int mpc_ioapic_addr(int id) +{ + return ioapics[id].mp_config.apicaddr; +} + +struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id) +{ + return &ioapics[id].gsi_config; +} + +int nr_ioapics; /* The one past the highest gsi number used */ u32 gsi_top; @@ -179,6 +202,14 @@ int __init arch_early_irq_init(void) io_apic_irqs = ~0UL; } + for (i = 0; i < nr_ioapics; i++) { + ioapics[i].saved_registers = + kzalloc(sizeof(struct IO_APIC_route_entry) * + ioapics[i].nr_registers, GFP_KERNEL); + if (!ioapics[i].saved_registers) + pr_err("IOAPIC %d: suspend/resume impossible!\n", i); + } + cfg = irq_cfgx; count = ARRAY_SIZE(irq_cfgx); node = cpu_to_node(0); @@ -297,7 +328,7 @@ struct io_apic { static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) { return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) - + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); + + (mpc_ioapic_addr(idx) & ~PAGE_MASK); } static inline void io_apic_eoi(unsigned int apic, unsigned int vector) @@ -573,7 +604,7 @@ static void clear_IO_APIC (void) int apic, pin; for (apic = 0; apic < nr_ioapics; apic++) - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) clear_IO_APIC_pin(apic, pin); } @@ -615,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str) __setup("pirq=", ioapic_pirq_setup); #endif /* CONFIG_X86_32 */ -struct IO_APIC_route_entry **alloc_ioapic_entries(void) -{ - int apic; - struct IO_APIC_route_entry **ioapic_entries; - - ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics, - GFP_KERNEL); - if (!ioapic_entries) - return 0; - - for (apic = 0; apic < nr_ioapics; apic++) { - ioapic_entries[apic] = - kzalloc(sizeof(struct IO_APIC_route_entry) * - nr_ioapic_registers[apic], GFP_KERNEL); - if (!ioapic_entries[apic]) - goto nomem; - } - - return ioapic_entries; - -nomem: - while (--apic >= 0) - kfree(ioapic_entries[apic]); - kfree(ioapic_entries); - - return 0; -} - /* * Saves all the IO-APIC RTE's */ -int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +int save_ioapic_entries(void) { int apic, pin; - - if (!ioapic_entries) - return -ENOMEM; + int err = 0; for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - return -ENOMEM; + if (!ioapics[apic].saved_registers) { + err = -ENOMEM; + continue; + } - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) - ioapic_entries[apic][pin] = + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) + ioapics[apic].saved_registers[pin] = ioapic_read_entry(apic, pin); } - return 0; + return err; } /* * Mask all IO APIC entries. */ -void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +void mask_ioapic_entries(void) { int apic, pin; - if (!ioapic_entries) - return; - for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - break; + if (!ioapics[apic].saved_registers) + continue; - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { struct IO_APIC_route_entry entry; - entry = ioapic_entries[apic][pin]; + entry = ioapics[apic].saved_registers[pin]; if (!entry.mask) { entry.mask = 1; ioapic_write_entry(apic, pin, entry); @@ -692,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) } /* - * Restore IO APIC entries which was saved in ioapic_entries. + * Restore IO APIC entries which was saved in the ioapic structure. */ -int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) +int restore_ioapic_entries(void) { int apic, pin; - if (!ioapic_entries) - return -ENOMEM; - for (apic = 0; apic < nr_ioapics; apic++) { - if (!ioapic_entries[apic]) - return -ENOMEM; + if (!ioapics[apic].saved_registers) + continue; - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) ioapic_write_entry(apic, pin, - ioapic_entries[apic][pin]); + ioapics[apic].saved_registers[pin]); } return 0; } -void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries) -{ - int apic; - - for (apic = 0; apic < nr_ioapics; apic++) - kfree(ioapic_entries[apic]); - - kfree(ioapic_entries); -} - /* * Find the IRQ entry number of a certain pin. */ @@ -731,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].irqtype == type && - (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || + (mp_irqs[i].dstapic == mpc_ioapic_id(apic) || mp_irqs[i].dstapic == MP_APIC_ALL) && mp_irqs[i].dstirq == pin) return i; @@ -773,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type) if (i < mp_irq_entries) { int apic; for(apic = 0; apic < nr_ioapics; apic++) { - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) + if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic) return apic; } } @@ -942,6 +929,7 @@ static int pin_2_irq(int idx, int apic, int pin) { int irq; int bus = mp_irqs[idx].srcbus; + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic); /* * Debugging check, we are in big trouble if this message pops up! @@ -952,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin) if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; } else { - u32 gsi = mp_gsi_routing[apic].gsi_base + pin; + u32 gsi = gsi_cfg->gsi_base + pin; if (gsi >= NR_IRQS_LEGACY) irq = gsi; @@ -1003,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin, int lbus = mp_irqs[i].srcbus; for (apic = 0; apic < nr_ioapics; apic++) - if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || + if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic || mp_irqs[i].dstapic == MP_APIC_ALL) break; @@ -1222,7 +1210,7 @@ static inline int IO_APIC_irq_trigger(int irq) int apic, idx, pin; for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { idx = find_irq_entry(apic, pin, mp_INT); if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) return irq_trigger(idx); @@ -1350,14 +1338,14 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, apic_printk(APIC_VERBOSE,KERN_DEBUG "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " "IRQ %d Mode:%i Active:%i)\n", - apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, + apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector, irq, trigger, polarity); - if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, + if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry, dest, trigger, polarity, cfg->vector, pin)) { printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", - mp_ioapics[apic_id].apicid, pin); + mpc_ioapic_id(apic_id), pin); __clear_irq_vector(irq, cfg); return; } @@ -1369,17 +1357,13 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq, ioapic_write_entry(apic_id, pin, entry); } -static struct { - DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1); -} mp_ioapic_routing[MAX_IO_APICS]; - static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) { if (idx != -1) return false; apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", - mp_ioapics[apic_id].apicid, pin); + mpc_ioapic_id(apic_id), pin); return true; } @@ -1389,7 +1373,7 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id) struct io_apic_irq_attr attr; unsigned int pin, irq; - for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { + for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) { idx = find_irq_entry(apic_id, pin, mp_INT); if (io_apic_pin_not_connected(idx, apic_id, pin)) continue; @@ -1511,7 +1495,7 @@ __apicdebuginit(void) print_IO_APIC(void) printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); for (i = 0; i < nr_ioapics; i++) printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", - mp_ioapics[i].apicid, nr_ioapic_registers[i]); + mpc_ioapic_id(i), ioapics[i].nr_registers); /* * We are a bit conservative about what we expect. We have to @@ -1531,7 +1515,7 @@ __apicdebuginit(void) print_IO_APIC(void) raw_spin_unlock_irqrestore(&ioapic_lock, flags); printk("\n"); - printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); + printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic)); printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); @@ -1825,7 +1809,7 @@ void __init enable_IO_APIC(void) for(apic = 0; apic < nr_ioapics; apic++) { int pin; /* See if any of the pins is in ExtINT mode */ - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + for (pin = 0; pin < ioapics[apic].nr_registers; pin++) { struct IO_APIC_route_entry entry; entry = ioapic_read_entry(apic, pin); @@ -1949,14 +1933,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) reg_00.raw = io_apic_read(apic_id, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - old_id = mp_ioapics[apic_id].apicid; + old_id = mpc_ioapic_id(apic_id); - if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { + if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", - apic_id, mp_ioapics[apic_id].apicid); + apic_id, mpc_ioapic_id(apic_id)); printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", reg_00.bits.ID); - mp_ioapics[apic_id].apicid = reg_00.bits.ID; + ioapics[apic_id].mp_config.apicid = reg_00.bits.ID; } /* @@ -1965,9 +1949,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * 'stuck on smp_invalidate_needed IPI wait' messages. */ if (apic->check_apicid_used(&phys_id_present_map, - mp_ioapics[apic_id].apicid)) { + mpc_ioapic_id(apic_id))) { printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", - apic_id, mp_ioapics[apic_id].apicid); + apic_id, mpc_ioapic_id(apic_id)); for (i = 0; i < get_physical_broadcast(); i++) if (!physid_isset(i, phys_id_present_map)) break; @@ -1976,13 +1960,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", i); physid_set(i, phys_id_present_map); - mp_ioapics[apic_id].apicid = i; + ioapics[apic_id].mp_config.apicid = i; } else { physid_mask_t tmp; - apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); + apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id), + &tmp); apic_printk(APIC_VERBOSE, "Setting %d in the " "phys_id_present_map\n", - mp_ioapics[apic_id].apicid); + mpc_ioapic_id(apic_id)); physids_or(phys_id_present_map, phys_id_present_map, tmp); } @@ -1990,24 +1975,24 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) * We need to adjust the IRQ routing table * if the ID changed. */ - if (old_id != mp_ioapics[apic_id].apicid) + if (old_id != mpc_ioapic_id(apic_id)) for (i = 0; i < mp_irq_entries; i++) if (mp_irqs[i].dstapic == old_id) mp_irqs[i].dstapic - = mp_ioapics[apic_id].apicid; + = mpc_ioapic_id(apic_id); /* * Update the ID register according to the right value * from the MPC table if they are different. */ - if (mp_ioapics[apic_id].apicid == reg_00.bits.ID) + if (mpc_ioapic_id(apic_id) == reg_00.bits.ID) continue; apic_printk(APIC_VERBOSE, KERN_INFO "...changing IO-APIC physical APIC ID to %d ...", - mp_ioapics[apic_id].apicid); + mpc_ioapic_id(apic_id)); - reg_00.bits.ID = mp_ioapics[apic_id].apicid; + reg_00.bits.ID = mpc_ioapic_id(apic_id); raw_spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic_id, 0, reg_00.raw); raw_spin_unlock_irqrestore(&ioapic_lock, flags); @@ -2018,7 +2003,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void) raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(apic_id, 0); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) + if (reg_00.bits.ID != mpc_ioapic_id(apic_id)) printk("could not set ID!\n"); else apic_printk(APIC_VERBOSE, " ok.\n"); @@ -2404,7 +2389,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) raw_spin_lock_irqsave(&ioapic_lock, flags); for_each_irq_pin(entry, cfg->irq_2_pin) { - if (mp_ioapics[entry->apic].apicver >= 0x20) { + if (mpc_ioapic_ver(entry->apic) >= 0x20) { /* * Intr-remapping uses pin number as the virtual vector * in the RTE. Actual vector is programmed in @@ -2918,49 +2903,19 @@ static int __init io_apic_bug_finalize(void) late_initcall(io_apic_bug_finalize); -static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS]; - -static void suspend_ioapic(int ioapic_id) +static void resume_ioapic_id(int ioapic_id) { - struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; - int i; - - if (!saved_data) - return; - - for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) - saved_data[i] = ioapic_read_entry(ioapic_id, i); -} - -static int ioapic_suspend(void) -{ - int ioapic_id; - - for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++) - suspend_ioapic(ioapic_id); - - return 0; -} - -static void resume_ioapic(int ioapic_id) -{ - struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id]; unsigned long flags; union IO_APIC_reg_00 reg_00; - int i; - if (!saved_data) - return; raw_spin_lock_irqsave(&ioapic_lock, flags); reg_00.raw = io_apic_read(ioapic_id, 0); - if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) { - reg_00.bits.ID = mp_ioapics[ioapic_id].apicid; + if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) { + reg_00.bits.ID = mpc_ioapic_id(ioapic_id); io_apic_write(ioapic_id, 0, reg_00.raw); } raw_spin_unlock_irqrestore(&ioapic_lock, flags); - for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++) - ioapic_write_entry(ioapic_id, i, saved_data[i]); } static void ioapic_resume(void) @@ -2968,28 +2923,18 @@ static void ioapic_resume(void) int ioapic_id; for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) - resume_ioapic(ioapic_id); + resume_ioapic_id(ioapic_id); + + restore_ioapic_entries(); } static struct syscore_ops ioapic_syscore_ops = { - .suspend = ioapic_suspend, + .suspend = save_ioapic_entries, .resume = ioapic_resume, }; static int __init ioapic_init_ops(void) { - int i; - - for (i = 0; i < nr_ioapics; i++) { - unsigned int size; - - size = nr_ioapic_registers[i] - * sizeof(struct IO_APIC_route_entry); - ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL); - if (!ioapic_saved_data[i]) - pr_err("IOAPIC %d: suspend/resume impossible!\n", i); - } - register_syscore_ops(&ioapic_syscore_ops); return 0; @@ -3592,14 +3537,14 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node, int ret; /* Avoid redundant programming */ - if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) { + if (test_bit(pin, ioapics[id].pin_programmed)) { pr_debug("Pin %d-%d already programmed\n", - mp_ioapics[id].apicid, pin); + mpc_ioapic_id(id), pin); return 0; } ret = io_apic_setup_irq_pin(irq, node, attr); if (!ret) - set_bit(pin, mp_ioapic_routing[id].pin_programmed); + set_bit(pin, ioapics[id].pin_programmed); return ret; } @@ -3764,8 +3709,7 @@ static u8 __init io_apic_unique_id(u8 id) bitmap_zero(used, 256); for (i = 0; i < nr_ioapics; i++) { - struct mpc_ioapic *ia = &mp_ioapics[i]; - __set_bit(ia->apicid, used); + __set_bit(mpc_ioapic_id(i), used); } if (!test_bit(id, used)) return id; @@ -3825,7 +3769,7 @@ void __init setup_ioapic_dest(void) return; for (ioapic = 0; ioapic < nr_ioapics; ioapic++) - for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { + for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) { irq_entry = find_irq_entry(ioapic, pin, mp_INT); if (irq_entry == -1) continue; @@ -3896,7 +3840,7 @@ void __init ioapic_and_gsi_init(void) ioapic_res = ioapic_setup_resources(nr_ioapics); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { - ioapic_phys = mp_ioapics[i].apicaddr; + ioapic_phys = mpc_ioapic_addr(i); #ifdef CONFIG_X86_32 if (!ioapic_phys) { printk(KERN_ERR @@ -3956,8 +3900,9 @@ int mp_find_ioapic(u32 gsi) /* Find the IOAPIC that manages this GSI. */ for (i = 0; i < nr_ioapics; i++) { - if ((gsi >= mp_gsi_routing[i].gsi_base) - && (gsi <= mp_gsi_routing[i].gsi_end)) + struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i); + if ((gsi >= gsi_cfg->gsi_base) + && (gsi <= gsi_cfg->gsi_end)) return i; } @@ -3967,12 +3912,16 @@ int mp_find_ioapic(u32 gsi) int mp_find_ioapic_pin(int ioapic, u32 gsi) { + struct mp_ioapic_gsi *gsi_cfg; + if (WARN_ON(ioapic == -1)) return -1; - if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) + + gsi_cfg = mp_ioapic_gsi_routing(ioapic); + if (WARN_ON(gsi > gsi_cfg->gsi_end)) return -1; - return gsi - mp_gsi_routing[ioapic].gsi_base; + return gsi - gsi_cfg->gsi_base; } static __init int bad_ioapic(unsigned long address) @@ -3994,40 +3943,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; int entries; + struct mp_ioapic_gsi *gsi_cfg; if (bad_ioapic(address)) return; idx = nr_ioapics; - mp_ioapics[idx].type = MP_IOAPIC; - mp_ioapics[idx].flags = MPC_APIC_USABLE; - mp_ioapics[idx].apicaddr = address; + ioapics[idx].mp_config.type = MP_IOAPIC; + ioapics[idx].mp_config.flags = MPC_APIC_USABLE; + ioapics[idx].mp_config.apicaddr = address; set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); - mp_ioapics[idx].apicid = io_apic_unique_id(id); - mp_ioapics[idx].apicver = io_apic_get_version(idx); + ioapics[idx].mp_config.apicid = io_apic_unique_id(id); + ioapics[idx].mp_config.apicver = io_apic_get_version(idx); /* * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ entries = io_apic_get_redir_entries(idx); - mp_gsi_routing[idx].gsi_base = gsi_base; - mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; + gsi_cfg = mp_ioapic_gsi_routing(idx); + gsi_cfg->gsi_base = gsi_base; + gsi_cfg->gsi_end = gsi_base + entries - 1; /* * The number of IO-APIC IRQ registers (== #pins): */ - nr_ioapic_registers[idx] = entries; + ioapics[idx].nr_registers = entries; - if (mp_gsi_routing[idx].gsi_end >= gsi_top) - gsi_top = mp_gsi_routing[idx].gsi_end + 1; + if (gsi_cfg->gsi_end >= gsi_top) + gsi_top = gsi_cfg->gsi_end + 1; printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " - "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, - mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, - mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); + "GSI %d-%d\n", idx, mpc_ioapic_id(idx), + mpc_ioapic_ver(idx), mpc_ioapic_addr(idx), + gsi_cfg->gsi_base, gsi_cfg->gsi_end); nr_ioapics++; } diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c index 30f1331..c4a61ca 100644 --- a/arch/x86/kernel/apic/numaq_32.c +++ b/arch/x86/kernel/apic/numaq_32.c @@ -472,8 +472,8 @@ static void numaq_setup_portio_remap(void) (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); } -/* Use __refdata to keep false positive warning calm. */ -struct apic __refdata apic_numaq = { +/* Use __refdata to keep false positive warning calm. */ +static struct apic __refdata apic_numaq = { .name = "NUMAQ", .probe = probe_numaq, @@ -537,3 +537,5 @@ struct apic __refdata apic_numaq = { .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, .x86_32_numa_cpu_node = numaq_numa_cpu_node, }; + +apic_driver(apic_numaq); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 6541e47..b5254ad 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -52,31 +52,6 @@ static int __init print_ipi_mode(void) } late_initcall(print_ipi_mode); -void __init default_setup_apic_routing(void) -{ - int version = apic_version[boot_cpu_physical_apicid]; - - if (num_possible_cpus() > 8) { - switch (boot_cpu_data.x86_vendor) { - case X86_VENDOR_INTEL: - if (!APIC_XAPIC(version)) { - def_to_bigsmp = 0; - break; - } - /* If P4 and above fall through */ - case X86_VENDOR_AMD: - def_to_bigsmp = 1; - } - } - -#ifdef CONFIG_X86_BIGSMP - generic_bigsmp_probe(); -#endif - - if (apic->setup_apic_routing) - apic->setup_apic_routing(); -} - static int default_x86_32_early_logical_apicid(int cpu) { return 1 << cpu; @@ -112,7 +87,7 @@ static int probe_default(void) return 1; } -struct apic apic_default = { +static struct apic apic_default = { .name = "default", .probe = probe_default, @@ -174,44 +149,22 @@ struct apic apic_default = { .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, }; -extern struct apic apic_numaq; -extern struct apic apic_summit; -extern struct apic apic_bigsmp; -extern struct apic apic_es7000; -extern struct apic apic_es7000_cluster; +apic_driver(apic_default); struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); -static struct apic *apic_probe[] __initdata = { -#ifdef CONFIG_X86_NUMAQ - &apic_numaq, -#endif -#ifdef CONFIG_X86_SUMMIT - &apic_summit, -#endif -#ifdef CONFIG_X86_BIGSMP - &apic_bigsmp, -#endif -#ifdef CONFIG_X86_ES7000 - &apic_es7000, - &apic_es7000_cluster, -#endif - &apic_default, /* must be last */ - NULL, -}; - static int cmdline_apic __initdata; static int __init parse_apic(char *arg) { - int i; + struct apic **drv; if (!arg) return -EINVAL; - for (i = 0; apic_probe[i]; i++) { - if (!strcmp(apic_probe[i]->name, arg)) { - apic = apic_probe[i]; + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!strcmp((*drv)->name, arg)) { + apic = *drv; cmdline_apic = 1; return 0; } @@ -222,38 +175,58 @@ static int __init parse_apic(char *arg) } early_param("apic", parse_apic); -void __init generic_bigsmp_probe(void) +void __init default_setup_apic_routing(void) { + int version = apic_version[boot_cpu_physical_apicid]; + + if (num_possible_cpus() > 8) { + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + if (!APIC_XAPIC(version)) { + def_to_bigsmp = 0; + break; + } + /* If P4 and above fall through */ + case X86_VENDOR_AMD: + def_to_bigsmp = 1; + } + } + #ifdef CONFIG_X86_BIGSMP /* - * This routine is used to switch to bigsmp mode when + * This is used to switch to bigsmp mode when * - There is no apic= option specified by the user * - generic_apic_probe() has chosen apic_default as the sub_arch * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support */ if (!cmdline_apic && apic == &apic_default) { - if (apic_bigsmp.probe()) { - apic = &apic_bigsmp; + struct apic *bigsmp = generic_bigsmp_probe(); + if (bigsmp) { + apic = bigsmp; printk(KERN_INFO "Overriding APIC driver with %s\n", apic->name); } } #endif + + if (apic->setup_apic_routing) + apic->setup_apic_routing(); } void __init generic_apic_probe(void) { if (!cmdline_apic) { - int i; - for (i = 0; apic_probe[i]; i++) { - if (apic_probe[i]->probe()) { - apic = apic_probe[i]; + struct apic **drv; + + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe()) { + apic = *drv; break; } } /* Not visible without early console */ - if (!apic_probe[i]) + if (drv == __apicdrivers_end) panic("Didn't find an APIC driver"); } printk(KERN_INFO "Using APIC driver %s\n", apic->name); @@ -264,16 +237,16 @@ void __init generic_apic_probe(void) int __init generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (!apic_probe[i]->mps_oem_check) + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!((*drv)->mps_oem_check)) continue; - if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) + if (!(*drv)->mps_oem_check(mpc, oem, productid)) continue; if (!cmdline_apic) { - apic = apic_probe[i]; + apic = *drv; printk(KERN_INFO "Switched to APIC driver `%s'.\n", apic->name); } @@ -284,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (!apic_probe[i]->acpi_madt_oem_check) + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if (!(*drv)->acpi_madt_oem_check) continue; - if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) + if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) continue; if (!cmdline_apic) { - apic = apic_probe[i]; + apic = *drv; printk(KERN_INFO "Switched to APIC driver `%s'.\n", apic->name); } diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c index d8c4a6f..3fe9866 100644 --- a/arch/x86/kernel/apic/probe_64.c +++ b/arch/x86/kernel/apic/probe_64.c @@ -23,27 +23,6 @@ #include <asm/ipi.h> #include <asm/setup.h> -extern struct apic apic_flat; -extern struct apic apic_physflat; -extern struct apic apic_x2xpic_uv_x; -extern struct apic apic_x2apic_phys; -extern struct apic apic_x2apic_cluster; - -struct apic __read_mostly *apic = &apic_flat; -EXPORT_SYMBOL_GPL(apic); - -static struct apic *apic_probe[] __initdata = { -#ifdef CONFIG_X86_UV - &apic_x2apic_uv_x, -#endif -#ifdef CONFIG_X86_X2APIC - &apic_x2apic_phys, - &apic_x2apic_cluster, -#endif - &apic_physflat, - NULL, -}; - static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) { return hard_smp_processor_id() >> index_msb; @@ -54,26 +33,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) */ void __init default_setup_apic_routing(void) { + struct apic **drv; enable_IR_x2apic(); -#ifdef CONFIG_X86_X2APIC - if (x2apic_mode -#ifdef CONFIG_X86_UV - && apic != &apic_x2apic_uv_x -#endif - ) { - if (x2apic_phys) - apic = &apic_x2apic_phys; - else - apic = &apic_x2apic_cluster; + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->probe && (*drv)->probe()) { + if (apic != *drv) { + apic = *drv; + pr_info("Switched APIC routing to %s.\n", + apic->name); + } + break; + } } -#endif - - if (apic == &apic_flat && num_possible_cpus() > 8) - apic = &apic_physflat; - - printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); if (is_vsmp_box()) { /* need to update phys_pkg_id */ @@ -90,13 +63,15 @@ void apic_send_IPI_self(int vector) int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int i; + struct apic **drv; - for (i = 0; apic_probe[i]; ++i) { - if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { - apic = apic_probe[i]; - printk(KERN_INFO "Setting APIC routing to %s.\n", - apic->name); + for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) { + if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) { + if (apic != *drv) { + apic = *drv; + pr_info("Setting APIC routing to %s.\n", + apic->name); + } return 1; } } diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c index 35bcd7d..1911442 100644 --- a/arch/x86/kernel/apic/summit_32.c +++ b/arch/x86/kernel/apic/summit_32.c @@ -491,7 +491,7 @@ void setup_summit(void) } #endif -struct apic apic_summit = { +static struct apic apic_summit = { .name = "summit", .probe = probe_summit, @@ -552,3 +552,5 @@ struct apic apic_summit = { .x86_32_early_logical_apicid = summit_early_logical_apicid, }; + +apic_driver(apic_summit); diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 90949bb..5007958 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -5,118 +5,95 @@ #include <linux/ctype.h> #include <linux/init.h> #include <linux/dmar.h> +#include <linux/cpu.h> #include <asm/smp.h> -#include <asm/apic.h> -#include <asm/ipi.h> +#include <asm/x2apic.h> static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster); +static DEFINE_PER_CPU(cpumask_var_t, ipi_mask); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { return x2apic_enabled(); } -/* - * need to use more than cpu 0, because we need more vectors when - * MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) +static inline u32 x2apic_cluster(int cpu) { - return cpu_online_mask; -} - -/* - * for now each logical cpu is in its own vector allocation domain. - */ -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); + return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16; } static void - __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { - unsigned long cfg; + struct cpumask *cpus_in_cluster_ptr; + struct cpumask *ipi_mask_ptr; + unsigned int cpu, this_cpu; + unsigned long flags; + u32 dest; + + x2apic_wrmsr_fence(); + + local_irq_save(flags); - cfg = __prepare_ICR(0, vector, dest); + this_cpu = smp_processor_id(); /* - * send the IPI. + * We are to modify mask, so we need an own copy + * and be sure it's manipulated with irq off. */ - native_x2apic_icr_write(cfg, apicid); -} + ipi_mask_ptr = __raw_get_cpu_var(ipi_mask); + cpumask_copy(ipi_mask_ptr, mask); -/* - * for now, we send the IPI's one by one in the cpumask. - * TBD: Based on the cpu mask, we can send the IPI's to the cluster group - * at once. We have 16 cpu's in a cluster. This will minimize IPI register - * writes. - */ -static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) -{ - unsigned long query_cpu; - unsigned long flags; + /* + * The idea is to send one IPI per cluster. + */ + for_each_cpu(cpu, ipi_mask_ptr) { + unsigned long i; - x2apic_wrmsr_fence(); + cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu); + dest = 0; - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); + /* Collect cpus in cluster. */ + for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) { + if (apic_dest == APIC_DEST_ALLINC || i != this_cpu) + dest |= per_cpu(x86_cpu_to_logical_apicid, i); + } + + if (!dest) + continue; + + __x2apic_send_IPI_dest(dest, vector, apic->dest_logical); + /* + * Cluster sibling cpus should be discared now so + * we would not send IPI them second time. + */ + cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr); } + local_irq_restore(flags); } +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_allbutself(int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_online_cpu(query_cpu) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_logical_apicid, query_cpu), - vector, apic->dest_logical); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_mask, vector); -} - -static int x2apic_apic_id_registered(void) -{ - return 1; + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) @@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_logical_apicid, cpu); } -static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) +static void init_x2apic_ldr(void) { - unsigned int id; + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; - id = x; - return id; + per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); + + __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); + __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } } -static unsigned long set_apic_id(unsigned int id) + /* + * At CPU state changes, update the x2apic cluster sibling info. + */ +static int __cpuinit +update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned long x; + unsigned int this_cpu = (unsigned long)hcpu; + unsigned int cpu; + int err = 0; + + switch (action) { + case CPU_UP_PREPARE: + if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu), + GFP_KERNEL)) { + err = -ENOMEM; + } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu), + GFP_KERNEL)) { + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + err = -ENOMEM; + } + break; + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DEAD: + for_each_online_cpu(cpu) { + if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) + continue; + __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu)); + __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu)); + } + free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu)); + free_cpumask_var(per_cpu(ipi_mask, this_cpu)); + break; + } - x = id; - return x; + return notifier_from_errno(err); } -static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) -{ - return initial_apicid >> index_msb; -} +static struct notifier_block __refdata x2apic_cpu_notifier = { + .notifier_call = update_clusterinfo, +}; -static void x2apic_send_IPI_self(int vector) +static int x2apic_init_cpu_notifier(void) { - apic_write(APIC_SELF_IPI, vector); + int cpu = smp_processor_id(); + + zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL); + zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL); + + BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); + + __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); + register_hotcpu_notifier(&x2apic_cpu_notifier); + return 1; } -static void init_x2apic_ldr(void) +static int x2apic_cluster_probe(void) { - int cpu = smp_processor_id(); - - per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); + if (x2apic_mode) + return x2apic_init_cpu_notifier(); + else + return 0; } -struct apic apic_x2apic_cluster = { +static struct apic apic_x2apic_cluster = { .name = "cluster x2apic", - .probe = NULL, + .probe = x2apic_cluster_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .apic_id_registered = x2apic_apic_id_registered, @@ -211,11 +235,11 @@ struct apic apic_x2apic_cluster = { .setup_portio_remap = NULL, .check_phys_apicid_present = default_check_phys_apicid_present, .enable_apic_mode = NULL, - .phys_pkg_id = x2apic_cluster_phys_pkg_id, + .phys_pkg_id = x2apic_phys_pkg_id, .mps_oem_check = NULL, - .get_apic_id = x2apic_cluster_phys_get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, @@ -240,3 +264,5 @@ struct apic apic_x2apic_cluster = { .wait_icr_idle = native_x2apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; + +apic_driver(apic_x2apic_cluster); diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c index c7e6d66..f5373df 100644 --- a/arch/x86/kernel/apic/x2apic_phys.c +++ b/arch/x86/kernel/apic/x2apic_phys.c @@ -7,11 +7,12 @@ #include <linux/dmar.h> #include <asm/smp.h> -#include <asm/apic.h> -#include <asm/ipi.h> +#include <asm/x2apic.h> int x2apic_phys; +static struct apic apic_x2apic_phys; + static int set_x2apic_phys_mode(char *arg) { x2apic_phys = 1; @@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) return 0; } -/* - * need to use more than cpu 0, because we need more vectors when - * MSI-X are used. - */ -static const struct cpumask *x2apic_target_cpus(void) -{ - return cpu_online_mask; -} - -static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask) -{ - cpumask_clear(retmask); - cpumask_set_cpu(cpu, retmask); -} - -static void __x2apic_send_IPI_dest(unsigned int apicid, int vector, - unsigned int dest) -{ - unsigned long cfg; - - cfg = __prepare_ICR(0, vector, dest); - - /* - * send the IPI. - */ - native_x2apic_icr_write(cfg, apicid); -} - -static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +static void +__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest) { unsigned long query_cpu; + unsigned long this_cpu; unsigned long flags; x2apic_wrmsr_fence(); local_irq_save(flags); + + this_cpu = smp_processor_id(); for_each_cpu(query_cpu, mask) { + if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu) + continue; __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), vector, APIC_DEST_PHYSICAL); } local_irq_restore(flags); } +static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector) +{ + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC); +} + static void x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_cpu(query_cpu, mask) { - if (query_cpu != this_cpu) - __x2apic_send_IPI_dest( - per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_allbutself(int vector) { - unsigned long this_cpu = smp_processor_id(); - unsigned long query_cpu; - unsigned long flags; - - x2apic_wrmsr_fence(); - - local_irq_save(flags); - for_each_online_cpu(query_cpu) { - if (query_cpu == this_cpu) - continue; - __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), - vector, APIC_DEST_PHYSICAL); - } - local_irq_restore(flags); + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT); } static void x2apic_send_IPI_all(int vector) { - x2apic_send_IPI_mask(cpu_online_mask, vector); -} - -static int x2apic_apic_id_registered(void) -{ - return 1; + __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC); } static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) @@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask, return per_cpu(x86_cpu_to_apicid, cpu); } -static unsigned int x2apic_phys_get_apic_id(unsigned long x) -{ - return x; -} - -static unsigned long set_apic_id(unsigned int id) -{ - return id; -} - -static int x2apic_phys_pkg_id(int initial_apicid, int index_msb) +static void init_x2apic_ldr(void) { - return initial_apicid >> index_msb; } -static void x2apic_send_IPI_self(int vector) +static int x2apic_phys_probe(void) { - apic_write(APIC_SELF_IPI, vector); -} + if (x2apic_mode && x2apic_phys) + return 1; -static void init_x2apic_ldr(void) -{ + return apic == &apic_x2apic_phys; } -struct apic apic_x2apic_phys = { +static struct apic apic_x2apic_phys = { .name = "physical x2apic", - .probe = NULL, + .probe = x2apic_phys_probe, .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, .apic_id_registered = x2apic_apic_id_registered, @@ -203,8 +144,8 @@ struct apic apic_x2apic_phys = { .phys_pkg_id = x2apic_phys_pkg_id, .mps_oem_check = NULL, - .get_apic_id = x2apic_phys_get_apic_id, - .set_apic_id = set_apic_id, + .get_apic_id = x2apic_get_apic_id, + .set_apic_id = x2apic_set_apic_id, .apic_id_mask = 0xFFFFFFFFu, .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, @@ -229,3 +170,5 @@ struct apic apic_x2apic_phys = { .wait_icr_idle = native_x2apic_wait_icr_idle, .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, }; + +apic_driver(apic_x2apic_phys); diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 7acd2d2..f450b68 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -58,6 +58,8 @@ unsigned int uv_apicid_hibits; EXPORT_SYMBOL_GPL(uv_apicid_hibits); static DEFINE_SPINLOCK(uv_nmi_lock); +static struct apic apic_x2apic_uv_x; + static unsigned long __init uv_early_read_mmr(unsigned long addr) { unsigned long val, *mmr; @@ -326,10 +328,15 @@ static void uv_send_IPI_self(int vector) apic_write(APIC_SELF_IPI, vector); } -struct apic __refdata apic_x2apic_uv_x = { +static int uv_probe(void) +{ + return apic == &apic_x2apic_uv_x; +} + +static struct apic __refdata apic_x2apic_uv_x = { .name = "UV large system", - .probe = NULL, + .probe = uv_probe, .acpi_madt_oem_check = uv_acpi_madt_oem_check, .apic_id_registered = uv_apic_id_registered, @@ -859,3 +866,5 @@ void __init uv_system_init(void) if (is_kdump_kernel()) reboot_type = BOOT_ACPI; } + +apic_driver(apic_x2apic_uv_x); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 6f9d1f6..8f5cabb3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -629,10 +629,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 */ u64 mask; + int err; - rdmsrl(MSR_AMD64_MCx_MASK(4), mask); - mask |= (1 << 10); - wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask); + if (err == 0) { + mask |= (1 << 10); + checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); + } } } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index cbc70a2..c8b4162 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -254,7 +254,7 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c) } #endif -static int disable_smep __initdata; +static int disable_smep __cpuinitdata; static __init int setup_disable_smep(char *arg) { disable_smep = 1; @@ -262,7 +262,7 @@ static __init int setup_disable_smep(char *arg) } __setup("nosmep", setup_disable_smep); -static __init void setup_smep(struct cpuinfo_x86 *c) +static __cpuinit void setup_smep(struct cpuinfo_x86 *c) { if (cpu_has(c, X86_FEATURE_SMEP)) { if (unlikely(disable_smep)) { diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index e90f084..690bc84 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -369,6 +369,7 @@ static struct of_ioapic_type of_ioapic_type[] = static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, u32 *out_hwirq, u32 *out_type) { + struct mp_ioapic_gsi *gsi_cfg; struct io_apic_irq_attr attr; struct of_ioapic_type *it; u32 line, idx, type; @@ -378,7 +379,8 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, line = *intspec; idx = (u32) id->priv; - *out_hwirq = line + mp_gsi_routing[idx].gsi_base; + gsi_cfg = mp_ioapic_gsi_routing(idx); + *out_hwirq = line + gsi_cfg->gsi_base; intspec++; type = *intspec; @@ -407,7 +409,7 @@ static void __init ioapic_add_ofnode(struct device_node *np) } for (i = 0; i < nr_ioapics; i++) { - if (r.start == mp_ioapics[i].apicaddr) { + if (r.start == mpc_ioapic_addr(i)) { struct irq_domain *id; id = kzalloc(sizeof(*id), GFP_KERNEL); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 6f9bfff..9103b89 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -285,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type) intsrc.type = MP_INTSRC; intsrc.irqflag = 0; /* conforming */ intsrc.srcbus = 0; - intsrc.dstapic = mp_ioapics[0].apicid; + intsrc.dstapic = mpc_ioapic_id(0); intsrc.irqtype = mp_INT; diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 9ea999a..b49d00d 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -68,74 +68,10 @@ int dma_set_mask(struct device *dev, u64 mask) } EXPORT_SYMBOL(dma_set_mask); -#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA) -static __initdata void *dma32_bootmem_ptr; -static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); - -static int __init parse_dma32_size_opt(char *p) -{ - if (!p) - return -EINVAL; - dma32_bootmem_size = memparse(p, &p); - return 0; -} -early_param("dma32_size", parse_dma32_size_opt); - -void __init dma32_reserve_bootmem(void) -{ - unsigned long size, align; - if (max_pfn <= MAX_DMA32_PFN) - return; - - /* - * check aperture_64.c allocate_aperture() for reason about - * using 512M as goal - */ - align = 64ULL<<20; - size = roundup(dma32_bootmem_size, align); - dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align, - 512ULL<<20); - /* - * Kmemleak should not scan this block as it may not be mapped via the - * kernel direct mapping. - */ - kmemleak_ignore(dma32_bootmem_ptr); - if (dma32_bootmem_ptr) - dma32_bootmem_size = size; - else - dma32_bootmem_size = 0; -} -static void __init dma32_free_bootmem(void) -{ - - if (max_pfn <= MAX_DMA32_PFN) - return; - - if (!dma32_bootmem_ptr) - return; - - free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size); - - dma32_bootmem_ptr = NULL; - dma32_bootmem_size = 0; -} -#else -void __init dma32_reserve_bootmem(void) -{ -} -static void __init dma32_free_bootmem(void) -{ -} - -#endif - void __init pci_iommu_alloc(void) { struct iommu_table_entry *p; - /* free the range so iommu could get some range less than 4G */ - dma32_free_bootmem(); - sort_iommu_table(__iommu_table, __iommu_table_end); check_iommu_entries(__iommu_table, __iommu_table_end); diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index f65e5b5..807c2a2 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -1363,7 +1363,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, * We must return the syscall number to actually look up in the table. * This can be -1L to skip running any syscall at all. */ -asmregparm long syscall_trace_enter(struct pt_regs *regs) +long syscall_trace_enter(struct pt_regs *regs) { long ret = 0; @@ -1408,7 +1408,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs) return ret ?: regs->orig_ax; } -asmregparm void syscall_trace_leave(struct pt_regs *regs) +void syscall_trace_leave(struct pt_regs *regs) { bool step; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c3050af..a3e5948 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -946,6 +946,8 @@ void __init setup_arch(char **cmdline_p) if (init_ohci1394_dma_early) init_ohci1394_dma_on_all_controllers(); #endif + /* Allocate bigger log buffer */ + setup_log_buf(1); reserve_initrd(); @@ -964,7 +966,6 @@ void __init setup_arch(char **cmdline_p) initmem_init(); memblock_find_dma_reserve(); - dma32_reserve_bootmem(); #ifdef CONFIG_KVM_CLOCK kvmclock_init(); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 4fd173c..40a2493 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -601,10 +601,7 @@ long sys_rt_sigreturn(struct pt_regs *regs) goto badframe; sigdelsetmask(&set, ~_BLOCKABLE); - spin_lock_irq(¤t->sighand->siglock); - current->blocked = set; - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + set_current_blocked(&set); if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) goto badframe; @@ -682,6 +679,7 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, sigset_t *oldset, struct pt_regs *regs) { + sigset_t blocked; int ret; /* Are we from a system call? */ @@ -741,12 +739,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - spin_lock_irq(¤t->sighand->siglock); - sigorsets(¤t->blocked, ¤t->blocked, &ka->sa.sa_mask); + sigorsets(&blocked, ¤t->blocked, &ka->sa.sa_mask); if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(¤t->blocked, sig); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + sigaddset(&blocked, sig); + set_current_blocked(&blocked); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP)); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index abce34d..32cbffb 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -344,3 +344,4 @@ ENTRY(sys_call_table) .long sys_open_by_handle_at .long sys_clock_adjtime .long sys_syncfs + .long sys_sendmmsg /* 345 */ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 998e972..30ac65d 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -110,7 +110,6 @@ static struct mm_struct tboot_mm = { .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), - .cpu_vm_mask = CPU_MASK_ALL, }; static inline void switch_to_tboot_pt(void) diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c index 787a5e4..3f92ce0 100644 --- a/arch/x86/kernel/test_nx.c +++ b/arch/x86/kernel/test_nx.c @@ -161,7 +161,7 @@ static int test_NX(void) } #endif - return 0; + return ret; } static void test_exit(void) diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25a28a2..00cbb27 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -23,7 +23,7 @@ #include <asm/time.h> #ifdef CONFIG_X86_64 -volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; +DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES; #endif unsigned long profile_pc(struct pt_regs *regs) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9335bf7..6cc6922 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -763,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs) ret : clocksource_tsc.cycle_last; } -#ifdef CONFIG_X86_64 -static cycle_t __vsyscall_fn vread_tsc(void) -{ - cycle_t ret; - - /* - * Surround the RDTSC by barriers, to make sure it's not - * speculated to outside the seqlock critical section and - * does not cause time warps: - */ - rdtsc_barrier(); - ret = (cycle_t)vget_cycles(); - rdtsc_barrier(); - - return ret >= __vsyscall_gtod_data.clock.cycle_last ? - ret : __vsyscall_gtod_data.clock.cycle_last; -} -#endif - static void resume_tsc(struct clocksource *cs) { clocksource_tsc.cycle_last = 0; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 624a201..89aed99 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -161,6 +161,12 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) +#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \ + ADDR(.vsyscall_0) + offset \ + : AT(VLOAD(.vsyscall_var_ ## x)) { \ + *(.vsyscall_var_ ## x) \ + } \ + x = VVIRT(.vsyscall_var_ ## x); . = ALIGN(4096); __vsyscall_0 = .; @@ -175,18 +181,6 @@ SECTIONS *(.vsyscall_fn) } - . = ALIGN(L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { - *(.vsyscall_gtod_data) - } - - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { - *(.vsyscall_clock) - } - vsyscall_clock = VVIRT(.vsyscall_clock); - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) } @@ -194,21 +188,14 @@ SECTIONS *(.vsyscall_2) } - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { - *(.vgetcpu_mode) - } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { - *(.jiffies) - } - jiffies = VVIRT(.jiffies); - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) } +#define __VVAR_KERNEL_LDS +#include <asm/vvar.h> +#undef __VVAR_KERNEL_LDS + . = __vsyscall_0 + PAGE_SIZE; #undef VSYSCALL_ADDR @@ -216,6 +203,7 @@ SECTIONS #undef VLOAD #undef VVIRT_OFFSET #undef VVIRT +#undef EMIT_VVAR #endif /* CONFIG_X86_64 */ @@ -306,6 +294,13 @@ SECTIONS } . = ALIGN(8); + .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) { + __apicdrivers = .; + *(.apicdrivers); + __apicdrivers_end = .; + } + + . = ALIGN(8); /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame @@ -319,7 +314,7 @@ SECTIONS } #if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) - PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE) + PERCPU_SECTION(INTERNODE_CACHE_BYTES) #endif . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c new file mode 100644 index 0000000..a81aa9e --- /dev/null +++ b/arch/x86/kernel/vread_tsc_64.c @@ -0,0 +1,36 @@ +/* This code runs in userspace. */ + +#define DISABLE_BRANCH_PROFILING +#include <asm/vgtod.h> + +notrace cycle_t __vsyscall_fn vread_tsc(void) +{ + cycle_t ret; + u64 last; + + /* + * Empirically, a fence (of type that depends on the CPU) + * before rdtsc is enough to ensure that rdtsc is ordered + * with respect to loads. The various CPU manuals are unclear + * as to whether rdtsc can be reordered with later loads, + * but no one has ever seen it happen. + */ + rdtsc_barrier(); + ret = (cycle_t)vget_cycles(); + + last = VVAR(vsyscall_gtod_data).clock.cycle_last; + + if (likely(ret >= last)) + return ret; + + /* + * GCC likes to generate cmov here, but this branch is extremely + * predictable (it's just a funciton of time and the likely is + * very likely) and there's a data dependence, so force GCC + * to generate a branch instead. I don't barrier() because + * we don't actually need a barrier, and if this function + * ever gets inlined it will generate worse code. + */ + asm volatile (""); + return last; +} diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index dcbb28c..3e68218 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -49,17 +49,10 @@ __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace #define __syscall_clobber "r11","cx","memory" -/* - * vsyscall_gtod_data contains data that is : - * - readonly from vsyscalls - * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) - * Try to keep this structure as small as possible to avoid cache line ping pongs - */ -int __vgetcpu_mode __section_vgetcpu_mode; - -struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data = +DEFINE_VVAR(int, vgetcpu_mode); +DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { - .lock = SEQLOCK_UNLOCKED, + .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), .sysctl_enabled = 1, }; @@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, */ static __always_inline void do_get_tz(struct timezone * tz) { - *tz = __vsyscall_gtod_data.sys_tz; + *tz = VVAR(vsyscall_gtod_data).sys_tz; } static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) @@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv) unsigned long mult, shift, nsec; cycle_t (*vread)(void); do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); + seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - vread = __vsyscall_gtod_data.clock.vread; - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { + vread = VVAR(vsyscall_gtod_data).clock.vread; + if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled || + !vread)) { gettimeofday(tv,NULL); return; } now = vread(); - base = __vsyscall_gtod_data.clock.cycle_last; - mask = __vsyscall_gtod_data.clock.mask; - mult = __vsyscall_gtod_data.clock.mult; - shift = __vsyscall_gtod_data.clock.shift; + base = VVAR(vsyscall_gtod_data).clock.cycle_last; + mask = VVAR(vsyscall_gtod_data).clock.mask; + mult = VVAR(vsyscall_gtod_data).clock.mult; + shift = VVAR(vsyscall_gtod_data).clock.shift; - tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; - nsec = __vsyscall_gtod_data.wall_time_nsec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec; + nsec = VVAR(vsyscall_gtod_data).wall_time_nsec; + } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); /* calculate interval: */ cycle_delta = (now - base) & mask; @@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t) { unsigned seq; time_t result; - if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) + if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) return time_syscall(t); do { - seq = read_seqbegin(&__vsyscall_gtod_data.lock); + seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock); - result = __vsyscall_gtod_data.wall_time_sec; + result = VVAR(vsyscall_gtod_data).wall_time_sec; - } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq)); if (t) *t = result; @@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) We do this here because otherwise user space would do it on its own in a likely inferior way (no access to jiffies). If you don't like it pass NULL. */ - if (tcache && tcache->blob[0] == (j = __jiffies)) { + if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) { p = tcache->blob[1]; - } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { + } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ native_read_tscp(&p); } else { diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 0ad47b8..d6e2477 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -73,9 +73,14 @@ #define MemAbs (1<<11) /* Memory operand is absolute displacement */ #define String (1<<12) /* String instruction (rep capable) */ #define Stack (1<<13) /* Stack instruction (push/pop) */ +#define GroupMask (7<<14) /* Opcode uses one of the group mechanisms */ #define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ -#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ +#define GroupDual (2<<14) /* Alternate decoding of mod == 3 */ +#define Prefix (3<<14) /* Instruction varies with 66/f2/f3 prefix */ +#define RMExt (4<<14) /* Opcode extension in ModRM r/m if mod == 3 */ +#define Sse (1<<17) /* SSE Vector instruction */ /* Misc flags */ +#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ #define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ #define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ @@ -102,11 +107,14 @@ struct opcode { u32 flags; + u8 intercept; union { int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; struct group_dual *gdual; + struct gprefix *gprefix; } u; + int (*check_perm)(struct x86_emulate_ctxt *ctxt); }; struct group_dual { @@ -114,6 +122,13 @@ struct group_dual { struct opcode mod3[8]; }; +struct gprefix { + struct opcode pfx_no; + struct opcode pfx_66; + struct opcode pfx_f2; + struct opcode pfx_f3; +}; + /* EFLAGS bit definitions. */ #define EFLG_ID (1<<21) #define EFLG_VIP (1<<20) @@ -248,42 +263,42 @@ struct group_dual { "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ - do { \ - unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "5", "2") \ - _op _suffix " %4,%1 \n" \ - _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ - : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ - ); \ - \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ +#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ + do { \ + unsigned long _tmp; \ + _type _clv = (_cl).val; \ + _type _srcv = (_src).val; \ + _type _dstv = (_dst).val; \ + \ + __asm__ __volatile__ ( \ + _PRE_EFLAGS("0", "5", "2") \ + _op _suffix " %4,%1 \n" \ + _POST_EFLAGS("0", "5", "2") \ + : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ + ); \ + \ + (_cl).val = (unsigned long) _clv; \ + (_src).val = (unsigned long) _srcv; \ + (_dst).val = (unsigned long) _dstv; \ } while (0) -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ - do { \ - switch ((_dst).bytes) { \ - case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ - break; \ - case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ - break; \ - case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ - break; \ - } \ +#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ + do { \ + switch ((_dst).bytes) { \ + case 2: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "w", unsigned short); \ + break; \ + case 4: \ + __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "l", unsigned int); \ + break; \ + case 8: \ + ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ + "q", unsigned long)); \ + break; \ + } \ } while (0) #define __emulate_1op(_op, _dst, _eflags, _suffix) \ @@ -346,13 +361,25 @@ struct group_dual { } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ - do { \ - switch((_src).bytes) { \ - case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ - case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ - case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ +#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ + do { \ + switch((_src).bytes) { \ + case 1: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "b"); \ + break; \ + case 2: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "w"); \ + break; \ + case 4: \ + __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "l"); \ + break; \ + case 8: \ + ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ + _eflags, "q")); \ + break; \ } \ } while (0) @@ -388,13 +415,33 @@ struct group_dual { (_type)_x; \ }) -#define insn_fetch_arr(_arr, _size, _eip) \ +#define insn_fetch_arr(_arr, _size, _eip) \ ({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ (_eip) += (_size); \ }) +static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt, + enum x86_intercept intercept, + enum x86_intercept_stage stage) +{ + struct x86_instruction_info info = { + .intercept = intercept, + .rep_prefix = ctxt->decode.rep_prefix, + .modrm_mod = ctxt->decode.modrm_mod, + .modrm_reg = ctxt->decode.modrm_reg, + .modrm_rm = ctxt->decode.modrm_rm, + .src_val = ctxt->decode.src.val64, + .src_bytes = ctxt->decode.src.bytes, + .dst_bytes = ctxt->decode.dst.bytes, + .ad_bytes = ctxt->decode.ad_bytes, + .next_rip = ctxt->eip, + }; + + return ctxt->ops->intercept(ctxt, &info, stage); +} + static inline unsigned long ad_mask(struct decode_cache *c) { return (1UL << (c->ad_bytes << 3)) - 1; @@ -430,6 +477,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel) register_address_increment(c, &c->eip, rel); } +static u32 desc_limit_scaled(struct desc_struct *desc) +{ + u32 limit = get_desc_limit(desc); + + return desc->g ? (limit << 12) | 0xfff : limit; +} + static void set_seg_override(struct decode_cache *c, int seg) { c->has_seg_override = true; @@ -442,11 +496,10 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) return 0; - return ops->get_cached_segment_base(seg, ctxt->vcpu); + return ops->get_cached_segment_base(ctxt, seg); } static unsigned seg_override(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct decode_cache *c) { if (!c->has_seg_override) @@ -455,18 +508,6 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt, return c->seg_override; } -static ulong linear(struct x86_emulate_ctxt *ctxt, - struct segmented_address addr) -{ - struct decode_cache *c = &ctxt->decode; - ulong la; - - la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; - if (c->ad_bytes != 8) - la &= (u32)-1; - return la; -} - static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, u32 error, bool valid) { @@ -476,11 +517,21 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, return X86EMUL_PROPAGATE_FAULT; } +static int emulate_db(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, DB_VECTOR, 0, false); +} + static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) { return emulate_exception(ctxt, GP_VECTOR, err, true); } +static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err) +{ + return emulate_exception(ctxt, SS_VECTOR, err, true); +} + static int emulate_ud(struct x86_emulate_ctxt *ctxt) { return emulate_exception(ctxt, UD_VECTOR, 0, false); @@ -496,6 +547,128 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt) return emulate_exception(ctxt, DE_VECTOR, 0, false); } +static int emulate_nm(struct x86_emulate_ctxt *ctxt) +{ + return emulate_exception(ctxt, NM_VECTOR, 0, false); +} + +static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg) +{ + u16 selector; + struct desc_struct desc; + + ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg); + return selector; +} + +static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector, + unsigned seg) +{ + u16 dummy; + u32 base3; + struct desc_struct desc; + + ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg); + ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg); +} + +static int __linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned size, bool write, bool fetch, + ulong *linear) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_struct desc; + bool usable; + ulong la; + u32 lim; + u16 sel; + unsigned cpl, rpl; + + la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea; + switch (ctxt->mode) { + case X86EMUL_MODE_REAL: + break; + case X86EMUL_MODE_PROT64: + if (((signed long)la << 16) >> 16 != la) + return emulate_gp(ctxt, 0); + break; + default: + usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL, + addr.seg); + if (!usable) + goto bad; + /* code segment or read-only data segment */ + if (((desc.type & 8) || !(desc.type & 2)) && write) + goto bad; + /* unreadable code segment */ + if (!fetch && (desc.type & 8) && !(desc.type & 2)) + goto bad; + lim = desc_limit_scaled(&desc); + if ((desc.type & 8) || !(desc.type & 4)) { + /* expand-up segment */ + if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + goto bad; + } else { + /* exapand-down segment */ + if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim) + goto bad; + lim = desc.d ? 0xffffffff : 0xffff; + if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim) + goto bad; + } + cpl = ctxt->ops->cpl(ctxt); + rpl = sel & 3; + cpl = max(cpl, rpl); + if (!(desc.type & 8)) { + /* data segment */ + if (cpl > desc.dpl) + goto bad; + } else if ((desc.type & 8) && !(desc.type & 4)) { + /* nonconforming code segment */ + if (cpl != desc.dpl) + goto bad; + } else if ((desc.type & 8) && (desc.type & 4)) { + /* conforming code segment */ + if (cpl < desc.dpl) + goto bad; + } + break; + } + if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8) + la &= (u32)-1; + *linear = la; + return X86EMUL_CONTINUE; +bad: + if (addr.seg == VCPU_SREG_SS) + return emulate_ss(ctxt, addr.seg); + else + return emulate_gp(ctxt, addr.seg); +} + +static int linearize(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + unsigned size, bool write, + ulong *linear) +{ + return __linearize(ctxt, addr, size, write, false, linear); +} + + +static int segmented_read_std(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, false, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); +} + static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned long eip, u8 *dest) @@ -505,10 +678,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, int size, cur_size; if (eip == fc->end) { + unsigned long linear; + struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; cur_size = fc->end - fc->start; size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); - rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, - size, ctxt->vcpu, &ctxt->exception); + rc = __linearize(ctxt, addr, size, false, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + rc = ops->fetch(ctxt, linear, fc->data + cur_size, + size, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; fc->end += size; @@ -551,7 +729,6 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs, } static int read_descriptor(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, struct segmented_address addr, u16 *size, unsigned long *address, int op_bytes) { @@ -560,13 +737,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt, if (op_bytes == 2) op_bytes = 3; *address = 0; - rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, - ctxt->vcpu, &ctxt->exception); + rc = segmented_read_std(ctxt, addr, size, 2); if (rc != X86EMUL_CONTINUE) return rc; addr.ea += 2; - rc = ops->read_std(linear(ctxt, addr), address, op_bytes, - ctxt->vcpu, &ctxt->exception); + rc = segmented_read_std(ctxt, addr, address, op_bytes); return rc; } @@ -623,7 +798,63 @@ static void fetch_register_operand(struct operand *op) } } -static void decode_register_operand(struct operand *op, +static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break; + case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break; + case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break; + case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break; + case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break; + case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break; + case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break; + case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break; + case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break; + case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break; + case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break; + case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break; + case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break; + case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break; + case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break; +#endif + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, + int reg) +{ + ctxt->ops->get_fpu(ctxt); + switch (reg) { + case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break; + case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break; + case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break; + case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break; + case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break; + case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break; + case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break; + case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break; +#ifdef CONFIG_X86_64 + case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break; + case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break; + case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break; + case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break; + case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break; + case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break; + case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break; + case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break; +#endif + default: BUG(); + } + ctxt->ops->put_fpu(ctxt); +} + +static void decode_register_operand(struct x86_emulate_ctxt *ctxt, + struct operand *op, struct decode_cache *c, int inhibit_bytereg) { @@ -632,6 +863,15 @@ static void decode_register_operand(struct operand *op, if (!(c->d & ModRM)) reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); + + if (c->d & Sse) { + op->type = OP_XMM; + op->bytes = 16; + op->addr.xmm = reg; + read_sse_reg(ctxt, &op->vec_val, reg); + return; + } + op->type = OP_REG; if ((c->d & ByteOp) && !inhibit_bytereg) { op->addr.reg = decode_register(reg, c->regs, highbyte_regs); @@ -671,6 +911,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; op->addr.reg = decode_register(c->modrm_rm, c->regs, c->d & ByteOp); + if (c->d & Sse) { + op->type = OP_XMM; + op->bytes = 16; + op->addr.xmm = c->modrm_rm; + read_sse_reg(ctxt, &op->vec_val, c->modrm_rm); + return rc; + } fetch_register_operand(op); return rc; } @@ -819,8 +1066,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, if (mc->pos < mc->end) goto read_cached; - rc = ops->read_emulated(addr, mc->data + mc->end, n, - &ctxt->exception, ctxt->vcpu); + rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n, + &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; mc->end += n; @@ -834,6 +1081,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } +static int segmented_read(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, false, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return read_emulated(ctxt, ctxt->ops, linear, data, size); +} + +static int segmented_write(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + const void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->write_emulated(ctxt, linear, data, size, + &ctxt->exception); +} + +static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt, + struct segmented_address addr, + const void *orig_data, const void *data, + unsigned size) +{ + int rc; + ulong linear; + + rc = linearize(ctxt, addr, size, true, &linear); + if (rc != X86EMUL_CONTINUE) + return rc; + return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data, + size, &ctxt->exception); +} + static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, unsigned int size, unsigned short port, @@ -854,7 +1145,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, if (n == 0) n = 1; rc->pos = rc->end = 0; - if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) + if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n)) return 0; rc->end = n * size; } @@ -864,28 +1155,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, return 1; } -static u32 desc_limit_scaled(struct desc_struct *desc) -{ - u32 limit = get_desc_limit(desc); - - return desc->g ? (limit << 12) | 0xfff : limit; -} - static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, u16 selector, struct desc_ptr *dt) { if (selector & 1 << 2) { struct desc_struct desc; + u16 sel; + memset (dt, 0, sizeof *dt); - if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR, - ctxt->vcpu)) + if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR)) return; dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ dt->address = get_desc_base(&desc); } else - ops->get_gdt(dt, ctxt->vcpu); + ops->get_gdt(ctxt, dt); } /* allowed just for 8 bytes segments */ @@ -903,8 +1188,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt, if (dt.size < index * 8 + 7) return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); + ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); return ret; } @@ -925,8 +1209,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt, return emulate_gp(ctxt, selector & 0xfffc); addr = dt.address + index * 8; - ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, - &ctxt->exception); + ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception); return ret; } @@ -986,7 +1269,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, rpl = selector & 3; dpl = seg_desc.dpl; - cpl = ops->cpl(ctxt->vcpu); + cpl = ops->cpl(ctxt); switch (seg) { case VCPU_SREG_SS: @@ -1042,8 +1325,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, return ret; } load: - ops->set_segment_selector(selector, seg, ctxt->vcpu); - ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu); + ops->set_segment(ctxt, selector, &seg_desc, 0, seg); return X86EMUL_CONTINUE; exception: emulate_exception(ctxt, err_vec, err_code, true); @@ -1069,8 +1351,7 @@ static void write_register_operand(struct operand *op) } } -static inline int writeback(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int writeback(struct x86_emulate_ctxt *ctxt) { int rc; struct decode_cache *c = &ctxt->decode; @@ -1081,23 +1362,22 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, break; case OP_MEM: if (c->lock_prefix) - rc = ops->cmpxchg_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.orig_val, - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); + rc = segmented_cmpxchg(ctxt, + c->dst.addr.mem, + &c->dst.orig_val, + &c->dst.val, + c->dst.bytes); else - rc = ops->write_emulated( - linear(ctxt, c->dst.addr.mem), - &c->dst.val, - c->dst.bytes, - &ctxt->exception, - ctxt->vcpu); + rc = segmented_write(ctxt, + c->dst.addr.mem, + &c->dst.val, + c->dst.bytes); if (rc != X86EMUL_CONTINUE) return rc; break; + case OP_XMM: + write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm); + break; case OP_NONE: /* no writeback */ break; @@ -1107,21 +1387,21 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static inline void emulate_push(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_push(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + struct segmented_address addr; - c->dst.type = OP_MEM; - c->dst.bytes = c->op_bytes; - c->dst.val = c->src.val; register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); - c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); - c->dst.addr.mem.seg = VCPU_SREG_SS; + addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); + addr.seg = VCPU_SREG_SS; + + /* Disable writeback. */ + c->dst.type = OP_NONE; + return segmented_write(ctxt, addr, &c->src.val, c->op_bytes); } static int emulate_pop(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, void *dest, int len) { struct decode_cache *c = &ctxt->decode; @@ -1130,7 +1410,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); addr.seg = VCPU_SREG_SS; - rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); + rc = segmented_read(ctxt, addr, dest, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -1138,6 +1418,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt, return rc; } +static int em_pop(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + return emulate_pop(ctxt, &c->dst.val, c->op_bytes); +} + static int emulate_popf(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, void *dest, int len) @@ -1145,9 +1432,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, int rc; unsigned long val, change_mask; int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - int cpl = ops->cpl(ctxt->vcpu); + int cpl = ops->cpl(ctxt); - rc = emulate_pop(ctxt, ops, &val, len); + rc = emulate_pop(ctxt, &val, len); if (rc != X86EMUL_CONTINUE) return rc; @@ -1179,14 +1466,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt, return rc; } -static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops, int seg) +static int em_popf(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); + c->dst.type = OP_REG; + c->dst.addr.reg = &ctxt->eflags; + c->dst.bytes = c->op_bytes; + return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); +} - emulate_push(ctxt, ops); +static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, + struct x86_emulate_ops *ops, int seg) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.val = get_segment_selector(ctxt, seg); + + return em_push(ctxt); } static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, @@ -1196,7 +1493,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, unsigned long selector; int rc; - rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); + rc = emulate_pop(ctxt, &selector, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1204,8 +1501,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, return rc; } -static int emulate_pusha(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_pusha(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long old_esp = c->regs[VCPU_REGS_RSP]; @@ -1216,23 +1512,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt, (reg == VCPU_REGS_RSP) ? (c->src.val = old_esp) : (c->src.val = c->regs[reg]); - emulate_push(ctxt, ops); - - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ++reg; } - /* Disable writeback. */ - c->dst.type = OP_NONE; - return rc; } -static int emulate_popa(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_pushf(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.val = (unsigned long)ctxt->eflags; + return em_push(ctxt); +} + +static int em_popa(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; @@ -1245,7 +1543,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt, --reg; } - rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); + rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes); if (rc != X86EMUL_CONTINUE) break; --reg; @@ -1265,37 +1563,32 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add limit checks */ c->src.val = ctxt->eflags; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); - c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = c->eip; - emulate_push(ctxt, ops); - rc = writeback(ctxt, ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; - c->dst.type = OP_NONE; - - ops->get_idt(&dt, ctxt->vcpu); + ops->get_idt(ctxt, &dt); eip_addr = dt.address + (irq << 2); cs_addr = dt.address + (irq << 2) + 2; - rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); + rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; - rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); + rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception); if (rc != X86EMUL_CONTINUE) return rc; @@ -1339,7 +1632,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, /* TODO: Add stack limit check */ - rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1347,12 +1640,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt, if (temp_eip & ~0xffff) return emulate_gp(ctxt, 0); - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; - rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); + rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; @@ -1394,15 +1687,31 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt, } } -static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_jmp_far(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + unsigned short sel; + + memcpy(&sel, c->src.valptr + c->op_bytes, 2); + + rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS); + if (rc != X86EMUL_CONTINUE) + return rc; + + c->eip = 0; + memcpy(&c->eip, c->src.valptr, c->op_bytes); + return X86EMUL_CONTINUE; +} + +static int em_grp1a(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; - return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); + return emulate_pop(ctxt, &c->dst.val, c->dst.bytes); } -static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) +static int em_grp2(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; switch (c->modrm_reg) { @@ -1429,10 +1738,10 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); break; } + return X86EMUL_CONTINUE; } -static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp3(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; unsigned long *rax = &c->regs[VCPU_REGS_RAX]; @@ -1471,10 +1780,10 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, return X86EMUL_CONTINUE; } -static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp45(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; + int rc = X86EMUL_CONTINUE; switch (c->modrm_reg) { case 0: /* inc */ @@ -1488,21 +1797,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, old_eip = c->eip; c->eip = c->src.val; c->src.val = old_eip; - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } case 4: /* jmp abs */ c->eip = c->src.val; break; + case 5: /* jmp far */ + rc = em_jmp_far(ctxt); + break; case 6: /* push */ - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } - return X86EMUL_CONTINUE; + return rc; } -static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, - struct x86_emulate_ops *ops) +static int em_grp9(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; u64 old = c->dst.orig_val64; @@ -1528,12 +1839,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt, int rc; unsigned long cs; - rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); + rc = emulate_pop(ctxt, &c->eip, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; if (c->op_bytes == 4) c->eip = (u32)c->eip; - rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); + rc = emulate_pop(ctxt, &cs, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); @@ -1562,8 +1873,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops, struct desc_struct *cs, struct desc_struct *ss) { + u16 selector; + memset(cs, 0, sizeof(struct desc_struct)); - ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu); + ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS); memset(ss, 0, sizeof(struct desc_struct)); cs->l = 0; /* will be adjusted later */ @@ -1593,44 +1906,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; + u64 efer = 0; /* syscall is not available in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86) return emulate_ud(ctxt); + ops->get_msr(ctxt, MSR_EFER, &efer); setup_syscalls_segments(ctxt, ops, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt, MSR_STAR, &msr_data); msr_data >>= 32; cs_sel = (u16)(msr_data & 0xfffc); ss_sel = (u16)(msr_data + 8); - if (is_long_mode(ctxt->vcpu)) { + if (efer & EFER_LMA) { cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); c->regs[VCPU_REGS_RCX] = c->eip; - if (is_long_mode(ctxt->vcpu)) { + if (efer & EFER_LMA) { #ifdef CONFIG_X86_64 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; - ops->get_msr(ctxt->vcpu, + ops->get_msr(ctxt, ctxt->mode == X86EMUL_MODE_PROT64 ? MSR_LSTAR : MSR_CSTAR, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); + ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data); ctxt->eflags &= ~(msr_data | EFLG_RF); #endif } else { /* legacy mode */ - ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); + ops->get_msr(ctxt, MSR_STAR, &msr_data); c->eip = (u32)msr_data; ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); @@ -1646,7 +1959,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) struct desc_struct cs, ss; u64 msr_data; u16 cs_sel, ss_sel; + u64 efer = 0; + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); /* inject #GP if in real mode */ if (ctxt->mode == X86EMUL_MODE_REAL) return emulate_gp(ctxt, 0); @@ -1659,7 +1974,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) setup_syscalls_segments(ctxt, ops, &cs, &ss); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (ctxt->mode) { case X86EMUL_MODE_PROT32: if ((msr_data & 0xfffc) == 0x0) @@ -1676,21 +1991,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel &= ~SELECTOR_RPL_MASK; ss_sel = cs_sel + 8; ss_sel &= ~SELECTOR_RPL_MASK; - if (ctxt->mode == X86EMUL_MODE_PROT64 - || is_long_mode(ctxt->vcpu)) { + if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) { cs.d = 0; cs.l = 1; } - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data); c->eip = msr_data; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data); c->regs[VCPU_REGS_RSP] = msr_data; return X86EMUL_CONTINUE; @@ -1719,7 +2031,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs.dpl = 3; ss.dpl = 3; - ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); + ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data); switch (usermode) { case X86EMUL_MODE_PROT32: cs_sel = (u16)(msr_data + 16); @@ -1739,10 +2051,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) cs_sel |= SELECTOR_RPL_MASK; ss_sel |= SELECTOR_RPL_MASK; - ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); - ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu); + ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS); + ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS); c->eip = c->regs[VCPU_REGS_RDX]; c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; @@ -1759,7 +2069,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt, if (ctxt->mode == X86EMUL_MODE_VM86) return true; iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; - return ops->cpl(ctxt->vcpu) > iopl; + return ops->cpl(ctxt) > iopl; } static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, @@ -1769,11 +2079,11 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, struct desc_struct tr_seg; u32 base3; int r; - u16 io_bitmap_ptr, perm, bit_idx = port & 0x7; + u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7; unsigned mask = (1 << len) - 1; unsigned long base; - ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu); + ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR); if (!tr_seg.p) return false; if (desc_limit_scaled(&tr_seg) < 103) @@ -1782,13 +2092,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, #ifdef CONFIG_X86_64 base |= ((u64)base3) << 32; #endif - r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL); + r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL); if (r != X86EMUL_CONTINUE) return false; if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) return false; - r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu, - NULL); + r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL); if (r != X86EMUL_CONTINUE) return false; if ((perm >> bit_idx) & mask) @@ -1829,11 +2138,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt, tss->si = c->regs[VCPU_REGS_RSI]; tss->di = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); + tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); + tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, @@ -1858,11 +2167,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); + set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); /* * Now load segment descriptors. If fault happenes at this stage @@ -1896,7 +2205,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -1904,13 +2213,13 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, save_state_to_tss16(ctxt, ops, &tss_seg); - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -1919,10 +2228,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt, if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(new_tss_base, + ret = ops->write_std(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); + &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -1937,7 +2246,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, { struct decode_cache *c = &ctxt->decode; - tss->cr3 = ops->get_cr(3, ctxt->vcpu); + tss->cr3 = ops->get_cr(ctxt, 3); tss->eip = c->eip; tss->eflags = ctxt->eflags; tss->eax = c->regs[VCPU_REGS_RAX]; @@ -1949,13 +2258,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt, tss->esi = c->regs[VCPU_REGS_RSI]; tss->edi = c->regs[VCPU_REGS_RDI]; - tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); - tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); - tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); - tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); - tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); - tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); - tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); + tss->es = get_segment_selector(ctxt, VCPU_SREG_ES); + tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS); + tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS); + tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS); + tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS); + tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS); + tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR); } static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, @@ -1965,7 +2274,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, struct decode_cache *c = &ctxt->decode; int ret; - if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) + if (ops->set_cr(ctxt, 3, tss->cr3)) return emulate_gp(ctxt, 0); c->eip = tss->eip; ctxt->eflags = tss->eflags | 2; @@ -1982,13 +2291,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, * SDM says that segment selectors are loaded before segment * descriptors */ - ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); - ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); - ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); - ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); - ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); - ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); - ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); + set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR); + set_segment_selector(ctxt, tss->es, VCPU_SREG_ES); + set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS); + set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS); + set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS); + set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS); + set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS); /* * Now load segment descriptors. If fault happenes at this stage @@ -2028,7 +2337,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, int ret; u32 new_tss_base = get_desc_base(new_desc); - ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2036,13 +2345,13 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, save_state_to_tss32(ctxt, ops, &tss_seg); - ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; - ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, + ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg, &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ @@ -2051,10 +2360,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt, if (old_tss_sel != 0xffff) { tss_seg.prev_task_link = old_tss_sel; - ret = ops->write_std(new_tss_base, + ret = ops->write_std(ctxt, new_tss_base, &tss_seg.prev_task_link, sizeof tss_seg.prev_task_link, - ctxt->vcpu, &ctxt->exception); + &ctxt->exception); if (ret != X86EMUL_CONTINUE) /* FIXME: need to provide precise fault address */ return ret; @@ -2070,9 +2379,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, { struct desc_struct curr_tss_desc, next_tss_desc; int ret; - u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); + u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR); ulong old_tss_base = - ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); + ops->get_cached_segment_base(ctxt, VCPU_SREG_TR); u32 desc_limit; /* FIXME: old_tss_base == ~0 ? */ @@ -2088,7 +2397,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, if (reason != TASK_SWITCH_IRET) { if ((tss_selector & 3) > next_tss_desc.dpl || - ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) + ops->cpl(ctxt) > next_tss_desc.dpl) return emulate_gp(ctxt, 0); } @@ -2132,9 +2441,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, &next_tss_desc); } - ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); - ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu); - ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu); + ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS); + ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR); if (has_error_code) { struct decode_cache *c = &ctxt->decode; @@ -2142,7 +2450,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt, c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; c->lock_prefix = 0; c->src.val = (unsigned long) error_code; - emulate_push(ctxt, ops); + ret = em_push(ctxt); } return ret; @@ -2162,13 +2470,10 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt, rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, has_error_code, error_code); - if (rc == X86EMUL_CONTINUE) { - rc = writeback(ctxt, ops); - if (rc == X86EMUL_CONTINUE) - ctxt->eip = c->eip; - } + if (rc == X86EMUL_CONTINUE) + ctxt->eip = c->eip; - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, @@ -2182,12 +2487,6 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, op->addr.mem.seg = seg; } -static int em_push(struct x86_emulate_ctxt *ctxt) -{ - emulate_push(ctxt, ctxt->ops); - return X86EMUL_CONTINUE; -} - static int em_das(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -2234,7 +2533,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) ulong old_eip; int rc; - old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); + old_cs = get_segment_selector(ctxt, VCPU_SREG_CS); old_eip = c->eip; memcpy(&sel, c->src.valptr + c->op_bytes, 2); @@ -2245,20 +2544,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt) memcpy(&c->eip, c->src.valptr, c->op_bytes); c->src.val = old_cs; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); + rc = em_push(ctxt); if (rc != X86EMUL_CONTINUE) return rc; c->src.val = old_eip; - emulate_push(ctxt, ctxt->ops); - rc = writeback(ctxt, ctxt->ops); - if (rc != X86EMUL_CONTINUE) - return rc; - - c->dst.type = OP_NONE; - - return X86EMUL_CONTINUE; + return em_push(ctxt); } static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) @@ -2269,13 +2560,79 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) c->dst.type = OP_REG; c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; - rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); + rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes); if (rc != X86EMUL_CONTINUE) return rc; register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); return X86EMUL_CONTINUE; } +static int em_add(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_or(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_adc(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_sbb(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_and(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_sub(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_xor(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); + return X86EMUL_CONTINUE; +} + +static int em_cmp(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + static int em_imul(struct x86_emulate_ctxt *ctxt) { struct decode_cache *c = &ctxt->decode; @@ -2306,13 +2663,10 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt) static int em_rdtsc(struct x86_emulate_ctxt *ctxt) { - unsigned cpl = ctxt->ops->cpl(ctxt->vcpu); struct decode_cache *c = &ctxt->decode; u64 tsc = 0; - if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) - return emulate_gp(ctxt, 0); - ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc); + ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc); c->regs[VCPU_REGS_RAX] = (u32)tsc; c->regs[VCPU_REGS_RDX] = tsc >> 32; return X86EMUL_CONTINUE; @@ -2325,22 +2679,375 @@ static int em_mov(struct x86_emulate_ctxt *ctxt) return X86EMUL_CONTINUE; } +static int em_movdqu(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes); + return X86EMUL_CONTINUE; +} + +static int em_invlpg(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + ulong linear; + + rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear); + if (rc == X86EMUL_CONTINUE) + ctxt->ops->invlpg(ctxt, linear); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_clts(struct x86_emulate_ctxt *ctxt) +{ + ulong cr0; + + cr0 = ctxt->ops->get_cr(ctxt, 0); + cr0 &= ~X86_CR0_TS; + ctxt->ops->set_cr(ctxt, 0, cr0); + return X86EMUL_CONTINUE; +} + +static int em_vmcall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + if (c->modrm_mod != 3 || c->modrm_rm != 1) + return X86EMUL_UNHANDLEABLE; + + rc = ctxt->ops->fix_hypercall(ctxt); + if (rc != X86EMUL_CONTINUE) + return rc; + + /* Let the processor re-execute the fixed hypercall */ + c->eip = ctxt->eip; + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_lgdt(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_ptr desc_ptr; + int rc; + + rc = read_descriptor(ctxt, c->src.addr.mem, + &desc_ptr.size, &desc_ptr.address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->set_gdt(ctxt, &desc_ptr); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_vmmcall(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int rc; + + rc = ctxt->ops->fix_hypercall(ctxt); + + /* Disable writeback. */ + c->dst.type = OP_NONE; + return rc; +} + +static int em_lidt(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + struct desc_ptr desc_ptr; + int rc; + + rc = read_descriptor(ctxt, c->src.addr.mem, + &desc_ptr.size, &desc_ptr.address, + c->op_bytes); + if (rc != X86EMUL_CONTINUE) + return rc; + ctxt->ops->set_idt(ctxt, &desc_ptr); + /* Disable writeback. */ + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static int em_smsw(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.bytes = 2; + c->dst.val = ctxt->ops->get_cr(ctxt, 0); + return X86EMUL_CONTINUE; +} + +static int em_lmsw(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul) + | (c->src.val & 0x0f)); + c->dst.type = OP_NONE; + return X86EMUL_CONTINUE; +} + +static bool valid_cr(int nr) +{ + switch (nr) { + case 0: + case 2 ... 4: + case 8: + return true; + default: + return false; + } +} + +static int check_cr_read(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + if (!valid_cr(c->modrm_reg)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_cr_write(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u64 new_val = c->src.val64; + int cr = c->modrm_reg; + u64 efer = 0; + + static u64 cr_reserved_bits[] = { + 0xffffffff00000000ULL, + 0, 0, 0, /* CR3 checked later */ + CR4_RESERVED_BITS, + 0, 0, 0, + CR8_RESERVED_BITS, + }; + + if (!valid_cr(cr)) + return emulate_ud(ctxt); + + if (new_val & cr_reserved_bits[cr]) + return emulate_gp(ctxt, 0); + + switch (cr) { + case 0: { + u64 cr4; + if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) || + ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD))) + return emulate_gp(ctxt, 0); + + cr4 = ctxt->ops->get_cr(ctxt, 4); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if ((new_val & X86_CR0_PG) && (efer & EFER_LME) && + !(cr4 & X86_CR4_PAE)) + return emulate_gp(ctxt, 0); + + break; + } + case 3: { + u64 rsvd = 0; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + if (efer & EFER_LMA) + rsvd = CR3_L_MODE_RESERVED_BITS; + else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE) + rsvd = CR3_PAE_RESERVED_BITS; + else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG) + rsvd = CR3_NONPAE_RESERVED_BITS; + + if (new_val & rsvd) + return emulate_gp(ctxt, 0); + + break; + } + case 4: { + u64 cr4; + + cr4 = ctxt->ops->get_cr(ctxt, 4); + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE)) + return emulate_gp(ctxt, 0); + + break; + } + } + + return X86EMUL_CONTINUE; +} + +static int check_dr7_gd(struct x86_emulate_ctxt *ctxt) +{ + unsigned long dr7; + + ctxt->ops->get_dr(ctxt, 7, &dr7); + + /* Check if DR7.Global_Enable is set */ + return dr7 & (1 << 13); +} + +static int check_dr_read(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + int dr = c->modrm_reg; + u64 cr4; + + if (dr > 7) + return emulate_ud(ctxt); + + cr4 = ctxt->ops->get_cr(ctxt, 4); + if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5)) + return emulate_ud(ctxt); + + if (check_dr7_gd(ctxt)) + return emulate_db(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_dr_write(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + u64 new_val = c->src.val64; + int dr = c->modrm_reg; + + if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL)) + return emulate_gp(ctxt, 0); + + return check_dr_read(ctxt); +} + +static int check_svme(struct x86_emulate_ctxt *ctxt) +{ + u64 efer; + + ctxt->ops->get_msr(ctxt, MSR_EFER, &efer); + + if (!(efer & EFER_SVME)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_svme_pa(struct x86_emulate_ctxt *ctxt) +{ + u64 rax = ctxt->decode.regs[VCPU_REGS_RAX]; + + /* Valid physical address? */ + if (rax & 0xffff000000000000ULL) + return emulate_gp(ctxt, 0); + + return check_svme(ctxt); +} + +static int check_rdtsc(struct x86_emulate_ctxt *ctxt) +{ + u64 cr4 = ctxt->ops->get_cr(ctxt, 4); + + if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt)) + return emulate_ud(ctxt); + + return X86EMUL_CONTINUE; +} + +static int check_rdpmc(struct x86_emulate_ctxt *ctxt) +{ + u64 cr4 = ctxt->ops->get_cr(ctxt, 4); + u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX]; + + if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) || + (rcx > 3)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int check_perm_in(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->dst.bytes = min(c->dst.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + +static int check_perm_out(struct x86_emulate_ctxt *ctxt) +{ + struct decode_cache *c = &ctxt->decode; + + c->src.bytes = min(c->src.bytes, 4u); + if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes)) + return emulate_gp(ctxt, 0); + + return X86EMUL_CONTINUE; +} + #define D(_y) { .flags = (_y) } +#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i } +#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \ + .check_perm = (_p) } #define N D(0) +#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) } #define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } -#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } +#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) } #define I(_f, _e) { .flags = (_f), .u.execute = (_e) } +#define II(_f, _e, _i) \ + { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i } +#define IIP(_f, _e, _i, _p) \ + { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \ + .check_perm = (_p) } +#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) } #define D2bv(_f) D((_f) | ByteOp), D(_f) +#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p) #define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) -#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ - D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ - D2bv(((_f) & ~Lock) | DstAcc | SrcImm) +#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \ + I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \ + I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e) +static struct opcode group7_rm1[] = { + DI(SrcNone | ModRM | Priv, monitor), + DI(SrcNone | ModRM | Priv, mwait), + N, N, N, N, N, N, +}; + +static struct opcode group7_rm3[] = { + DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa), + II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall), + DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa), + DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa), + DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme), + DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme), +}; + +static struct opcode group7_rm7[] = { + N, + DIP(SrcNone | ModRM, rdtscp, check_rdtsc), + N, N, N, N, N, N, +}; static struct opcode group1[] = { - X7(D(Lock)), N + I(Lock, em_add), + I(Lock, em_or), + I(Lock, em_adc), + I(Lock, em_sbb), + I(Lock, em_and), + I(Lock, em_sub), + I(Lock, em_xor), + I(0, em_cmp), }; static struct opcode group1A[] = { @@ -2366,16 +3073,28 @@ static struct opcode group5[] = { D(SrcMem | ModRM | Stack), N, }; +static struct opcode group6[] = { + DI(ModRM | Prot, sldt), + DI(ModRM | Prot, str), + DI(ModRM | Prot | Priv, lldt), + DI(ModRM | Prot | Priv, ltr), + N, N, N, N, +}; + static struct group_dual group7 = { { - N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), - D(SrcMem | ModRM | ByteOp | Priv | NoAccess), + DI(ModRM | Mov | DstMem | Priv, sgdt), + DI(ModRM | Mov | DstMem | Priv, sidt), + II(ModRM | SrcMem | Priv, em_lgdt, lgdt), + II(ModRM | SrcMem | Priv, em_lidt, lidt), + II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), + II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg), }, { - D(SrcNone | ModRM | Priv | VendorSpecific), N, - N, D(SrcNone | ModRM | Priv | VendorSpecific), - D(SrcNone | ModRM | DstMem | Mov), N, - D(SrcMem16 | ModRM | Mov | Priv), N, + I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall), + EXT(0, group7_rm1), + N, EXT(0, group7_rm3), + II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N, + II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7), } }; static struct opcode group8[] = { @@ -2394,35 +3113,40 @@ static struct opcode group11[] = { I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), }; +static struct gprefix pfx_0f_6f_0f_7f = { + N, N, N, I(Sse, em_movdqu), +}; + static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ - D6ALU(Lock), + I6ALU(Lock, em_add), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x08 - 0x0F */ - D6ALU(Lock), + I6ALU(Lock, em_or), D(ImplicitOps | Stack | No64), N, /* 0x10 - 0x17 */ - D6ALU(Lock), + I6ALU(Lock, em_adc), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x18 - 0x1F */ - D6ALU(Lock), + I6ALU(Lock, em_sbb), D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), /* 0x20 - 0x27 */ - D6ALU(Lock), N, N, + I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ - D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), + I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das), /* 0x30 - 0x37 */ - D6ALU(Lock), N, N, + I6ALU(Lock, em_xor), N, N, /* 0x38 - 0x3F */ - D6ALU(0), N, N, + I6ALU(0, em_cmp), N, N, /* 0x40 - 0x4F */ X16(D(DstReg)), /* 0x50 - 0x57 */ X8(I(SrcReg | Stack, em_push)), /* 0x58 - 0x5F */ - X8(D(DstReg | Stack)), + X8(I(DstReg | Stack, em_pop)), /* 0x60 - 0x67 */ - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + I(ImplicitOps | Stack | No64, em_pusha), + I(ImplicitOps | Stack | No64, em_popa), N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , N, N, N, N, /* 0x68 - 0x6F */ @@ -2430,8 +3154,8 @@ static struct opcode opcode_table[256] = { I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), I(SrcImmByte | Mov | Stack, em_push), I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), - D2bv(DstDI | Mov | String), /* insb, insw/insd */ - D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ + D2bvIP(DstDI | Mov | String, ins, check_perm_in), /* insb, insw/insd */ + D2bvIP(SrcSI | ImplicitOps | String, outs, check_perm_out), /* outsb, outsw/outsd */ /* 0x70 - 0x7F */ X16(D(SrcImmByte)), /* 0x80 - 0x87 */ @@ -2446,21 +3170,22 @@ static struct opcode opcode_table[256] = { D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), /* 0x90 - 0x97 */ - X8(D(SrcAcc | DstReg)), + DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)), /* 0x98 - 0x9F */ D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), I(SrcImmFAddr | No64, em_call_far), N, - D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, + II(ImplicitOps | Stack, em_pushf, pushf), + II(ImplicitOps | Stack, em_popf, popf), N, N, /* 0xA0 - 0xA7 */ I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), I2bv(SrcSI | DstDI | Mov | String, em_mov), - D2bv(SrcSI | DstDI | String), + I2bv(SrcSI | DstDI | String, em_cmp), /* 0xA8 - 0xAF */ D2bv(DstAcc | SrcImm), I2bv(SrcAcc | DstDI | Mov | String, em_mov), I2bv(SrcSI | DstAcc | Mov | String, em_mov), - D2bv(SrcAcc | DstDI | String), + I2bv(SrcAcc | DstDI | String, em_cmp), /* 0xB0 - 0xB7 */ X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), /* 0xB8 - 0xBF */ @@ -2473,7 +3198,8 @@ static struct opcode opcode_table[256] = { G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, D(ImplicitOps | Stack), - D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), + D(ImplicitOps), DI(SrcImmByte, intn), + D(ImplicitOps | No64), DI(ImplicitOps, iret), /* 0xD0 - 0xD7 */ D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), N, N, N, N, @@ -2481,14 +3207,17 @@ static struct opcode opcode_table[256] = { N, N, N, N, N, N, N, N, /* 0xE0 - 0xE7 */ X4(D(SrcImmByte)), - D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), + D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in), + D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out), /* 0xE8 - 0xEF */ D(SrcImm | Stack), D(SrcImm | ImplicitOps), D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), - D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), + D2bvIP(SrcNone | DstAcc, in, check_perm_in), + D2bvIP(SrcAcc | ImplicitOps, out, check_perm_out), /* 0xF0 - 0xF7 */ - N, N, N, N, - D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), + N, DI(ImplicitOps, icebp), N, N, + DI(ImplicitOps | Priv, hlt), D(ImplicitOps), + G(ByteOp, group3), G(0, group3), /* 0xF8 - 0xFF */ D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), @@ -2496,20 +3225,24 @@ static struct opcode opcode_table[256] = { static struct opcode twobyte_table[256] = { /* 0x00 - 0x0F */ - N, GD(0, &group7), N, N, - N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, - D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, + G(0, group6), GD(0, &group7), N, N, + N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N, + DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N, N, D(ImplicitOps | ModRM), N, N, /* 0x10 - 0x1F */ N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, /* 0x20 - 0x2F */ - D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), - D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), + DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read), + DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read), + DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write), + DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write), N, N, N, N, N, N, N, N, N, N, N, N, /* 0x30 - 0x3F */ - D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), - D(ImplicitOps | Priv), N, + DI(ImplicitOps | Priv, wrmsr), + IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), + DI(ImplicitOps | Priv, rdmsr), + DIP(ImplicitOps | Priv, rdpmc, check_rdpmc), D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), N, N, N, N, N, N, N, N, N, N, @@ -2518,21 +3251,27 @@ static struct opcode twobyte_table[256] = { /* 0x50 - 0x5F */ N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, /* 0x60 - 0x6F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x70 - 0x7F */ - N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, N, + N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f), /* 0x80 - 0x8F */ X16(D(SrcImm)), /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp), + DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ D(ImplicitOps | Stack), D(ImplicitOps | Stack), - N, D(DstMem | SrcReg | ModRM | BitOp | Lock), + DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), @@ -2564,10 +3303,13 @@ static struct opcode twobyte_table[256] = { #undef G #undef GD #undef I +#undef GP +#undef EXT #undef D2bv +#undef D2bvIP #undef I2bv -#undef D6ALU +#undef I6ALU static unsigned imm_size(struct decode_cache *c) { @@ -2625,8 +3367,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) struct decode_cache *c = &ctxt->decode; int rc = X86EMUL_CONTINUE; int mode = ctxt->mode; - int def_op_bytes, def_ad_bytes, dual, goffset; - struct opcode opcode, *g_mod012, *g_mod3; + int def_op_bytes, def_ad_bytes, goffset, simd_prefix; + bool op_prefix = false; + struct opcode opcode; struct operand memop = { .type = OP_NONE }; c->eip = ctxt->eip; @@ -2634,7 +3377,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) c->fetch.end = c->fetch.start + insn_len; if (insn_len > 0) memcpy(c->fetch.data, insn, insn_len); - ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS); switch (mode) { case X86EMUL_MODE_REAL: @@ -2662,6 +3404,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) for (;;) { switch (c->b = insn_fetch(u8, 1, c->eip)) { case 0x66: /* operand-size override */ + op_prefix = true; /* switch between 2/4 bytes */ c->op_bytes = def_op_bytes ^ 6; break; @@ -2692,10 +3435,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) c->lock_prefix = 1; break; case 0xf2: /* REPNE/REPNZ */ - c->rep_prefix = REPNE_PREFIX; - break; case 0xf3: /* REP/REPE/REPZ */ - c->rep_prefix = REPE_PREFIX; + c->rep_prefix = c->b; break; default: goto done_prefixes; @@ -2722,29 +3463,49 @@ done_prefixes: } c->d = opcode.flags; - if (c->d & Group) { - dual = c->d & GroupDual; - c->modrm = insn_fetch(u8, 1, c->eip); - --c->eip; - - if (c->d & GroupDual) { - g_mod012 = opcode.u.gdual->mod012; - g_mod3 = opcode.u.gdual->mod3; - } else - g_mod012 = g_mod3 = opcode.u.group; - - c->d &= ~(Group | GroupDual); - - goffset = (c->modrm >> 3) & 7; + while (c->d & GroupMask) { + switch (c->d & GroupMask) { + case Group: + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + goffset = (c->modrm >> 3) & 7; + opcode = opcode.u.group[goffset]; + break; + case GroupDual: + c->modrm = insn_fetch(u8, 1, c->eip); + --c->eip; + goffset = (c->modrm >> 3) & 7; + if ((c->modrm >> 6) == 3) + opcode = opcode.u.gdual->mod3[goffset]; + else + opcode = opcode.u.gdual->mod012[goffset]; + break; + case RMExt: + goffset = c->modrm & 7; + opcode = opcode.u.group[goffset]; + break; + case Prefix: + if (c->rep_prefix && op_prefix) + return X86EMUL_UNHANDLEABLE; + simd_prefix = op_prefix ? 0x66 : c->rep_prefix; + switch (simd_prefix) { + case 0x00: opcode = opcode.u.gprefix->pfx_no; break; + case 0x66: opcode = opcode.u.gprefix->pfx_66; break; + case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break; + case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break; + } + break; + default: + return X86EMUL_UNHANDLEABLE; + } - if ((c->modrm >> 6) == 3) - opcode = g_mod3[goffset]; - else - opcode = g_mod012[goffset]; + c->d &= ~GroupMask; c->d |= opcode.flags; } c->execute = opcode.u.execute; + c->check_perm = opcode.check_perm; + c->intercept = opcode.intercept; /* Unrecognised? */ if (c->d == 0 || (c->d & Undefined)) @@ -2763,6 +3524,9 @@ done_prefixes: c->op_bytes = 4; } + if (c->d & Sse) + c->op_bytes = 16; + /* ModRM and SIB bytes. */ if (c->d & ModRM) { rc = decode_modrm(ctxt, ops, &memop); @@ -2776,7 +3540,7 @@ done_prefixes: if (!c->has_seg_override) set_seg_override(c, VCPU_SREG_DS); - memop.addr.mem.seg = seg_override(ctxt, ops, c); + memop.addr.mem.seg = seg_override(ctxt, c); if (memop.type == OP_MEM && c->ad_bytes != 8) memop.addr.mem.ea = (u32)memop.addr.mem.ea; @@ -2792,7 +3556,7 @@ done_prefixes: case SrcNone: break; case SrcReg: - decode_register_operand(&c->src, c, 0); + decode_register_operand(ctxt, &c->src, c, 0); break; case SrcMem16: memop.bytes = 2; @@ -2836,7 +3600,7 @@ done_prefixes: c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; c->src.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSI]); - c->src.addr.mem.seg = seg_override(ctxt, ops, c), + c->src.addr.mem.seg = seg_override(ctxt, c); c->src.val = 0; break; case SrcImmFAddr: @@ -2883,7 +3647,7 @@ done_prefixes: /* Decode and fetch the destination operand: register or memory. */ switch (c->d & DstMask) { case DstReg: - decode_register_operand(&c->dst, c, + decode_register_operand(ctxt, &c->dst, c, c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); break; case DstImmUByte: @@ -2926,7 +3690,7 @@ done_prefixes: } done: - return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) @@ -2979,12 +3743,51 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) goto done; } + if ((c->d & Sse) + && ((ops->get_cr(ctxt, 0) & X86_CR0_EM) + || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) { + rc = emulate_ud(ctxt); + goto done; + } + + if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) { + rc = emulate_nm(ctxt); + goto done; + } + + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_PRE_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + /* Privileged instruction can be executed only in CPL=0 */ - if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { + if ((c->d & Priv) && ops->cpl(ctxt)) { rc = emulate_gp(ctxt, 0); goto done; } + /* Instruction can only be executed in protected mode */ + if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) { + rc = emulate_ud(ctxt); + goto done; + } + + /* Do instruction specific permission checks */ + if (c->check_perm) { + rc = c->check_perm(ctxt); + if (rc != X86EMUL_CONTINUE) + goto done; + } + + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_POST_EXCEPT); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (c->rep_prefix && (c->d & String)) { /* All REP prefixes have the same first termination condition */ if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { @@ -2994,16 +3797,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) } if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), - c->src.valptr, c->src.bytes); + rc = segmented_read(ctxt, c->src.addr.mem, + c->src.valptr, c->src.bytes); if (rc != X86EMUL_CONTINUE) goto done; c->src.orig_val64 = c->src.val64; } if (c->src2.type == OP_MEM) { - rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), - &c->src2.val, c->src2.bytes); + rc = segmented_read(ctxt, c->src2.addr.mem, + &c->src2.val, c->src2.bytes); if (rc != X86EMUL_CONTINUE) goto done; } @@ -3014,7 +3817,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { /* optimisation - avoid slow emulated read if Mov */ - rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), + rc = segmented_read(ctxt, c->dst.addr.mem, &c->dst.val, c->dst.bytes); if (rc != X86EMUL_CONTINUE) goto done; @@ -3023,6 +3826,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt) special_insn: + if (unlikely(ctxt->guest_mode) && c->intercept) { + rc = emulator_check_intercept(ctxt, c->intercept, + X86_ICPT_POST_MEMACCESS); + if (rc != X86EMUL_CONTINUE) + goto done; + } + if (c->execute) { rc = c->execute(ctxt); if (rc != X86EMUL_CONTINUE) @@ -3034,75 +3844,33 @@ special_insn: goto twobyte_insn; switch (c->b) { - case 0x00 ... 0x05: - add: /* add */ - emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); - break; case 0x06: /* push es */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); break; case 0x07: /* pop es */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); break; - case 0x08 ... 0x0d: - or: /* or */ - emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); - break; case 0x0e: /* push cs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); - break; - case 0x10 ... 0x15: - adc: /* adc */ - emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); break; case 0x16: /* push ss */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); break; case 0x17: /* pop ss */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); break; - case 0x18 ... 0x1d: - sbb: /* sbb */ - emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); - break; case 0x1e: /* push ds */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); break; case 0x1f: /* pop ds */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); break; - case 0x20 ... 0x25: - and: /* and */ - emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); - break; - case 0x28 ... 0x2d: - sub: /* sub */ - emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags); - break; - case 0x30 ... 0x35: - xor: /* xor */ - emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags); - break; - case 0x38 ... 0x3d: - cmp: /* cmp */ - emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags); - break; case 0x40 ... 0x47: /* inc r16/r32 */ emulate_1op("inc", c->dst, ctxt->eflags); break; case 0x48 ... 0x4f: /* dec r16/r32 */ emulate_1op("dec", c->dst, ctxt->eflags); break; - case 0x58 ... 0x5f: /* pop reg */ - pop_instruction: - rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes); - break; - case 0x60: /* pusha */ - rc = emulate_pusha(ctxt, ops); - break; - case 0x61: /* popa */ - rc = emulate_popa(ctxt, ops); - break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) goto cannot_emulate; @@ -3121,26 +3889,6 @@ special_insn: if (test_cc(c->b, ctxt->eflags)) jmp_rel(c, c->src.val); break; - case 0x80 ... 0x83: /* Grp1 */ - switch (c->modrm_reg) { - case 0: - goto add; - case 1: - goto or; - case 2: - goto adc; - case 3: - goto sbb; - case 4: - goto and; - case 5: - goto sub; - case 6: - goto xor; - case 7: - goto cmp; - } - break; case 0x84 ... 0x85: test: emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); @@ -3162,7 +3910,7 @@ special_insn: rc = emulate_ud(ctxt); goto done; } - c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); + c->dst.val = get_segment_selector(ctxt, c->modrm_reg); break; case 0x8d: /* lea r16/r32, m */ c->dst.val = c->src.addr.mem.ea; @@ -3187,7 +3935,7 @@ special_insn: break; } case 0x8f: /* pop (sole member of Grp1a) */ - rc = emulate_grp1a(ctxt, ops); + rc = em_grp1a(ctxt); break; case 0x90 ... 0x97: /* nop / xchg reg, rax */ if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) @@ -3200,31 +3948,17 @@ special_insn: case 8: c->dst.val = (s32)c->dst.val; break; } break; - case 0x9c: /* pushf */ - c->src.val = (unsigned long) ctxt->eflags; - emulate_push(ctxt, ops); - break; - case 0x9d: /* popf */ - c->dst.type = OP_REG; - c->dst.addr.reg = &ctxt->eflags; - c->dst.bytes = c->op_bytes; - rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes); - break; - case 0xa6 ... 0xa7: /* cmps */ - c->dst.type = OP_NONE; /* Disable writeback. */ - goto cmp; case 0xa8 ... 0xa9: /* test ax, imm */ goto test; - case 0xae ... 0xaf: /* scas */ - goto cmp; case 0xc0 ... 0xc1: - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xc3: /* ret */ c->dst.type = OP_REG; c->dst.addr.reg = &c->eip; c->dst.bytes = c->op_bytes; - goto pop_instruction; + rc = em_pop(ctxt); + break; case 0xc4: /* les */ rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); break; @@ -3252,11 +3986,11 @@ special_insn: rc = emulate_iret(ctxt, ops); break; case 0xd0 ... 0xd1: /* Grp2 */ - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xd2 ... 0xd3: /* Grp2 */ c->src.val = c->regs[VCPU_REGS_RCX]; - emulate_grp2(ctxt); + rc = em_grp2(ctxt); break; case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); @@ -3278,23 +4012,14 @@ special_insn: long int rel = c->src.val; c->src.val = (unsigned long) c->eip; jmp_rel(c, rel); - emulate_push(ctxt, ops); + rc = em_push(ctxt); break; } case 0xe9: /* jmp rel */ goto jmp; - case 0xea: { /* jmp far */ - unsigned short sel; - jump_far: - memcpy(&sel, c->src.valptr + c->op_bytes, 2); - - if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS)) - goto done; - - c->eip = 0; - memcpy(&c->eip, c->src.valptr, c->op_bytes); + case 0xea: /* jmp far */ + rc = em_jmp_far(ctxt); break; - } case 0xeb: jmp: /* jmp rel short */ jmp_rel(c, c->src.val); @@ -3304,11 +4029,6 @@ special_insn: case 0xed: /* in (e/r)ax,dx */ c->src.val = c->regs[VCPU_REGS_RDX]; do_io_in: - c->dst.bytes = min(c->dst.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; - } if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, &c->dst.val)) goto done; /* IO is needed */ @@ -3317,25 +4037,19 @@ special_insn: case 0xef: /* out dx,(e/r)ax */ c->dst.val = c->regs[VCPU_REGS_RDX]; do_io_out: - c->src.bytes = min(c->src.bytes, 4u); - if (!emulator_io_permited(ctxt, ops, c->dst.val, - c->src.bytes)) { - rc = emulate_gp(ctxt, 0); - goto done; - } - ops->pio_out_emulated(c->src.bytes, c->dst.val, - &c->src.val, 1, ctxt->vcpu); + ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val, + &c->src.val, 1); c->dst.type = OP_NONE; /* Disable writeback. */ break; case 0xf4: /* hlt */ - ctxt->vcpu->arch.halt_request = 1; + ctxt->ops->halt(ctxt); break; case 0xf5: /* cmc */ /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; break; case 0xf6 ... 0xf7: /* Grp3 */ - rc = emulate_grp3(ctxt, ops); + rc = em_grp3(ctxt); break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; @@ -3366,13 +4080,11 @@ special_insn: ctxt->eflags |= EFLG_DF; break; case 0xfe: /* Grp4 */ - grp45: - rc = emulate_grp45(ctxt, ops); + rc = em_grp45(ctxt); break; case 0xff: /* Grp5 */ - if (c->modrm_reg == 5) - goto jump_far; - goto grp45; + rc = em_grp45(ctxt); + break; default: goto cannot_emulate; } @@ -3381,7 +4093,7 @@ special_insn: goto done; writeback: - rc = writeback(ctxt, ops); + rc = writeback(ctxt); if (rc != X86EMUL_CONTINUE) goto done; @@ -3392,7 +4104,7 @@ writeback: c->dst.type = saved_dst_type; if ((c->d & SrcMask) == SrcSI) - string_addr_inc(ctxt, seg_override(ctxt, ops, c), + string_addr_inc(ctxt, seg_override(ctxt, c), VCPU_REGS_RSI, &c->src); if ((c->d & DstMask) == DstDI) @@ -3427,115 +4139,34 @@ writeback: done: if (rc == X86EMUL_PROPAGATE_FAULT) ctxt->have_exception = true; + if (rc == X86EMUL_INTERCEPTED) + return EMULATION_INTERCEPTED; + return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; twobyte_insn: switch (c->b) { - case 0x01: /* lgdt, lidt, lmsw */ - switch (c->modrm_reg) { - u16 size; - unsigned long address; - - case 0: /* vmcall */ - if (c->modrm_mod != 3 || c->modrm_rm != 1) - goto cannot_emulate; - - rc = kvm_fix_hypercall(ctxt->vcpu); - if (rc != X86EMUL_CONTINUE) - goto done; - - /* Let the processor re-execute the fixed hypercall */ - c->eip = ctxt->eip; - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 2: /* lgdt */ - rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &size, &address, c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - realmode_lgdt(ctxt->vcpu, size, address); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 3: /* lidt/vmmcall */ - if (c->modrm_mod == 3) { - switch (c->modrm_rm) { - case 1: - rc = kvm_fix_hypercall(ctxt->vcpu); - break; - default: - goto cannot_emulate; - } - } else { - rc = read_descriptor(ctxt, ops, c->src.addr.mem, - &size, &address, - c->op_bytes); - if (rc != X86EMUL_CONTINUE) - goto done; - realmode_lidt(ctxt->vcpu, size, address); - } - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - case 4: /* smsw */ - c->dst.bytes = 2; - c->dst.val = ops->get_cr(0, ctxt->vcpu); - break; - case 6: /* lmsw */ - ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) | - (c->src.val & 0x0f), ctxt->vcpu); - c->dst.type = OP_NONE; - break; - case 5: /* not defined */ - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - case 7: /* invlpg*/ - emulate_invlpg(ctxt->vcpu, - linear(ctxt, c->src.addr.mem)); - /* Disable writeback. */ - c->dst.type = OP_NONE; - break; - default: - goto cannot_emulate; - } - break; case 0x05: /* syscall */ rc = emulate_syscall(ctxt, ops); break; case 0x06: - emulate_clts(ctxt->vcpu); + rc = em_clts(ctxt); break; case 0x09: /* wbinvd */ - kvm_emulate_wbinvd(ctxt->vcpu); + (ctxt->ops->wbinvd)(ctxt); break; case 0x08: /* invd */ case 0x0d: /* GrpP (prefetch) */ case 0x18: /* Grp16 (prefetch/nop) */ break; case 0x20: /* mov cr, reg */ - switch (c->modrm_reg) { - case 1: - case 5 ... 7: - case 9 ... 15: - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu); + c->dst.val = ops->get_cr(ctxt, c->modrm_reg); break; case 0x21: /* mov from dr to reg */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu); + ops->get_dr(ctxt, c->modrm_reg, &c->dst.val); break; case 0x22: /* mov reg, cr */ - if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { + if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3543,16 +4174,9 @@ twobyte_insn: c->dst.type = OP_NONE; break; case 0x23: /* mov from reg to dr */ - if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && - (c->modrm_reg == 4 || c->modrm_reg == 5)) { - emulate_ud(ctxt); - rc = X86EMUL_PROPAGATE_FAULT; - goto done; - } - - if (ops->set_dr(c->modrm_reg, c->src.val & + if (ops->set_dr(ctxt, c->modrm_reg, c->src.val & ((ctxt->mode == X86EMUL_MODE_PROT64) ? - ~0ULL : ~0U), ctxt->vcpu) < 0) { + ~0ULL : ~0U)) < 0) { /* #UD condition is already handled by the code above */ emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; @@ -3565,7 +4189,7 @@ twobyte_insn: /* wrmsr */ msr_data = (u32)c->regs[VCPU_REGS_RAX] | ((u64)c->regs[VCPU_REGS_RDX] << 32); - if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { + if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3574,7 +4198,7 @@ twobyte_insn: break; case 0x32: /* rdmsr */ - if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { + if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) { emulate_gp(ctxt, 0); rc = X86EMUL_PROPAGATE_FAULT; goto done; @@ -3603,7 +4227,7 @@ twobyte_insn: c->dst.val = test_cc(c->b, ctxt->eflags); break; case 0xa0: /* push fs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); break; case 0xa1: /* pop fs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); @@ -3620,7 +4244,7 @@ twobyte_insn: emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); break; case 0xa8: /* push gs */ - emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); + rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); break; case 0xa9: /* pop gs */ rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); @@ -3727,7 +4351,7 @@ twobyte_insn: (u64) c->src.val; break; case 0xc7: /* Grp9 (cmpxchg8b) */ - rc = emulate_grp9(ctxt, ops); + rc = em_grp9(ctxt); break; default: goto cannot_emulate; @@ -3739,5 +4363,5 @@ twobyte_insn: goto writeback; cannot_emulate: - return -1; + return EMULATION_FAILED; } diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h index 46d08ca..51a9742 100644 --- a/arch/x86/kvm/i8254.h +++ b/arch/x86/kvm/i8254.h @@ -33,7 +33,6 @@ struct kvm_kpit_state { }; struct kvm_pit { - unsigned long base_addresss; struct kvm_io_device dev; struct kvm_io_device speaker_dev; struct kvm *kvm; @@ -51,7 +50,6 @@ struct kvm_pit { #define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 #define KVM_PIT_CHANNEL_MASK 0x3 -void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu); void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); void kvm_free_pit(struct kvm *kvm); diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index ba910d1..53e2d08 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm); void kvm_destroy_pic(struct kvm *kvm); int kvm_pic_read_irq(struct kvm *kvm); void kvm_pic_update_irq(struct kvm_pic *s); -void kvm_pic_clear_isr_ack(struct kvm *kvm); static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) { @@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); void __kvm_migrate_timers(struct kvm_vcpu *vcpu); -int pit_has_pending_timer(struct kvm_vcpu *vcpu); int apic_has_pending_timer(struct kvm_vcpu *vcpu); #endif diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 22fae75..bd14bb4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -1206,7 +1206,7 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva) static void nonpaging_update_pte(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, u64 *spte, - const void *pte, unsigned long mmu_seq) + const void *pte) { WARN_ON(1); } @@ -3163,9 +3163,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, } static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, - struct kvm_mmu_page *sp, - u64 *spte, - const void *new, unsigned long mmu_seq) + struct kvm_mmu_page *sp, u64 *spte, + const void *new) { if (sp->role.level != PT_PAGE_TABLE_LEVEL) { ++vcpu->kvm->stat.mmu_pde_zapped; @@ -3173,7 +3172,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, } ++vcpu->kvm->stat.mmu_pte_updated; - vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq); + vcpu->arch.mmu.update_pte(vcpu, sp, spte, new); } static bool need_remote_flush(u64 old, u64 new) @@ -3229,7 +3228,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, struct kvm_mmu_page *sp; struct hlist_node *node; LIST_HEAD(invalid_list); - unsigned long mmu_seq; u64 entry, gentry, *spte; unsigned pte_size, page_offset, misaligned, quadrant, offset; int level, npte, invlpg_counter, r, flooded = 0; @@ -3271,9 +3269,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, break; } - mmu_seq = vcpu->kvm->mmu_notifier_seq; - smp_rmb(); - spin_lock(&vcpu->kvm->mmu_lock); if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) gentry = 0; @@ -3345,8 +3340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, if (gentry && !((sp->role.word ^ vcpu->arch.mmu.base_role.word) & mask.word)) - mmu_pte_write_new_pte(vcpu, sp, spte, &gentry, - mmu_seq); + mmu_pte_write_new_pte(vcpu, sp, spte, &gentry); if (!remote_flush && need_remote_flush(entry, *spte)) remote_flush = true; ++spte; @@ -3551,10 +3545,11 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm, return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); } -static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) +static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc) { struct kvm *kvm; struct kvm *kvm_freed = NULL; + int nr_to_scan = sc->nr_to_scan; if (nr_to_scan == 0) goto out; diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index c639779..6c4dc01 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -78,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; } -static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, - gfn_t table_gfn, unsigned index, - pt_element_t orig_pte, pt_element_t new_pte) +static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, + pt_element_t __user *ptep_user, unsigned index, + pt_element_t orig_pte, pt_element_t new_pte) { + int npages; pt_element_t ret; pt_element_t *table; struct page *page; - page = gfn_to_page(kvm, table_gfn); + npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page); + /* Check if the user is doing something meaningless. */ + if (unlikely(npages != 1)) + return -EFAULT; table = kmap_atomic(page, KM_USER0); ret = CMPXCHG(&table[index], orig_pte, new_pte); @@ -117,6 +121,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gva_t addr, u32 access) { pt_element_t pte; + pt_element_t __user *ptep_user; gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; @@ -152,6 +157,9 @@ walk: pt_access = ACC_ALL; for (;;) { + gfn_t real_gfn; + unsigned long host_addr; + index = PT_INDEX(addr, walker->level); table_gfn = gpte_to_gfn(pte); @@ -160,43 +168,64 @@ walk: walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; - if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, - offset, sizeof(pte), - PFERR_USER_MASK|PFERR_WRITE_MASK)) { + real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn), + PFERR_USER_MASK|PFERR_WRITE_MASK); + if (unlikely(real_gfn == UNMAPPED_GVA)) { + present = false; + break; + } + real_gfn = gpa_to_gfn(real_gfn); + + host_addr = gfn_to_hva(vcpu->kvm, real_gfn); + if (unlikely(kvm_is_error_hva(host_addr))) { + present = false; + break; + } + + ptep_user = (pt_element_t __user *)((void *)host_addr + offset); + if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) { present = false; break; } trace_kvm_mmu_paging_element(pte, walker->level); - if (!is_present_gpte(pte)) { + if (unlikely(!is_present_gpte(pte))) { present = false; break; } - if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { + if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte, + walker->level))) { rsvd_fault = true; break; } - if (write_fault && !is_writable_pte(pte)) - if (user_fault || is_write_protection(vcpu)) - eperm = true; + if (unlikely(write_fault && !is_writable_pte(pte) + && (user_fault || is_write_protection(vcpu)))) + eperm = true; - if (user_fault && !(pte & PT_USER_MASK)) + if (unlikely(user_fault && !(pte & PT_USER_MASK))) eperm = true; #if PTTYPE == 64 - if (fetch_fault && (pte & PT64_NX_MASK)) + if (unlikely(fetch_fault && (pte & PT64_NX_MASK))) eperm = true; #endif - if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { + if (!eperm && !rsvd_fault + && unlikely(!(pte & PT_ACCESSED_MASK))) { + int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, sizeof(pte)); - if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, - index, pte, pte|PT_ACCESSED_MASK)) + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, + pte, pte|PT_ACCESSED_MASK); + if (unlikely(ret < 0)) { + present = false; + break; + } else if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_ACCESSED_MASK; } @@ -241,17 +270,21 @@ walk: --walker->level; } - if (!present || eperm || rsvd_fault) + if (unlikely(!present || eperm || rsvd_fault)) goto error; - if (write_fault && !is_dirty_gpte(pte)) { - bool ret; + if (write_fault && unlikely(!is_dirty_gpte(pte))) { + int ret; trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); - ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, - pte|PT_DIRTY_MASK); - if (ret) + ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index, + pte, pte|PT_DIRTY_MASK); + if (unlikely(ret < 0)) { + present = false; + goto error; + } else if (ret) goto walk; + mark_page_dirty(vcpu->kvm, table_gfn); pte |= PT_DIRTY_MASK; walker->ptes[walker->level - 1] = pte; @@ -325,7 +358,7 @@ no_present: } static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, - u64 *spte, const void *pte, unsigned long mmu_seq) + u64 *spte, const void *pte) { pt_element_t gpte; unsigned pte_access; @@ -342,8 +375,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, kvm_release_pfn_clean(pfn); return; } - if (mmu_notifier_retry(vcpu, mmu_seq)) - return; /* * we call mmu_set_spte() with host_writable = true because that diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 6bb15d5..506e4fe 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -63,6 +63,10 @@ MODULE_LICENSE("GPL"); #define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) +#define TSC_RATIO_RSVD 0xffffff0000000000ULL +#define TSC_RATIO_MIN 0x0000000000000001ULL +#define TSC_RATIO_MAX 0x000000ffffffffffULL + static bool erratum_383_found __read_mostly; static const u32 host_save_user_msrs[] = { @@ -93,14 +97,6 @@ struct nested_state { /* A VMEXIT is required but not yet emulated */ bool exit_required; - /* - * If we vmexit during an instruction emulation we need this to restore - * the l1 guest rip after the emulation - */ - unsigned long vmexit_rip; - unsigned long vmexit_rsp; - unsigned long vmexit_rax; - /* cache for intercepts of the guest */ u32 intercept_cr; u32 intercept_dr; @@ -144,8 +140,13 @@ struct vcpu_svm { unsigned int3_injected; unsigned long int3_rip; u32 apf_reason; + + u64 tsc_ratio; }; +static DEFINE_PER_CPU(u64, current_tsc_ratio); +#define TSC_RATIO_DEFAULT 0x0100000000ULL + #define MSR_INVALID 0xffffffffU static struct svm_direct_access_msrs { @@ -190,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); +static u64 __scale_tsc(u64 ratio, u64 tsc); enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, @@ -376,7 +378,6 @@ struct svm_cpu_data { }; static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); -static uint32_t svm_features; struct svm_init_data { int cpu; @@ -569,6 +570,10 @@ static int has_svm(void) static void svm_hardware_disable(void *garbage) { + /* Make sure we clean up behind us */ + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + cpu_svm_disable(); } @@ -610,6 +615,11 @@ static int svm_hardware_enable(void *garbage) wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT); + __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT; + } + svm_init_erratum_383(); return 0; @@ -791,6 +801,23 @@ static __init int svm_hardware_setup(void) if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) kvm_enable_efer_bits(EFER_FFXSR); + if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { + u64 max; + + kvm_has_tsc_control = true; + + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated needed because it will always + * be 1 on all machines and a value of 0 is used to disable + * tsc-scaling for the vcpu. + */ + max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); + + kvm_max_guest_tsc_khz = max; + } + if (nested) { printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); @@ -802,8 +829,6 @@ static __init int svm_hardware_setup(void) goto err; } - svm_features = cpuid_edx(SVM_CPUID_FUNC); - if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; @@ -854,6 +879,64 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } +static u64 __scale_tsc(u64 ratio, u64 tsc) +{ + u64 mult, frac, _tsc; + + mult = ratio >> 32; + frac = ratio & ((1ULL << 32) - 1); + + _tsc = tsc; + _tsc *= mult; + _tsc += (tsc >> 32) * frac; + _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; + + return _tsc; +} + +static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 _tsc = tsc; + + if (svm->tsc_ratio != TSC_RATIO_DEFAULT) + _tsc = __scale_tsc(svm->tsc_ratio, tsc); + + return _tsc; +} + +static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 ratio; + u64 khz; + + /* TSC scaling supported? */ + if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) + return; + + /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */ + if (user_tsc_khz == 0) { + vcpu->arch.virtual_tsc_khz = 0; + svm->tsc_ratio = TSC_RATIO_DEFAULT; + return; + } + + khz = user_tsc_khz; + + /* TSC scaling required - calculate ratio */ + ratio = khz << 32; + do_div(ratio, tsc_khz); + + if (ratio == 0 || ratio & TSC_RATIO_RSVD) { + WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return; + } + vcpu->arch.virtual_tsc_khz = user_tsc_khz; + svm->tsc_ratio = ratio; +} + static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { struct vcpu_svm *svm = to_svm(vcpu); @@ -880,6 +963,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } +static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = svm_scale_tsc(vcpu, native_read_tsc()); + + return target_tsc - tsc; +} + static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -975,7 +1067,7 @@ static void init_vmcb(struct vcpu_svm *svm) svm_set_efer(&svm->vcpu, 0); save->dr6 = 0xffff0ff0; save->dr7 = 0x400; - save->rflags = 2; + kvm_set_rflags(&svm->vcpu, 2); save->rip = 0x0000fff0; svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; @@ -1048,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } + svm->tsc_ratio = TSC_RATIO_DEFAULT; + err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -1141,6 +1235,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); + + if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && + svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) { + __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio; + wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + } } static void svm_vcpu_put(struct kvm_vcpu *vcpu) @@ -1365,31 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_svm *svm = to_svm(vcpu); - if (is_guest_mode(vcpu)) { - /* - * We are here because we run in nested mode, the host kvm - * intercepts cr0 writes but the l1 hypervisor does not. - * But the L1 hypervisor may intercept selective cr0 writes. - * This needs to be checked here. - */ - unsigned long old, new; - - /* Remove bits that would trigger a real cr0 write intercept */ - old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK; - new = cr0 & SVM_CR0_SELECTIVE_MASK; - - if (old == new) { - /* cr0 write with ts and mp unchanged */ - svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; - if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) { - svm->nested.vmexit_rip = kvm_rip_read(vcpu); - svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP); - svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX); - return; - } - } - } - #ifdef CONFIG_X86_64 if (vcpu->arch.efer & EFER_LME) { if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { @@ -2127,7 +2202,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); nested_vmcb->save.cr2 = vmcb->save.cr2; nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; - nested_vmcb->save.rflags = vmcb->save.rflags; + nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu); nested_vmcb->save.rip = vmcb->save.rip; nested_vmcb->save.rsp = vmcb->save.rsp; nested_vmcb->save.rax = vmcb->save.rax; @@ -2184,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) svm->vmcb->save.ds = hsave->save.ds; svm->vmcb->save.gdtr = hsave->save.gdtr; svm->vmcb->save.idtr = hsave->save.idtr; - svm->vmcb->save.rflags = hsave->save.rflags; + kvm_set_rflags(&svm->vcpu, hsave->save.rflags); svm_set_efer(&svm->vcpu, hsave->save.efer); svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); svm_set_cr4(&svm->vcpu, hsave->save.cr4); @@ -2312,7 +2387,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) hsave->save.efer = svm->vcpu.arch.efer; hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); hsave->save.cr4 = svm->vcpu.arch.cr4; - hsave->save.rflags = vmcb->save.rflags; + hsave->save.rflags = kvm_get_rflags(&svm->vcpu); hsave->save.rip = kvm_rip_read(&svm->vcpu); hsave->save.rsp = vmcb->save.rsp; hsave->save.rax = vmcb->save.rax; @@ -2323,7 +2398,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) copy_vmcb_control_area(hsave, vmcb); - if (svm->vmcb->save.rflags & X86_EFLAGS_IF) + if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF) svm->vcpu.arch.hflags |= HF_HIF_MASK; else svm->vcpu.arch.hflags &= ~HF_HIF_MASK; @@ -2341,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm) svm->vmcb->save.ds = nested_vmcb->save.ds; svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; svm->vmcb->save.idtr = nested_vmcb->save.idtr; - svm->vmcb->save.rflags = nested_vmcb->save.rflags; + kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags); svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); @@ -2443,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + nested_svm_vmloadsave(nested_vmcb, svm->vmcb); nested_svm_unmap(page); @@ -2464,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm) if (nested_svm_check_permissions(svm)) return 1; - svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; - skip_emulated_instruction(&svm->vcpu); - nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); if (!nested_vmcb) return 1; + svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; + skip_emulated_instruction(&svm->vcpu); + nested_svm_vmloadsave(svm->vmcb, nested_vmcb); nested_svm_unmap(page); @@ -2676,6 +2751,29 @@ static int emulate_on_interception(struct vcpu_svm *svm) return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; } +bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val) +{ + unsigned long cr0 = svm->vcpu.arch.cr0; + bool ret = false; + u64 intercept; + + intercept = svm->nested.intercept; + + if (!is_guest_mode(&svm->vcpu) || + (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))) + return false; + + cr0 &= ~SVM_CR0_SELECTIVE_MASK; + val &= ~SVM_CR0_SELECTIVE_MASK; + + if (cr0 ^ val) { + svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; + ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); + } + + return ret; +} + #define CR_VALID (1ULL << 63) static int cr_interception(struct vcpu_svm *svm) @@ -2699,7 +2797,11 @@ static int cr_interception(struct vcpu_svm *svm) val = kvm_register_read(&svm->vcpu, reg); switch (cr) { case 0: - err = kvm_set_cr0(&svm->vcpu, val); + if (!check_selective_cr0_intercepted(svm, val)) + err = kvm_set_cr0(&svm->vcpu, val); + else + return 1; + break; case 3: err = kvm_set_cr3(&svm->vcpu, val); @@ -2744,23 +2846,6 @@ static int cr_interception(struct vcpu_svm *svm) return 1; } -static int cr0_write_interception(struct vcpu_svm *svm) -{ - struct kvm_vcpu *vcpu = &svm->vcpu; - int r; - - r = cr_interception(svm); - - if (svm->nested.vmexit_rip) { - kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip); - kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp); - kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax); - svm->nested.vmexit_rip = 0; - } - - return r; -} - static int dr_interception(struct vcpu_svm *svm) { int reg, dr; @@ -2813,7 +2898,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) case MSR_IA32_TSC: { struct vmcb *vmcb = get_host_vmcb(svm); - *data = vmcb->control.tsc_offset + native_read_tsc(); + *data = vmcb->control.tsc_offset + + svm_scale_tsc(vcpu, native_read_tsc()); + break; } case MSR_STAR: @@ -3048,7 +3135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_READ_CR4] = cr_interception, [SVM_EXIT_READ_CR8] = cr_interception, [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, - [SVM_EXIT_WRITE_CR0] = cr0_write_interception, + [SVM_EXIT_WRITE_CR0] = cr_interception, [SVM_EXIT_WRITE_CR3] = cr_interception, [SVM_EXIT_WRITE_CR4] = cr_interception, [SVM_EXIT_WRITE_CR8] = cr8_write_interception, @@ -3104,97 +3191,109 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_NPF] = pf_interception, }; -void dump_vmcb(struct kvm_vcpu *vcpu) +static void dump_vmcb(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; pr_err("VMCB Control Area:\n"); - pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); - pr_err("cr_write: %04x\n", control->intercept_cr >> 16); - pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); - pr_err("dr_write: %04x\n", control->intercept_dr >> 16); - pr_err("exceptions: %08x\n", control->intercept_exceptions); - pr_err("intercepts: %016llx\n", control->intercept); - pr_err("pause filter count: %d\n", control->pause_filter_count); - pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); - pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); - pr_err("tsc_offset: %016llx\n", control->tsc_offset); - pr_err("asid: %d\n", control->asid); - pr_err("tlb_ctl: %d\n", control->tlb_ctl); - pr_err("int_ctl: %08x\n", control->int_ctl); - pr_err("int_vector: %08x\n", control->int_vector); - pr_err("int_state: %08x\n", control->int_state); - pr_err("exit_code: %08x\n", control->exit_code); - pr_err("exit_info1: %016llx\n", control->exit_info_1); - pr_err("exit_info2: %016llx\n", control->exit_info_2); - pr_err("exit_int_info: %08x\n", control->exit_int_info); - pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); - pr_err("nested_ctl: %lld\n", control->nested_ctl); - pr_err("nested_cr3: %016llx\n", control->nested_cr3); - pr_err("event_inj: %08x\n", control->event_inj); - pr_err("event_inj_err: %08x\n", control->event_inj_err); - pr_err("lbr_ctl: %lld\n", control->lbr_ctl); - pr_err("next_rip: %016llx\n", control->next_rip); + pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff); + pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16); + pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff); + pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16); + pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions); + pr_err("%-20s%016llx\n", "intercepts:", control->intercept); + pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count); + pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa); + pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa); + pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset); + pr_err("%-20s%d\n", "asid:", control->asid); + pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl); + pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl); + pr_err("%-20s%08x\n", "int_vector:", control->int_vector); + pr_err("%-20s%08x\n", "int_state:", control->int_state); + pr_err("%-20s%08x\n", "exit_code:", control->exit_code); + pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1); + pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2); + pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info); + pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err); + pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl); + pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3); + pr_err("%-20s%08x\n", "event_inj:", control->event_inj); + pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err); + pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl); + pr_err("%-20s%016llx\n", "next_rip:", control->next_rip); pr_err("VMCB State Save Area:\n"); - pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", - save->es.selector, save->es.attrib, - save->es.limit, save->es.base); - pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", - save->cs.selector, save->cs.attrib, - save->cs.limit, save->cs.base); - pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", - save->ss.selector, save->ss.attrib, - save->ss.limit, save->ss.base); - pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", - save->ds.selector, save->ds.attrib, - save->ds.limit, save->ds.base); - pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", - save->fs.selector, save->fs.attrib, - save->fs.limit, save->fs.base); - pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", - save->gs.selector, save->gs.attrib, - save->gs.limit, save->gs.base); - pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->gdtr.selector, save->gdtr.attrib, - save->gdtr.limit, save->gdtr.base); - pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->ldtr.selector, save->ldtr.attrib, - save->ldtr.limit, save->ldtr.base); - pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", - save->idtr.selector, save->idtr.attrib, - save->idtr.limit, save->idtr.base); - pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", - save->tr.selector, save->tr.attrib, - save->tr.limit, save->tr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "es:", + save->es.selector, save->es.attrib, + save->es.limit, save->es.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "cs:", + save->cs.selector, save->cs.attrib, + save->cs.limit, save->cs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ss:", + save->ss.selector, save->ss.attrib, + save->ss.limit, save->ss.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ds:", + save->ds.selector, save->ds.attrib, + save->ds.limit, save->ds.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "fs:", + save->fs.selector, save->fs.attrib, + save->fs.limit, save->fs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "gs:", + save->gs.selector, save->gs.attrib, + save->gs.limit, save->gs.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "gdtr:", + save->gdtr.selector, save->gdtr.attrib, + save->gdtr.limit, save->gdtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "ldtr:", + save->ldtr.selector, save->ldtr.attrib, + save->ldtr.limit, save->ldtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "idtr:", + save->idtr.selector, save->idtr.attrib, + save->idtr.limit, save->idtr.base); + pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n", + "tr:", + save->tr.selector, save->tr.attrib, + save->tr.limit, save->tr.base); pr_err("cpl: %d efer: %016llx\n", save->cpl, save->efer); - pr_err("cr0: %016llx cr2: %016llx\n", - save->cr0, save->cr2); - pr_err("cr3: %016llx cr4: %016llx\n", - save->cr3, save->cr4); - pr_err("dr6: %016llx dr7: %016llx\n", - save->dr6, save->dr7); - pr_err("rip: %016llx rflags: %016llx\n", - save->rip, save->rflags); - pr_err("rsp: %016llx rax: %016llx\n", - save->rsp, save->rax); - pr_err("star: %016llx lstar: %016llx\n", - save->star, save->lstar); - pr_err("cstar: %016llx sfmask: %016llx\n", - save->cstar, save->sfmask); - pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", - save->kernel_gs_base, save->sysenter_cs); - pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", - save->sysenter_esp, save->sysenter_eip); - pr_err("gpat: %016llx dbgctl: %016llx\n", - save->g_pat, save->dbgctl); - pr_err("br_from: %016llx br_to: %016llx\n", - save->br_from, save->br_to); - pr_err("excp_from: %016llx excp_to: %016llx\n", - save->last_excp_from, save->last_excp_to); - + pr_err("%-15s %016llx %-13s %016llx\n", + "cr0:", save->cr0, "cr2:", save->cr2); + pr_err("%-15s %016llx %-13s %016llx\n", + "cr3:", save->cr3, "cr4:", save->cr4); + pr_err("%-15s %016llx %-13s %016llx\n", + "dr6:", save->dr6, "dr7:", save->dr7); + pr_err("%-15s %016llx %-13s %016llx\n", + "rip:", save->rip, "rflags:", save->rflags); + pr_err("%-15s %016llx %-13s %016llx\n", + "rsp:", save->rsp, "rax:", save->rax); + pr_err("%-15s %016llx %-13s %016llx\n", + "star:", save->star, "lstar:", save->lstar); + pr_err("%-15s %016llx %-13s %016llx\n", + "cstar:", save->cstar, "sfmask:", save->sfmask); + pr_err("%-15s %016llx %-13s %016llx\n", + "kernel_gs_base:", save->kernel_gs_base, + "sysenter_cs:", save->sysenter_cs); + pr_err("%-15s %016llx %-13s %016llx\n", + "sysenter_esp:", save->sysenter_esp, + "sysenter_eip:", save->sysenter_eip); + pr_err("%-15s %016llx %-13s %016llx\n", + "gpat:", save->g_pat, "dbgctl:", save->dbgctl); + pr_err("%-15s %016llx %-13s %016llx\n", + "br_from:", save->br_from, "br_to:", save->br_to); + pr_err("%-15s %016llx %-13s %016llx\n", + "excp_from:", save->last_excp_from, + "excp_to:", save->last_excp_to); } static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) @@ -3384,7 +3483,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) return 0; - ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); + ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF); if (is_guest_mode(vcpu)) return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); @@ -3871,6 +3970,186 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu) update_cr0_intercept(svm); } +#define PRE_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_PRE_EXCEPT, } +#define POST_EX(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_POST_EXCEPT, } +#define POST_MEM(exit) { .exit_code = (exit), \ + .stage = X86_ICPT_POST_MEMACCESS, } + +static struct __x86_intercept { + u32 exit_code; + enum x86_intercept_stage stage; +} x86_intercept_map[] = { + [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), + [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), + [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), + [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), + [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), + [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), + [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), + [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), + [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), + [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), + [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), + [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), + [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), + [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), + [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), + [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), + [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), + [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), + [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), + [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), + [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), + [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), + [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), + [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), + [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), + [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), + [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), + [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), + [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), + [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), + [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), + [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), + [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), + [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), + [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), + [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), + [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), + [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), + [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), + [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), + [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), + [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), +}; + +#undef PRE_EX +#undef POST_EX +#undef POST_MEM + +static int svm_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + struct vcpu_svm *svm = to_svm(vcpu); + int vmexit, ret = X86EMUL_CONTINUE; + struct __x86_intercept icpt_info; + struct vmcb *vmcb = svm->vmcb; + + if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) + goto out; + + icpt_info = x86_intercept_map[info->intercept]; + + if (stage != icpt_info.stage) + goto out; + + switch (icpt_info.exit_code) { + case SVM_EXIT_READ_CR0: + if (info->intercept == x86_intercept_cr_read) + icpt_info.exit_code += info->modrm_reg; + break; + case SVM_EXIT_WRITE_CR0: { + unsigned long cr0, val; + u64 intercept; + + if (info->intercept == x86_intercept_cr_write) + icpt_info.exit_code += info->modrm_reg; + + if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0) + break; + + intercept = svm->nested.intercept; + + if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))) + break; + + cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; + val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; + + if (info->intercept == x86_intercept_lmsw) { + cr0 &= 0xfUL; + val &= 0xfUL; + /* lmsw can't clear PE - catch this here */ + if (cr0 & X86_CR0_PE) + val |= X86_CR0_PE; + } + + if (cr0 ^ val) + icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; + + break; + } + case SVM_EXIT_READ_DR0: + case SVM_EXIT_WRITE_DR0: + icpt_info.exit_code += info->modrm_reg; + break; + case SVM_EXIT_MSR: + if (info->intercept == x86_intercept_wrmsr) + vmcb->control.exit_info_1 = 1; + else + vmcb->control.exit_info_1 = 0; + break; + case SVM_EXIT_PAUSE: + /* + * We get this for NOP only, but pause + * is rep not, check this here + */ + if (info->rep_prefix != REPE_PREFIX) + goto out; + case SVM_EXIT_IOIO: { + u64 exit_info; + u32 bytes; + + exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16; + + if (info->intercept == x86_intercept_in || + info->intercept == x86_intercept_ins) { + exit_info |= SVM_IOIO_TYPE_MASK; + bytes = info->src_bytes; + } else { + bytes = info->dst_bytes; + } + + if (info->intercept == x86_intercept_outs || + info->intercept == x86_intercept_ins) + exit_info |= SVM_IOIO_STR_MASK; + + if (info->rep_prefix) + exit_info |= SVM_IOIO_REP_MASK; + + bytes = min(bytes, 4u); + + exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; + + exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); + + vmcb->control.exit_info_1 = exit_info; + vmcb->control.exit_info_2 = info->next_rip; + + break; + } + default: + break; + } + + vmcb->control.next_rip = info->next_rip; + vmcb->control.exit_code = icpt_info.exit_code; + vmexit = nested_svm_exit_handled(svm); + + ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED + : X86EMUL_CONTINUE; + +out: + return ret; +} + static struct kvm_x86_ops svm_x86_ops = { .cpu_has_kvm_support = has_svm, .disabled_by_bios = is_disabled, @@ -3952,10 +4231,14 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, + .set_tsc_khz = svm_set_tsc_khz, .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, + .compute_tsc_offset = svm_compute_tsc_offset, .set_tdp_cr3 = set_tdp_cr3, + + .check_intercept = svm_check_intercept, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5b4cdcb..4c3fa0f 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -128,8 +128,11 @@ struct vcpu_vmx { unsigned long host_rsp; int launched; u8 fail; + u8 cpl; + bool nmi_known_unmasked; u32 exit_intr_info; u32 idt_vectoring_info; + ulong rflags; struct shared_msr_entry *guest_msrs; int nmsrs; int save_nmsrs; @@ -159,6 +162,10 @@ struct vcpu_vmx { u32 ar; } tr, es, ds, fs, gs; } rmode; + struct { + u32 bitmask; /* 4 bits per segment (1 bit per field) */ + struct kvm_save_segment seg[8]; + } segment_cache; int vpid; bool emulation_required; @@ -171,6 +178,15 @@ struct vcpu_vmx { bool rdtscp_enabled; }; +enum segment_cache_field { + SEG_FIELD_SEL = 0, + SEG_FIELD_BASE = 1, + SEG_FIELD_LIMIT = 2, + SEG_FIELD_AR = 3, + + SEG_FIELD_NR = 4 +}; + static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) { return container_of(vcpu, struct vcpu_vmx, vcpu); @@ -643,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask) vmcs_writel(field, vmcs_readl(field) | mask); } +static void vmx_segment_cache_clear(struct vcpu_vmx *vmx) +{ + vmx->segment_cache.bitmask = 0; +} + +static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg, + unsigned field) +{ + bool ret; + u32 mask = 1 << (seg * SEG_FIELD_NR + field); + + if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) { + vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS); + vmx->segment_cache.bitmask = 0; + } + ret = vmx->segment_cache.bitmask & mask; + vmx->segment_cache.bitmask |= mask; + return ret; +} + +static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg) +{ + u16 *p = &vmx->segment_cache.seg[seg].selector; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL)) + *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector); + return *p; +} + +static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg) +{ + ulong *p = &vmx->segment_cache.seg[seg].base; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE)) + *p = vmcs_readl(kvm_vmx_segment_fields[seg].base); + return *p; +} + +static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].limit; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit); + return *p; +} + +static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg) +{ + u32 *p = &vmx->segment_cache.seg[seg].ar; + + if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR)) + *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes); + return *p; +} + static void update_exception_bitmap(struct kvm_vcpu *vcpu) { u32 eb; @@ -970,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) { unsigned long rflags, save_rflags; - rflags = vmcs_readl(GUEST_RFLAGS); - if (to_vmx(vcpu)->rmode.vm86_active) { - rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; - save_rflags = to_vmx(vcpu)->rmode.save_rflags; - rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + rflags = vmcs_readl(GUEST_RFLAGS); + if (to_vmx(vcpu)->rmode.vm86_active) { + rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; + save_rflags = to_vmx(vcpu)->rmode.save_rflags; + rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; + } + to_vmx(vcpu)->rflags = rflags; } - return rflags; + return to_vmx(vcpu)->rflags; } static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) { + __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail); + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->rflags = rflags; if (to_vmx(vcpu)->rmode.vm86_active) { to_vmx(vcpu)->rmode.save_rflags = rflags; rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; @@ -1053,7 +1132,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, } if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) + int inc_eip = 0; + if (kvm_exception_is_soft(nr)) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -1151,6 +1233,16 @@ static u64 guest_read_tsc(void) } /* + * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ + * ioctl. In this case the call-back should update internal vmx state to make + * the changes effective. + */ +static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) +{ + /* Nothing to do here */ +} + +/* * writes 'offset' into guest's timestamp counter offset register */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) @@ -1164,6 +1256,11 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) vmcs_write64(TSC_OFFSET, offset + adjustment); } +static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + return target_tsc - native_read_tsc(); +} + /* * Reads an msr value (of 'msr_index') into 'pdata'. * Returns 0 on success, non-0 otherwise. @@ -1243,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) break; #ifdef CONFIG_X86_64 case MSR_FS_BASE: + vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_FS_BASE, data); break; case MSR_GS_BASE: + vmx_segment_cache_clear(vmx); vmcs_writel(GUEST_GS_BASE, data); break; case MSR_KERNEL_GS_BASE: @@ -1689,6 +1788,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->emulation_required = 1; vmx->rmode.vm86_active = 0; + vmx_segment_cache_clear(vmx); + vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); @@ -1712,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu) fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); + vmx_segment_cache_clear(vmx); + vmcs_write16(GUEST_SS_SELECTOR, 0); vmcs_write32(GUEST_SS_AR_BYTES, 0x93); @@ -1775,6 +1878,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu) vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); } + vmx_segment_cache_clear(vmx); + vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); @@ -1851,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) { u32 guest_tr_ar; + vmx_segment_cache_clear(to_vmx(vcpu)); + guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { printk(KERN_DEBUG "%s: tss fixup for long mode. \n", @@ -1998,6 +2105,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); } static u64 construct_eptp(unsigned long root_hpa) @@ -2053,7 +2161,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_save_segment *save; u32 ar; @@ -2075,13 +2182,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu, var->limit = save->limit; ar = save->ar; if (seg == VCPU_SREG_TR - || var->selector == vmcs_read16(sf->selector)) + || var->selector == vmx_read_guest_seg_selector(vmx, seg)) goto use_saved_rmode_seg; } - var->base = vmcs_readl(sf->base); - var->limit = vmcs_read32(sf->limit); - var->selector = vmcs_read16(sf->selector); - ar = vmcs_read32(sf->ar_bytes); + var->base = vmx_read_guest_seg_base(vmx, seg); + var->limit = vmx_read_guest_seg_limit(vmx, seg); + var->selector = vmx_read_guest_seg_selector(vmx, seg); + ar = vmx_read_guest_seg_ar(vmx, seg); use_saved_rmode_seg: if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) ar = 0; @@ -2098,27 +2205,37 @@ use_saved_rmode_seg: static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) { - struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; struct kvm_segment s; if (to_vmx(vcpu)->rmode.vm86_active) { vmx_get_segment(vcpu, &s, seg); return s.base; } - return vmcs_readl(sf->base); + return vmx_read_guest_seg_base(to_vmx(vcpu), seg); } -static int vmx_get_cpl(struct kvm_vcpu *vcpu) +static int __vmx_get_cpl(struct kvm_vcpu *vcpu) { if (!is_protmode(vcpu)) return 0; - if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ + if (!is_long_mode(vcpu) + && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */ return 3; - return vmcs_read16(GUEST_CS_SELECTOR) & 3; + return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3; } +static int vmx_get_cpl(struct kvm_vcpu *vcpu) +{ + if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) { + __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); + to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu); + } + return to_vmx(vcpu)->cpl; +} + + static u32 vmx_segment_access_rights(struct kvm_segment *var) { u32 ar; @@ -2148,6 +2265,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; u32 ar; + vmx_segment_cache_clear(vmx); + if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { vmcs_write16(sf->selector, var->selector); vmx->rmode.tr.selector = var->selector; @@ -2184,11 +2303,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu, ar |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, ar); + __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) { - u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); + u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS); *db = (ar >> 14) & 1; *l = (ar >> 13) & 1; @@ -2775,6 +2895,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) if (ret != 0) goto out; + vmx_segment_cache_clear(vmx); + seg_setup(VCPU_SREG_CS); /* * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode @@ -2904,7 +3026,10 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu) ++vcpu->stat.irq_injections; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) + int inc_eip = 0; + if (vcpu->arch.interrupt.soft) + inc_eip = vcpu->arch.event_exit_inst_len; + if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2937,8 +3062,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu) } ++vcpu->stat.nmi_injections; + vmx->nmi_known_unmasked = false; if (vmx->rmode.vm86_active) { - if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) + if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE) kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); return; } @@ -2961,6 +3087,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) { if (!cpu_has_virtual_nmis()) return to_vmx(vcpu)->soft_vnmi_blocked; + if (to_vmx(vcpu)->nmi_known_unmasked) + return false; return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; } @@ -2974,6 +3102,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) vmx->vnmi_blocked_time = 0; } } else { + vmx->nmi_known_unmasked = !masked; if (masked) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); @@ -3091,7 +3220,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) enum emulation_result er; vect_info = vmx->idt_vectoring_info; - intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + intr_info = vmx->exit_intr_info; if (is_machine_check(intr_info)) return handle_machine_check(vcpu); @@ -3122,7 +3251,6 @@ static int handle_exception(struct kvm_vcpu *vcpu) } error_code = 0; - rip = kvm_rip_read(vcpu); if (intr_info & INTR_INFO_DELIVER_CODE_MASK) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { @@ -3169,6 +3297,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) vmx->vcpu.arch.event_exit_inst_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN); kvm_run->exit_reason = KVM_EXIT_DEBUG; + rip = kvm_rip_read(vcpu); kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; kvm_run->debug.arch.exception = ex_no; break; @@ -3505,9 +3634,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu) switch (type) { case INTR_TYPE_NMI_INTR: vcpu->arch.nmi_injected = false; - if (cpu_has_virtual_nmis()) - vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + vmx_set_nmi_mask(vcpu, true); break; case INTR_TYPE_EXT_INTR: case INTR_TYPE_SOFT_INTR: @@ -3867,12 +3994,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; + + if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY + || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI)) + return; + + vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); + exit_intr_info = vmx->exit_intr_info; /* Handle machine checks before interrupts are enabled */ - if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) - || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI - && is_machine_check(exit_intr_info))) + if (is_machine_check(exit_intr_info)) kvm_machine_check(); /* We need to handle NMIs before interrupts are enabled */ @@ -3886,7 +4018,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) { - u32 exit_intr_info = vmx->exit_intr_info; + u32 exit_intr_info; bool unblock_nmi; u8 vector; bool idtv_info_valid; @@ -3894,6 +4026,13 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; if (cpu_has_virtual_nmis()) { + if (vmx->nmi_known_unmasked) + return; + /* + * Can't use vmx->exit_intr_info since we're not sure what + * the exit reason is. + */ + exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; vector = exit_intr_info & INTR_INFO_VECTOR_MASK; /* @@ -3910,6 +4049,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) vector != DF_VECTOR && !idtv_info_valid) vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI); + else + vmx->nmi_known_unmasked = + !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) + & GUEST_INTR_STATE_NMI); } else if (unlikely(vmx->soft_vnmi_blocked)) vmx->vnmi_blocked_time += ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); @@ -3946,8 +4089,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx, * Clear bit "block by NMI" before VM entry if a NMI * delivery faulted. */ - vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, - GUEST_INTR_STATE_NMI); + vmx_set_nmi_mask(&vmx->vcpu, false); break; case INTR_TYPE_SOFT_EXCEPTION: vmx->vcpu.arch.event_exit_inst_len = @@ -4124,7 +4266,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) ); vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) + | (1 << VCPU_EXREG_RFLAGS) + | (1 << VCPU_EXREG_CPL) | (1 << VCPU_EXREG_PDPTR) + | (1 << VCPU_EXREG_SEGMENTS) | (1 << VCPU_EXREG_CR3)); vcpu->arch.regs_dirty = 0; @@ -4134,7 +4279,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); - vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); @@ -4195,8 +4339,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) goto free_vcpu; vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); + err = -ENOMEM; if (!vmx->guest_msrs) { - err = -ENOMEM; goto uninit_vcpu; } @@ -4215,7 +4359,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) if (err) goto free_vmcs; if (vm_need_virtualize_apic_accesses(kvm)) - if (alloc_apic_access_page(kvm) != 0) + err = alloc_apic_access_page(kvm); + if (err) goto free_vmcs; if (enable_ept) { @@ -4368,6 +4513,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) { } +static int vmx_check_intercept(struct kvm_vcpu *vcpu, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + return X86EMUL_CONTINUE; +} + static struct kvm_x86_ops vmx_x86_ops = { .cpu_has_kvm_support = cpu_has_kvm_support, .disabled_by_bios = vmx_disabled_by_bios, @@ -4449,10 +4601,14 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, + .set_tsc_khz = vmx_set_tsc_khz, .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, + .compute_tsc_offset = vmx_compute_tsc_offset, .set_tdp_cr3 = vmx_set_cr3, + + .check_intercept = vmx_check_intercept, }; static int __init vmx_init(void) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 934b4c6..77c9d86 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -60,22 +60,12 @@ #include <asm/div64.h> #define MAX_IO_MSRS 256 -#define CR0_RESERVED_BITS \ - (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \ - | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \ - | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG)) -#define CR4_RESERVED_BITS \ - (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\ - | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \ - | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \ - | X86_CR4_OSXSAVE \ - | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE)) - -#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR) - #define KVM_MAX_MCE_BANKS 32 #define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) +#define emul_to_vcpu(ctxt) \ + container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt) + /* EFER defaults: * - enable syscall per default because its emulated by KVM * - enable LME and LMA per default on 64 bit KVM @@ -100,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops); int ignore_msrs = 0; module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); +bool kvm_has_tsc_control; +EXPORT_SYMBOL_GPL(kvm_has_tsc_control); +u32 kvm_max_guest_tsc_khz; +EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); + #define KVM_NR_SHARED_MSRS 16 struct kvm_shared_msrs_global { @@ -157,6 +152,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { u64 __read_mostly host_xcr0; +int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt); + static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) { int i; @@ -361,8 +358,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_NMI, vcpu); kvm_make_request(KVM_REQ_EVENT, vcpu); + vcpu->arch.nmi_pending = 1; } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -982,7 +979,15 @@ static inline int kvm_tsc_changes_freq(void) return ret; } -static inline u64 nsec_to_cycles(u64 nsec) +static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +{ + if (vcpu->arch.virtual_tsc_khz) + return vcpu->arch.virtual_tsc_khz; + else + return __this_cpu_read(cpu_tsc_khz); +} + +static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec) { u64 ret; @@ -990,25 +995,24 @@ static inline u64 nsec_to_cycles(u64 nsec) if (kvm_tsc_changes_freq()) printk_once(KERN_WARNING "kvm: unreliable cycle conversion on adjustable rate TSC\n"); - ret = nsec * __this_cpu_read(cpu_tsc_khz); + ret = nsec * vcpu_tsc_khz(vcpu); do_div(ret, USEC_PER_SEC); return ret; } -static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) +static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, - &kvm->arch.virtual_tsc_shift, - &kvm->arch.virtual_tsc_mult); - kvm->arch.virtual_tsc_khz = this_tsc_khz; + &vcpu->arch.tsc_catchup_shift, + &vcpu->arch.tsc_catchup_mult); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) { u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, - vcpu->kvm->arch.virtual_tsc_mult, - vcpu->kvm->arch.virtual_tsc_shift); + vcpu->arch.tsc_catchup_mult, + vcpu->arch.tsc_catchup_shift); tsc += vcpu->arch.last_tsc_write; return tsc; } @@ -1021,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) s64 sdiff; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = data - native_read_tsc(); + offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; sdiff = data - kvm->arch.last_tsc_write; @@ -1037,13 +1041,13 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data) * In that case, for a reliable TSC, we can match TSC offsets, * or make a best guest using elapsed value. */ - if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && + if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) && elapsed < 5ULL * NSEC_PER_SEC) { if (!check_tsc_unstable()) { offset = kvm->arch.last_tsc_offset; pr_debug("kvm: matched tsc offset for %llu\n", data); } else { - u64 delta = nsec_to_cycles(elapsed); + u64 delta = nsec_to_cycles(vcpu, elapsed); offset += delta; pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } @@ -1075,8 +1079,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) local_irq_save(flags); kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); kernel_ns = get_kernel_ns(); - this_tsc_khz = __this_cpu_read(cpu_tsc_khz); - + this_tsc_khz = vcpu_tsc_khz(v); if (unlikely(this_tsc_khz == 0)) { local_irq_restore(flags); kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); @@ -1993,6 +1996,7 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_X86_ROBUST_SINGLESTEP: case KVM_CAP_XSAVE: case KVM_CAP_ASYNC_PF: + case KVM_CAP_GET_TSC_KHZ: r = 1; break; case KVM_CAP_COALESCED_MMIO: @@ -2019,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext) case KVM_CAP_XCRS: r = cpu_has_xsave; break; + case KVM_CAP_TSC_CONTROL: + r = kvm_has_tsc_control; + break; default: r = 0; break; @@ -2120,8 +2127,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) kvm_x86_ops->vcpu_load(vcpu, cpu); if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { /* Make sure TSC doesn't go backwards */ - s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : - native_read_tsc() - vcpu->arch.last_host_tsc; + s64 tsc_delta; + u64 tsc; + + kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); + tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : + tsc - vcpu->arch.last_guest_tsc; + if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { @@ -2139,7 +2151,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - vcpu->arch.last_host_tsc = native_read_tsc(); + kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); } static int is_efer_nx(void) @@ -2324,6 +2336,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); + /* cpuid 0xC0000001.edx */ + const u32 kvm_supported_word5_x86_features = + F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) | + F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) | + F(PMM) | F(PMM_EN); + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); do_cpuid_1_ent(entry, function, index); @@ -2418,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | (1 << KVM_FEATURE_NOP_IO_DELAY) | (1 << KVM_FEATURE_CLOCKSOURCE2) | + (1 << KVM_FEATURE_ASYNC_PF) | (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); entry->ebx = 0; entry->ecx = 0; @@ -2432,6 +2451,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx &= kvm_supported_word6_x86_features; cpuid_mask(&entry->ecx, 6); break; + /*Add support for Centaur's CPUID instruction*/ + case 0xC0000000: + /*Just support up to 0xC0000004 now*/ + entry->eax = min(entry->eax, 0xC0000004); + break; + case 0xC0000001: + entry->edx &= kvm_supported_word5_x86_features; + cpuid_mask(&entry->edx, 5); + break; + case 0xC0000002: + case 0xC0000003: + case 0xC0000004: + /*Now nothing to do, reserved for the future*/ + break; } kvm_x86_ops->set_supported_cpuid(function, entry); @@ -2478,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, if (nent >= cpuid->nent) goto out_free; + /* Add support for Centaur's CPUID instruction. */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) { + do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + + limit = cpuid_entries[nent - 1].eax; + for (func = 0xC0000001; + func <= limit && nent < cpuid->nent; ++func) + do_cpuid_ent(&cpuid_entries[nent], func, 0, + &nent, cpuid->nent); + + r = -E2BIG; + if (nent >= cpuid->nent) + goto out_free; + } + do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, cpuid->nent); @@ -3046,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp, r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); break; } + case KVM_SET_TSC_KHZ: { + u32 user_tsc_khz; + + r = -EINVAL; + if (!kvm_has_tsc_control) + break; + + user_tsc_khz = (u32)arg; + + if (user_tsc_khz >= kvm_max_guest_tsc_khz) + goto out; + + kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz); + + r = 0; + goto out; + } + case KVM_GET_TSC_KHZ: { + r = -EIO; + if (check_tsc_unstable()) + goto out; + + r = vcpu_tsc_khz(vcpu); + + goto out; + } default: r = -EINVAL; } @@ -3595,20 +3674,43 @@ static void kvm_init_msr_list(void) static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, const void *v) { - if (vcpu->arch.apic && - !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) - return 0; + int handled = 0; + int n; - return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); + do { + n = min(len, 8); + if (!(vcpu->arch.apic && + !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + break; + handled += n; + addr += n; + len -= n; + v += n; + } while (len); + + return handled; } static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) { - if (vcpu->arch.apic && - !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) - return 0; + int handled = 0; + int n; + + do { + n = min(len, 8); + if (!(vcpu->arch.apic && + !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v)) + && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v)) + break; + trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v); + handled += n; + addr += n; + len -= n; + v += n; + } while (len); - return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); + return handled; } static void kvm_set_segment(struct kvm_vcpu *vcpu, @@ -3703,37 +3805,43 @@ out: } /* used for instruction fetching */ -static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access | PFERR_FETCH_MASK, exception); } -static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; + return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); } -static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, +static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); } -static int kvm_write_guest_virt_system(gva_t addr, void *val, +static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt, + gva_t addr, void *val, unsigned int bytes, - struct kvm_vcpu *vcpu, struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); void *data = val; int r = X86EMUL_CONTINUE; @@ -3761,13 +3869,15 @@ out: return r; } -static int emulator_read_emulated(unsigned long addr, +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, void *val, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; + int handled; if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); @@ -3786,7 +3896,7 @@ static int emulator_read_emulated(unsigned long addr, if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) goto mmio; - if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) + if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) == X86EMUL_CONTINUE) return X86EMUL_CONTINUE; @@ -3794,18 +3904,24 @@ mmio: /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { - trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); + handled = vcpu_mmio_read(vcpu, gpa, bytes, val); + + if (handled == bytes) return X86EMUL_CONTINUE; - } + + gpa += handled; + bytes -= handled; + val += handled; trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); vcpu->mmio_needed = 1; vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->mmio_size = bytes; + vcpu->run->mmio.len = min(vcpu->mmio_size, 8); vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; + vcpu->mmio_index = 0; return X86EMUL_IO_NEEDED; } @@ -3829,6 +3945,7 @@ static int emulator_write_emulated_onepage(unsigned long addr, struct kvm_vcpu *vcpu) { gpa_t gpa; + int handled; gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); @@ -3847,25 +3964,35 @@ mmio: /* * Is this MMIO handled locally? */ - if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) + handled = vcpu_mmio_write(vcpu, gpa, bytes, val); + if (handled == bytes) return X86EMUL_CONTINUE; + gpa += handled; + bytes -= handled; + val += handled; + vcpu->mmio_needed = 1; + memcpy(vcpu->mmio_data, val, bytes); vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->run->mmio.len = vcpu->mmio_size = bytes; + vcpu->mmio_size = bytes; + vcpu->run->mmio.len = min(vcpu->mmio_size, 8); vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, val, bytes); + memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + vcpu->mmio_index = 0; return X86EMUL_CONTINUE; } -int emulator_write_emulated(unsigned long addr, +int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *val, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + /* Crossing a page boundary? */ if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { int rc, now; @@ -3893,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr, (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) #endif -static int emulator_cmpxchg_emulated(unsigned long addr, +static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, const void *old, const void *new, unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) + struct x86_exception *exception) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); gpa_t gpa; struct page *page; char *kaddr; @@ -3955,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr, emul_write: printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); - return emulator_write_emulated(addr, new, bytes, exception, vcpu); + return emulator_write_emulated(ctxt, addr, new, bytes, exception); } static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) @@ -3974,9 +4102,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) } -static int emulator_pio_in_emulated(int size, unsigned short port, void *val, - unsigned int count, struct kvm_vcpu *vcpu) +static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, void *val, + unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + if (vcpu->arch.pio.count) goto data_avail; @@ -4004,10 +4135,12 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val, return 0; } -static int emulator_pio_out_emulated(int size, unsigned short port, - const void *val, unsigned int count, - struct kvm_vcpu *vcpu) +static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, + int size, unsigned short port, + const void *val, unsigned int count) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); + trace_kvm_pio(1, port, size, count); vcpu->arch.pio.port = port; @@ -4037,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) return kvm_x86_ops->get_segment_base(vcpu, seg); } -int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) +static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) { - kvm_mmu_invlpg(vcpu, address); - return X86EMUL_CONTINUE; + kvm_mmu_invlpg(emul_to_vcpu(ctxt), address); } int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) @@ -4062,22 +4194,20 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); -int emulate_clts(struct kvm_vcpu *vcpu) +static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) { - kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - kvm_x86_ops->fpu_activate(vcpu); - return X86EMUL_CONTINUE; + kvm_emulate_wbinvd(emul_to_vcpu(ctxt)); } -int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) +int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) { - return _kvm_get_dr(vcpu, dr, dest); + return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest); } -int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) +int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) { - return __kvm_set_dr(vcpu, dr, value); + return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value); } static u64 mk_cr_64(u64 curr_cr, u32 new_val) @@ -4085,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val) return (curr_cr & ~((1ULL << 32) - 1)) | new_val; } -static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) +static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); unsigned long value; switch (cr) { @@ -4113,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) return value; } -static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) +static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); int res = 0; switch (cr) { @@ -4141,33 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) return res; } -static int emulator_get_cpl(struct kvm_vcpu *vcpu) +static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) +{ + return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt)); +} + +static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) +{ + kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt); +} + +static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - return kvm_x86_ops->get_cpl(vcpu); + kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt); } -static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_gdt(vcpu, dt); + kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt); } -static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) +static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) { - kvm_x86_ops->get_idt(vcpu, dt); + kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt); } -static unsigned long emulator_get_cached_segment_base(int seg, - struct kvm_vcpu *vcpu) +static unsigned long emulator_get_cached_segment_base( + struct x86_emulate_ctxt *ctxt, int seg) { - return get_segment_base(vcpu, seg); + return get_segment_base(emul_to_vcpu(ctxt), seg); } -static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, - int seg, struct kvm_vcpu *vcpu) +static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, + struct desc_struct *desc, u32 *base3, + int seg) { struct kvm_segment var; - kvm_get_segment(vcpu, &var, seg); + kvm_get_segment(emul_to_vcpu(ctxt), &var, seg); + *selector = var.selector; if (var.unusable) return false; @@ -4192,14 +4336,14 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, return true; } -static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, - int seg, struct kvm_vcpu *vcpu) +static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, + struct desc_struct *desc, u32 base3, + int seg) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); struct kvm_segment var; - /* needed to preserve selector */ - kvm_get_segment(vcpu, &var, seg); - + var.selector = selector; var.base = get_desc_base(desc); #ifdef CONFIG_X86_64 var.base |= ((u64)base3) << 32; @@ -4223,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, return; } -static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) +static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 *pdata) { - struct kvm_segment kvm_seg; + return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); +} - kvm_get_segment(vcpu, &kvm_seg, seg); - return kvm_seg.selector; +static int emulator_set_msr(struct x86_emulate_ctxt *ctxt, + u32 msr_index, u64 data) +{ + return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data); } -static void emulator_set_segment_selector(u16 sel, int seg, - struct kvm_vcpu *vcpu) +static void emulator_halt(struct x86_emulate_ctxt *ctxt) { - struct kvm_segment kvm_seg; + emul_to_vcpu(ctxt)->arch.halt_request = 1; +} - kvm_get_segment(vcpu, &kvm_seg, seg); - kvm_seg.selector = sel; - kvm_set_segment(vcpu, &kvm_seg, seg); +static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt) +{ + preempt_disable(); + kvm_load_guest_fpu(emul_to_vcpu(ctxt)); + /* + * CR0.TS may reference the host fpu state, not the guest fpu state, + * so it may be clear at this point. + */ + clts(); +} + +static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt) +{ + preempt_enable(); +} + +static int emulator_intercept(struct x86_emulate_ctxt *ctxt, + struct x86_instruction_info *info, + enum x86_intercept_stage stage) +{ + return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage); } static struct x86_emulate_ops emulate_ops = { @@ -4248,22 +4414,29 @@ static struct x86_emulate_ops emulate_ops = { .read_emulated = emulator_read_emulated, .write_emulated = emulator_write_emulated, .cmpxchg_emulated = emulator_cmpxchg_emulated, + .invlpg = emulator_invlpg, .pio_in_emulated = emulator_pio_in_emulated, .pio_out_emulated = emulator_pio_out_emulated, - .get_cached_descriptor = emulator_get_cached_descriptor, - .set_cached_descriptor = emulator_set_cached_descriptor, - .get_segment_selector = emulator_get_segment_selector, - .set_segment_selector = emulator_set_segment_selector, + .get_segment = emulator_get_segment, + .set_segment = emulator_set_segment, .get_cached_segment_base = emulator_get_cached_segment_base, .get_gdt = emulator_get_gdt, .get_idt = emulator_get_idt, + .set_gdt = emulator_set_gdt, + .set_idt = emulator_set_idt, .get_cr = emulator_get_cr, .set_cr = emulator_set_cr, .cpl = emulator_get_cpl, .get_dr = emulator_get_dr, .set_dr = emulator_set_dr, - .set_msr = kvm_set_msr, - .get_msr = kvm_get_msr, + .set_msr = emulator_set_msr, + .get_msr = emulator_get_msr, + .halt = emulator_halt, + .wbinvd = emulator_wbinvd, + .fix_hypercall = emulator_fix_hypercall, + .get_fpu = emulator_get_fpu, + .put_fpu = emulator_put_fpu, + .intercept = emulator_intercept, }; static void cache_all_regs(struct kvm_vcpu *vcpu) @@ -4305,12 +4478,17 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int cs_db, cs_l; + /* + * TODO: fix emulate.c to use guest_read/write_register + * instead of direct ->regs accesses, can save hundred cycles + * on Intel for instructions that don't read/change RSP, for + * for example. + */ cache_all_regs(vcpu); kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); - vcpu->arch.emulate_ctxt.vcpu = vcpu; - vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); + vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu); vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); vcpu->arch.emulate_ctxt.mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : @@ -4318,11 +4496,13 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu) ? X86EMUL_MODE_VM86 : cs_l ? X86EMUL_MODE_PROT64 : cs_db ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; + vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu); memset(c, 0, sizeof(struct decode_cache)); memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_from_vcpu = false; } -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) { struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; int ret; @@ -4331,7 +4511,8 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) vcpu->arch.emulate_ctxt.decode.op_bytes = 2; vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; - vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; + vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip + + inc_eip; ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); if (ret != X86EMUL_CONTINUE) @@ -4340,7 +4521,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) vcpu->arch.emulate_ctxt.eip = c->eip; memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); if (irq == NMI_VECTOR) vcpu->arch.nmi_pending = false; @@ -4402,16 +4583,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, { int r; struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + bool writeback = true; kvm_clear_exception_queue(vcpu); - vcpu->arch.mmio_fault_cr2 = cr2; - /* - * TODO: fix emulate.c to use guest_read/write_register - * instead of direct ->regs accesses, can save hundred cycles - * on Intel for instructions that don't read/change RSP, for - * for example. - */ - cache_all_regs(vcpu); if (!(emulation_type & EMULTYPE_NO_DECODE)) { init_emulate_ctxt(vcpu); @@ -4442,13 +4616,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, return EMULATE_DONE; } - /* this is needed for vmware backdor interface to work since it + /* this is needed for vmware backdoor interface to work since it changes registers values during IO operation */ - memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { + vcpu->arch.emulate_regs_need_sync_from_vcpu = false; + memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); + } restart: r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); + if (r == EMULATION_INTERCEPTED) + return EMULATE_DONE; + if (r == EMULATION_FAILED) { if (reexecute_instruction(vcpu, cr2)) return EMULATE_DONE; @@ -4462,21 +4642,28 @@ restart: } else if (vcpu->arch.pio.count) { if (!vcpu->arch.pio.in) vcpu->arch.pio.count = 0; + else + writeback = false; r = EMULATE_DO_MMIO; } else if (vcpu->mmio_needed) { - if (vcpu->mmio_is_write) - vcpu->mmio_needed = 0; + if (!vcpu->mmio_is_write) + writeback = false; r = EMULATE_DO_MMIO; } else if (r == EMULATION_RESTART) goto restart; else r = EMULATE_DONE; - toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); - kvm_make_request(KVM_REQ_EVENT, vcpu); - memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); - kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + if (writeback) { + toggle_interruptibility(vcpu, + vcpu->arch.emulate_ctxt.interruptibility); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_make_request(KVM_REQ_EVENT, vcpu); + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); + } else + vcpu->arch.emulate_regs_need_sync_to_vcpu = true; return r; } @@ -4485,7 +4672,8 @@ EXPORT_SYMBOL_GPL(x86_emulate_instruction); int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) { unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); - int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); + int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt, + size, port, &val, 1); /* do not return to emulator after return from userspace */ vcpu->arch.pio.count = 0; return ret; @@ -4879,8 +5067,9 @@ out: } EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); -int kvm_fix_hypercall(struct kvm_vcpu *vcpu) +int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) { + struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); char instruction[3]; unsigned long rip = kvm_rip_read(vcpu); @@ -4893,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu) kvm_x86_ops->patch_hypercall(vcpu, instruction); - return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); -} - -void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct desc_ptr dt = { limit, base }; - - kvm_x86_ops->set_gdt(vcpu, &dt); -} - -void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base) -{ - struct desc_ptr dt = { limit, base }; - - kvm_x86_ops->set_idt(vcpu, &dt); + return emulator_write_emulated(&vcpu->arch.emulate_ctxt, + rip, instruction, 3, NULL); } static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) @@ -5170,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; + bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; @@ -5207,19 +5384,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) r = 1; goto out; } - if (kvm_check_request(KVM_REQ_NMI, vcpu)) - vcpu->arch.nmi_pending = true; } r = kvm_mmu_reload(vcpu); if (unlikely(r)) goto out; + /* + * An NMI can be injected between local nmi_pending read and + * vcpu->arch.nmi_pending read inside inject_pending_event(). + * But in that case, KVM_REQ_EVENT will be set, which makes + * the race described above benign. + */ + nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); + if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (vcpu->arch.nmi_pending) + if (nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); @@ -5399,6 +5582,41 @@ static int __vcpu_run(struct kvm_vcpu *vcpu) return r; } +static int complete_mmio(struct kvm_vcpu *vcpu) +{ + struct kvm_run *run = vcpu->run; + int r; + + if (!(vcpu->arch.pio.count || vcpu->mmio_needed)) + return 1; + + if (vcpu->mmio_needed) { + vcpu->mmio_needed = 0; + if (!vcpu->mmio_is_write) + memcpy(vcpu->mmio_data + vcpu->mmio_index, + run->mmio.data, 8); + vcpu->mmio_index += 8; + if (vcpu->mmio_index < vcpu->mmio_size) { + run->exit_reason = KVM_EXIT_MMIO; + run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index; + memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8); + run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8); + run->mmio.is_write = vcpu->mmio_is_write; + vcpu->mmio_needed = 1; + return 0; + } + if (vcpu->mmio_is_write) + return 1; + vcpu->mmio_read_completed = 1; + } + vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); + r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); + srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); + if (r != EMULATE_DONE) + return 0; + return 1; +} + int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) { int r; @@ -5425,20 +5643,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) } } - if (vcpu->arch.pio.count || vcpu->mmio_needed) { - if (vcpu->mmio_needed) { - memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); - vcpu->mmio_read_completed = 1; - vcpu->mmio_needed = 0; - } - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); - r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE); - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - if (r != EMULATE_DONE) { - r = 0; - goto out; - } - } + r = complete_mmio(vcpu); + if (r <= 0) + goto out; + if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) kvm_register_write(vcpu, VCPU_REGS_RAX, kvm_run->hypercall.ret); @@ -5455,6 +5663,18 @@ out: int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { + /* + * We are here if userspace calls get_regs() in the middle of + * instruction emulation. Registers state needs to be copied + * back from emulation context to vcpu. Usrapace shouldn't do + * that usually, but some bad designed PV devices (vmware + * backdoor interface) need this to work + */ + struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; + memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + } regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); @@ -5482,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) { + vcpu->arch.emulate_regs_need_sync_from_vcpu = true; + vcpu->arch.emulate_regs_need_sync_to_vcpu = false; + kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); @@ -5592,7 +5815,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason, memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); - kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); + kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); kvm_make_request(KVM_REQ_EVENT, vcpu); return EMULATE_DONE; } @@ -5974,8 +6197,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) } vcpu->arch.pio_data = page_address(page); - if (!kvm->arch.virtual_tsc_khz) - kvm_arch_set_tsc_khz(kvm, max_tsc_khz); + kvm_init_tsc_catchup(vcpu, max_tsc_khz); r = kvm_mmu_create(vcpu); if (r < 0) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index c600da8..e407ed3 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -77,7 +77,7 @@ static inline u32 bit(int bitno) void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); -int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); +int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip); void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 20e3f87..f7a2a05 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -12,6 +12,7 @@ #include <linux/mmiotrace.h> /* kmmio_handler, ... */ #include <linux/perf_event.h> /* perf_sw_event */ #include <linux/hugetlb.h> /* hstate_index_to_shift */ +#include <linux/prefetch.h> /* prefetchw */ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ @@ -964,7 +965,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) struct mm_struct *mm; int fault; int write = error_code & PF_WRITE; - unsigned int flags = FAULT_FLAG_ALLOW_RETRY | + unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE | (write ? FAULT_FLAG_WRITE : 0); tsk = current; @@ -1138,6 +1139,16 @@ good_area: } /* + * Pagefault was interrupted by SIGKILL. We have no reason to + * continue pagefault. + */ + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + if (!(error_code & PF_USER)) + no_context(regs, error_code, address); + return; + } + + /* * Major/minor page fault accounting is only done on the * initial attempt. If we go through a retry, it is extremely * likely that the page will be found in page cache at that point. diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index d420398..f581a18 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) if (!vma_shareable(vma, addr)) return; - spin_lock(&mapping->i_mmap_lock); + mutex_lock(&mapping->i_mmap_mutex); vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud) put_page(virt_to_page(spte)); spin_unlock(&mm->page_table_lock); out: - spin_unlock(&mapping->i_mmap_lock); + mutex_unlock(&mapping->i_mmap_mutex); } /* diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 37b8b0f..3032644 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -16,8 +16,6 @@ #include <asm/tlb.h> #include <asm/proto.h> -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - unsigned long __initdata pgt_buf_start; unsigned long __meminitdata pgt_buf_end; unsigned long __meminitdata pgt_buf_top; diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c index 38e6d17..9f0614d 100644 --- a/arch/x86/mm/pf_in.c +++ b/arch/x86/mm/pf_in.c @@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs) unsigned char *p; struct prefix_bits prf; int i; - unsigned long rv; p = (unsigned char *)ins_addr; p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(reg_rop); i++) - if (reg_rop[i] == opcode) { - rv = REG_READ; + if (reg_rop[i] == opcode) goto do_work; - } for (i = 0; i < ARRAY_SIZE(reg_wop); i++) - if (reg_wop[i] == opcode) { - rv = REG_WRITE; + if (reg_wop[i] == opcode) goto do_work; - } printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " "0x%02x\n", opcode); @@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr) unsigned char *p; struct prefix_bits prf; int i; - unsigned long rv; p = (unsigned char *)ins_addr; p += skip_prefix(p, &prf); p += get_opcode(p, &opcode); for (i = 0; i < ARRAY_SIZE(imm_wop); i++) - if (imm_wop[i] == opcode) { - rv = IMM_WRITE; + if (imm_wop[i] == opcode) goto do_work; - } printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " "0x%02x\n", opcode); diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile new file mode 100644 index 0000000..90568c3 --- /dev/null +++ b/arch/x86/net/Makefile @@ -0,0 +1,4 @@ +# +# Arch-specific network modules +# +obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S new file mode 100644 index 0000000..6687022 --- /dev/null +++ b/arch/x86/net/bpf_jit.S @@ -0,0 +1,140 @@ +/* bpf_jit.S : BPF JIT helper functions + * + * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/linkage.h> +#include <asm/dwarf2.h> + +/* + * Calling convention : + * rdi : skb pointer + * esi : offset of byte(s) to fetch in skb (can be scratched) + * r8 : copy of skb->data + * r9d : hlen = skb->len - skb->data_len + */ +#define SKBDATA %r8 + +sk_load_word_ind: + .globl sk_load_word_ind + + add %ebx,%esi /* offset += X */ +# test %esi,%esi /* if (offset < 0) goto bpf_error; */ + js bpf_error + +sk_load_word: + .globl sk_load_word + + mov %r9d,%eax # hlen + sub %esi,%eax # hlen - offset + cmp $3,%eax + jle bpf_slow_path_word + mov (SKBDATA,%rsi),%eax + bswap %eax /* ntohl() */ + ret + + +sk_load_half_ind: + .globl sk_load_half_ind + + add %ebx,%esi /* offset += X */ + js bpf_error + +sk_load_half: + .globl sk_load_half + + mov %r9d,%eax + sub %esi,%eax # hlen - offset + cmp $1,%eax + jle bpf_slow_path_half + movzwl (SKBDATA,%rsi),%eax + rol $8,%ax # ntohs() + ret + +sk_load_byte_ind: + .globl sk_load_byte_ind + add %ebx,%esi /* offset += X */ + js bpf_error + +sk_load_byte: + .globl sk_load_byte + + cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */ + jle bpf_slow_path_byte + movzbl (SKBDATA,%rsi),%eax + ret + +/** + * sk_load_byte_msh - BPF_S_LDX_B_MSH helper + * + * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf) + * Must preserve A accumulator (%eax) + * Inputs : %esi is the offset value, already known positive + */ +ENTRY(sk_load_byte_msh) + CFI_STARTPROC + cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */ + jle bpf_slow_path_byte_msh + movzbl (SKBDATA,%rsi),%ebx + and $15,%bl + shl $2,%bl + ret + CFI_ENDPROC +ENDPROC(sk_load_byte_msh) + +bpf_error: +# force a return 0 from jit handler + xor %eax,%eax + mov -8(%rbp),%rbx + leaveq + ret + +/* rsi contains offset and can be scratched */ +#define bpf_slow_path_common(LEN) \ + push %rdi; /* save skb */ \ + push %r9; \ + push SKBDATA; \ +/* rsi already has offset */ \ + mov $LEN,%ecx; /* len */ \ + lea -12(%rbp),%rdx; \ + call skb_copy_bits; \ + test %eax,%eax; \ + pop SKBDATA; \ + pop %r9; \ + pop %rdi + + +bpf_slow_path_word: + bpf_slow_path_common(4) + js bpf_error + mov -12(%rbp),%eax + bswap %eax + ret + +bpf_slow_path_half: + bpf_slow_path_common(2) + js bpf_error + mov -12(%rbp),%ax + rol $8,%ax + movzwl %ax,%eax + ret + +bpf_slow_path_byte: + bpf_slow_path_common(1) + js bpf_error + movzbl -12(%rbp),%eax + ret + +bpf_slow_path_byte_msh: + xchg %eax,%ebx /* dont lose A , X is about to be scratched */ + bpf_slow_path_common(1) + js bpf_error + movzbl -12(%rbp),%eax + and $15,%al + shl $2,%al + xchg %eax,%ebx + ret diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c new file mode 100644 index 0000000..bfab3fa --- /dev/null +++ b/arch/x86/net/bpf_jit_comp.c @@ -0,0 +1,654 @@ +/* bpf_jit_comp.c : BPF JIT compiler + * + * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 + * of the License. + */ +#include <linux/moduleloader.h> +#include <asm/cacheflush.h> +#include <linux/netdevice.h> +#include <linux/filter.h> + +/* + * Conventions : + * EAX : BPF A accumulator + * EBX : BPF X accumulator + * RDI : pointer to skb (first argument given to JIT function) + * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n) + * ECX,EDX,ESI : scratch registers + * r9d : skb->len - skb->data_len (headlen) + * r8 : skb->data + * -8(RBP) : saved RBX value + * -16(RBP)..-80(RBP) : BPF_MEMWORDS values + */ +int bpf_jit_enable __read_mostly; + +/* + * assembly code in arch/x86/net/bpf_jit.S + */ +extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[]; +extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[]; + +static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) +{ + if (len == 1) + *ptr = bytes; + else if (len == 2) + *(u16 *)ptr = bytes; + else { + *(u32 *)ptr = bytes; + barrier(); + } + return ptr + len; +} + +#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0) + +#define EMIT1(b1) EMIT(b1, 1) +#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) +#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) +#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) +#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0) + +#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */ +#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */ + +static inline bool is_imm8(int value) +{ + return value <= 127 && value >= -128; +} + +static inline bool is_near(int offset) +{ + return offset <= 127 && offset >= -128; +} + +#define EMIT_JMP(offset) \ +do { \ + if (offset) { \ + if (is_near(offset)) \ + EMIT2(0xeb, offset); /* jmp .+off8 */ \ + else \ + EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \ + } \ +} while (0) + +/* list of x86 cond jumps opcodes (. + s8) + * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32) + */ +#define X86_JB 0x72 +#define X86_JAE 0x73 +#define X86_JE 0x74 +#define X86_JNE 0x75 +#define X86_JBE 0x76 +#define X86_JA 0x77 + +#define EMIT_COND_JMP(op, offset) \ +do { \ + if (is_near(offset)) \ + EMIT2(op, offset); /* jxx .+off8 */ \ + else { \ + EMIT2(0x0f, op + 0x10); \ + EMIT(offset, 4); /* jxx .+off32 */ \ + } \ +} while (0) + +#define COND_SEL(CODE, TOP, FOP) \ + case CODE: \ + t_op = TOP; \ + f_op = FOP; \ + goto cond_branch + + +#define SEEN_DATAREF 1 /* might call external helpers */ +#define SEEN_XREG 2 /* ebx is used */ +#define SEEN_MEM 4 /* use mem[] for temporary storage */ + +static inline void bpf_flush_icache(void *start, void *end) +{ + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + smp_wmb(); + flush_icache_range((unsigned long)start, (unsigned long)end); + set_fs(old_fs); +} + + +void bpf_jit_compile(struct sk_filter *fp) +{ + u8 temp[64]; + u8 *prog; + unsigned int proglen, oldproglen = 0; + int ilen, i; + int t_offset, f_offset; + u8 t_op, f_op, seen = 0, pass; + u8 *image = NULL; + u8 *func; + int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */ + unsigned int cleanup_addr; /* epilogue code offset */ + unsigned int *addrs; + const struct sock_filter *filter = fp->insns; + int flen = fp->len; + + if (!bpf_jit_enable) + return; + + addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL); + if (addrs == NULL) + return; + + /* Before first pass, make a rough estimation of addrs[] + * each bpf instruction is translated to less than 64 bytes + */ + for (proglen = 0, i = 0; i < flen; i++) { + proglen += 64; + addrs[i] = proglen; + } + cleanup_addr = proglen; /* epilogue address */ + + for (pass = 0; pass < 10; pass++) { + /* no prologue/epilogue for trivial filters (RET something) */ + proglen = 0; + prog = temp; + + if (seen) { + EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */ + EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */ + /* note : must save %rbx in case bpf_error is hit */ + if (seen & (SEEN_XREG | SEEN_DATAREF)) + EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */ + if (seen & SEEN_XREG) + CLEAR_X(); /* make sure we dont leek kernel memory */ + + /* + * If this filter needs to access skb data, + * loads r9 and r8 with : + * r9 = skb->len - skb->data_len + * r8 = skb->data + */ + if (seen & SEEN_DATAREF) { + if (offsetof(struct sk_buff, len) <= 127) + /* mov off8(%rdi),%r9d */ + EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len)); + else { + /* mov off32(%rdi),%r9d */ + EMIT3(0x44, 0x8b, 0x8f); + EMIT(offsetof(struct sk_buff, len), 4); + } + if (is_imm8(offsetof(struct sk_buff, data_len))) + /* sub off8(%rdi),%r9d */ + EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len)); + else { + EMIT3(0x44, 0x2b, 0x8f); + EMIT(offsetof(struct sk_buff, data_len), 4); + } + + if (is_imm8(offsetof(struct sk_buff, data))) + /* mov off8(%rdi),%r8 */ + EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data)); + else { + /* mov off32(%rdi),%r8 */ + EMIT3(0x4c, 0x8b, 0x87); + EMIT(offsetof(struct sk_buff, data), 4); + } + } + } + + switch (filter[0].code) { + case BPF_S_RET_K: + case BPF_S_LD_W_LEN: + case BPF_S_ANC_PROTOCOL: + case BPF_S_ANC_IFINDEX: + case BPF_S_ANC_MARK: + case BPF_S_ANC_RXHASH: + case BPF_S_ANC_CPU: + case BPF_S_ANC_QUEUE: + case BPF_S_LD_W_ABS: + case BPF_S_LD_H_ABS: + case BPF_S_LD_B_ABS: + /* first instruction sets A register (or is RET 'constant') */ + break; + default: + /* make sure we dont leak kernel information to user */ + CLEAR_A(); /* A = 0 */ + } + + for (i = 0; i < flen; i++) { + unsigned int K = filter[i].k; + + switch (filter[i].code) { + case BPF_S_ALU_ADD_X: /* A += X; */ + seen |= SEEN_XREG; + EMIT2(0x01, 0xd8); /* add %ebx,%eax */ + break; + case BPF_S_ALU_ADD_K: /* A += K; */ + if (!K) + break; + if (is_imm8(K)) + EMIT3(0x83, 0xc0, K); /* add imm8,%eax */ + else + EMIT1_off32(0x05, K); /* add imm32,%eax */ + break; + case BPF_S_ALU_SUB_X: /* A -= X; */ + seen |= SEEN_XREG; + EMIT2(0x29, 0xd8); /* sub %ebx,%eax */ + break; + case BPF_S_ALU_SUB_K: /* A -= K */ + if (!K) + break; + if (is_imm8(K)) + EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */ + else + EMIT1_off32(0x2d, K); /* sub imm32,%eax */ + break; + case BPF_S_ALU_MUL_X: /* A *= X; */ + seen |= SEEN_XREG; + EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */ + break; + case BPF_S_ALU_MUL_K: /* A *= K */ + if (is_imm8(K)) + EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */ + else { + EMIT2(0x69, 0xc0); /* imul imm32,%eax */ + EMIT(K, 4); + } + break; + case BPF_S_ALU_DIV_X: /* A /= X; */ + seen |= SEEN_XREG; + EMIT2(0x85, 0xdb); /* test %ebx,%ebx */ + if (pc_ret0 != -1) + EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4)); + else { + EMIT_COND_JMP(X86_JNE, 2 + 5); + CLEAR_A(); + EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */ + } + EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */ + break; + case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */ + EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */ + EMIT(K, 4); + EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */ + break; + case BPF_S_ALU_AND_X: + seen |= SEEN_XREG; + EMIT2(0x21, 0xd8); /* and %ebx,%eax */ + break; + case BPF_S_ALU_AND_K: + if (K >= 0xFFFFFF00) { + EMIT2(0x24, K & 0xFF); /* and imm8,%al */ + } else if (K >= 0xFFFF0000) { + EMIT2(0x66, 0x25); /* and imm16,%ax */ + EMIT2(K, 2); + } else { + EMIT1_off32(0x25, K); /* and imm32,%eax */ + } + break; + case BPF_S_ALU_OR_X: + seen |= SEEN_XREG; + EMIT2(0x09, 0xd8); /* or %ebx,%eax */ + break; + case BPF_S_ALU_OR_K: + if (is_imm8(K)) + EMIT3(0x83, 0xc8, K); /* or imm8,%eax */ + else + EMIT1_off32(0x0d, K); /* or imm32,%eax */ + break; + case BPF_S_ALU_LSH_X: /* A <<= X; */ + seen |= SEEN_XREG; + EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */ + break; + case BPF_S_ALU_LSH_K: + if (K == 0) + break; + else if (K == 1) + EMIT2(0xd1, 0xe0); /* shl %eax */ + else + EMIT3(0xc1, 0xe0, K); + break; + case BPF_S_ALU_RSH_X: /* A >>= X; */ + seen |= SEEN_XREG; + EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */ + break; + case BPF_S_ALU_RSH_K: /* A >>= K; */ + if (K == 0) + break; + else if (K == 1) + EMIT2(0xd1, 0xe8); /* shr %eax */ + else + EMIT3(0xc1, 0xe8, K); + break; + case BPF_S_ALU_NEG: + EMIT2(0xf7, 0xd8); /* neg %eax */ + break; + case BPF_S_RET_K: + if (!K) { + if (pc_ret0 == -1) + pc_ret0 = i; + CLEAR_A(); + } else { + EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ + } + /* fallinto */ + case BPF_S_RET_A: + if (seen) { + if (i != flen - 1) { + EMIT_JMP(cleanup_addr - addrs[i]); + break; + } + if (seen & SEEN_XREG) + EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */ + EMIT1(0xc9); /* leaveq */ + } + EMIT1(0xc3); /* ret */ + break; + case BPF_S_MISC_TAX: /* X = A */ + seen |= SEEN_XREG; + EMIT2(0x89, 0xc3); /* mov %eax,%ebx */ + break; + case BPF_S_MISC_TXA: /* A = X */ + seen |= SEEN_XREG; + EMIT2(0x89, 0xd8); /* mov %ebx,%eax */ + break; + case BPF_S_LD_IMM: /* A = K */ + if (!K) + CLEAR_A(); + else + EMIT1_off32(0xb8, K); /* mov $imm32,%eax */ + break; + case BPF_S_LDX_IMM: /* X = K */ + seen |= SEEN_XREG; + if (!K) + CLEAR_X(); + else + EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */ + break; + case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */ + seen |= SEEN_MEM; + EMIT3(0x8b, 0x45, 0xf0 - K*4); + break; + case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */ + seen |= SEEN_XREG | SEEN_MEM; + EMIT3(0x8b, 0x5d, 0xf0 - K*4); + break; + case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */ + seen |= SEEN_MEM; + EMIT3(0x89, 0x45, 0xf0 - K*4); + break; + case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */ + seen |= SEEN_XREG | SEEN_MEM; + EMIT3(0x89, 0x5d, 0xf0 - K*4); + break; + case BPF_S_LD_W_LEN: /* A = skb->len; */ + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4); + if (is_imm8(offsetof(struct sk_buff, len))) + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len)); + else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, len), 4); + } + break; + case BPF_S_LDX_W_LEN: /* X = skb->len; */ + seen |= SEEN_XREG; + if (is_imm8(offsetof(struct sk_buff, len))) + /* mov off8(%rdi),%ebx */ + EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len)); + else { + EMIT2(0x8b, 0x9f); + EMIT(offsetof(struct sk_buff, len), 4); + } + break; + case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */ + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2); + if (is_imm8(offsetof(struct sk_buff, protocol))) { + /* movzwl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol)); + } else { + EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ + EMIT(offsetof(struct sk_buff, protocol), 4); + } + EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */ + break; + case BPF_S_ANC_IFINDEX: + if (is_imm8(offsetof(struct sk_buff, dev))) { + /* movq off8(%rdi),%rax */ + EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev)); + } else { + EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */ + EMIT(offsetof(struct sk_buff, dev), 4); + } + EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */ + EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6)); + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); + EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */ + EMIT(offsetof(struct net_device, ifindex), 4); + break; + case BPF_S_ANC_MARK: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4); + if (is_imm8(offsetof(struct sk_buff, mark))) { + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark)); + } else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, mark), 4); + } + break; + case BPF_S_ANC_RXHASH: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4); + if (is_imm8(offsetof(struct sk_buff, rxhash))) { + /* mov off8(%rdi),%eax */ + EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash)); + } else { + EMIT2(0x8b, 0x87); + EMIT(offsetof(struct sk_buff, rxhash), 4); + } + break; + case BPF_S_ANC_QUEUE: + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2); + if (is_imm8(offsetof(struct sk_buff, queue_mapping))) { + /* movzwl off8(%rdi),%eax */ + EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping)); + } else { + EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */ + EMIT(offsetof(struct sk_buff, queue_mapping), 4); + } + break; + case BPF_S_ANC_CPU: +#ifdef CONFIG_SMP + EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */ + EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */ +#else + CLEAR_A(); +#endif + break; + case BPF_S_LD_W_ABS: + func = sk_load_word; +common_load: seen |= SEEN_DATAREF; + if ((int)K < 0) + goto out; + t_offset = func - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + EMIT1_off32(0xe8, t_offset); /* call */ + break; + case BPF_S_LD_H_ABS: + func = sk_load_half; + goto common_load; + case BPF_S_LD_B_ABS: + func = sk_load_byte; + goto common_load; + case BPF_S_LDX_B_MSH: + if ((int)K < 0) { + if (pc_ret0 != -1) { + EMIT_JMP(addrs[pc_ret0] - addrs[i]); + break; + } + CLEAR_A(); + EMIT_JMP(cleanup_addr - addrs[i]); + break; + } + seen |= SEEN_DATAREF | SEEN_XREG; + t_offset = sk_load_byte_msh - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */ + break; + case BPF_S_LD_W_IND: + func = sk_load_word_ind; +common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG; + t_offset = func - (image + addrs[i]); + EMIT1_off32(0xbe, K); /* mov imm32,%esi */ + EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */ + break; + case BPF_S_LD_H_IND: + func = sk_load_half_ind; + goto common_load_ind; + case BPF_S_LD_B_IND: + func = sk_load_byte_ind; + goto common_load_ind; + case BPF_S_JMP_JA: + t_offset = addrs[i + K] - addrs[i]; + EMIT_JMP(t_offset); + break; + COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE); + COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB); + COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE); + COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE); + COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE); + COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB); + COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE); + COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE); + +cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i]; + t_offset = addrs[i + filter[i].jt] - addrs[i]; + + /* same targets, can avoid doing the test :) */ + if (filter[i].jt == filter[i].jf) { + EMIT_JMP(t_offset); + break; + } + + switch (filter[i].code) { + case BPF_S_JMP_JGT_X: + case BPF_S_JMP_JGE_X: + case BPF_S_JMP_JEQ_X: + seen |= SEEN_XREG; + EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */ + break; + case BPF_S_JMP_JSET_X: + seen |= SEEN_XREG; + EMIT2(0x85, 0xd8); /* test %ebx,%eax */ + break; + case BPF_S_JMP_JEQ_K: + if (K == 0) { + EMIT2(0x85, 0xc0); /* test %eax,%eax */ + break; + } + case BPF_S_JMP_JGT_K: + case BPF_S_JMP_JGE_K: + if (K <= 127) + EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */ + else + EMIT1_off32(0x3d, K); /* cmp imm32,%eax */ + break; + case BPF_S_JMP_JSET_K: + if (K <= 0xFF) + EMIT2(0xa8, K); /* test imm8,%al */ + else if (!(K & 0xFFFF00FF)) + EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */ + else if (K <= 0xFFFF) { + EMIT2(0x66, 0xa9); /* test imm16,%ax */ + EMIT(K, 2); + } else { + EMIT1_off32(0xa9, K); /* test imm32,%eax */ + } + break; + } + if (filter[i].jt != 0) { + if (filter[i].jf) + t_offset += is_near(f_offset) ? 2 : 6; + EMIT_COND_JMP(t_op, t_offset); + if (filter[i].jf) + EMIT_JMP(f_offset); + break; + } + EMIT_COND_JMP(f_op, f_offset); + break; + default: + /* hmm, too complex filter, give up with jit compiler */ + goto out; + } + ilen = prog - temp; + if (image) { + if (unlikely(proglen + ilen > oldproglen)) { + pr_err("bpb_jit_compile fatal error\n"); + kfree(addrs); + module_free(NULL, image); + return; + } + memcpy(image + proglen, temp, ilen); + } + proglen += ilen; + addrs[i] = proglen; + prog = temp; + } + /* last bpf instruction is always a RET : + * use it to give the cleanup instruction(s) addr + */ + cleanup_addr = proglen - 1; /* ret */ + if (seen) + cleanup_addr -= 1; /* leaveq */ + if (seen & SEEN_XREG) + cleanup_addr -= 4; /* mov -8(%rbp),%rbx */ + + if (image) { + WARN_ON(proglen != oldproglen); + break; + } + if (proglen == oldproglen) { + image = module_alloc(max_t(unsigned int, + proglen, + sizeof(struct work_struct))); + if (!image) + goto out; + } + oldproglen = proglen; + } + if (bpf_jit_enable > 1) + pr_err("flen=%d proglen=%u pass=%d image=%p\n", + flen, proglen, pass, image); + + if (image) { + if (bpf_jit_enable > 1) + print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS, + 16, 1, image, proglen, false); + + bpf_flush_icache(image, image + proglen); + + fp->bpf_func = (void *)image; + } +out: + kfree(addrs); + return; +} + +static void jit_free_defer(struct work_struct *arg) +{ + module_free(NULL, arg); +} + +/* run from softirq, we must use a work_struct to call + * module_free() from process context + */ +void bpf_jit_free(struct sk_filter *fp) +{ + if (fp->bpf_func != sk_run_filter) { + struct work_struct *work = (struct work_struct *)fp->bpf_func; + + INIT_WORK(work, jit_free_defer); + schedule_work(work); + } +} diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index bd33620..e6fd847 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -280,12 +280,9 @@ void __init pci_direct_init(int type) int __init pci_direct_probe(void) { - struct resource *region, *region2; - if ((pci_probe & PCI_PROBE_CONF1) == 0) goto type2; - region = request_region(0xCF8, 8, "PCI conf1"); - if (!region) + if (!request_region(0xCF8, 8, "PCI conf1")) goto type2; if (pci_check_type1()) { @@ -293,16 +290,14 @@ int __init pci_direct_probe(void) port_cf9_safe = true; return 1; } - release_resource(region); + release_region(0xCF8, 8); type2: if ((pci_probe & PCI_PROBE_CONF2) == 0) return 0; - region = request_region(0xCF8, 4, "PCI conf2"); - if (!region) + if (!request_region(0xCF8, 4, "PCI conf2")) return 0; - region2 = request_region(0xC000, 0x1000, "PCI conf2"); - if (!region2) + if (!request_region(0xC000, 0x1000, "PCI conf2")) goto fail2; if (pci_check_type2()) { @@ -311,8 +306,8 @@ int __init pci_direct_probe(void) return 2; } - release_resource(region2); + release_region(0xC000, 0x1000); fail2: - release_resource(region); + release_region(0xCF8, 4); return 0; } diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c index 8201165..372e9b8 100644 --- a/arch/x86/pci/irq.c +++ b/arch/x86/pci/irq.c @@ -602,7 +602,9 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN && device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN && - device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) { + device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX) + || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN && + device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) { r->name = "PIIX/ICH"; r->get = pirq_piix_get; r->set = pirq_piix_set; diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c index e282886..750c346 100644 --- a/arch/x86/pci/mmconfig-shared.c +++ b/arch/x86/pci/mmconfig-shared.c @@ -606,6 +606,16 @@ static void __init __pci_mmcfg_init(int early) if (list_empty(&pci_mmcfg_list)) return; + if (pcibios_last_bus < 0) { + const struct pci_mmcfg_region *cfg; + + list_for_each_entry(cfg, &pci_mmcfg_list, list) { + if (cfg->segment) + break; + pcibios_last_bus = cfg->end_bus; + } + } + if (pci_mmcfg_arch_init()) pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; else { diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index b6552b1..bef0bc9 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images) # files to link into the vdso -vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o +vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o # files to link into kernel obj-$(VDSO64-y) += vma.o vdso.o @@ -37,11 +37,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S $(obj)/%.so: $(obj)/%.so.dbg FORCE $(call if_changed,objcopy) +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ - $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) + $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + -fno-omit-frame-pointer -foptimize-sibling-calls $(vobjs): KBUILD_CFLAGS += $(CFL) +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg +CFLAGS_REMOVE_vclock_gettime.o = -pg +CFLAGS_REMOVE_vgetcpu.o = -pg +CFLAGS_REMOVE_vvar.o = -pg + targets += vdso-syms.lds obj-$(VDSO64-y) += vdso-syms.lds diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index ee55754..a724905 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -2,7 +2,7 @@ * Copyright 2006 Andi Kleen, SUSE Labs. * Subject to the GNU Public License, v.2 * - * Fast user context implementation of clock_gettime and gettimeofday. + * Fast user context implementation of clock_gettime, gettimeofday, and time. * * The code should have no internal unresolved relocations. * Check with readelf after changing. @@ -22,9 +22,8 @@ #include <asm/hpet.h> #include <asm/unistd.h> #include <asm/io.h> -#include "vextern.h" -#define gtod vdso_vsyscall_gtod_data +#define gtod (&VVAR(vsyscall_gtod_data)) notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) { @@ -56,22 +55,6 @@ notrace static noinline int do_realtime(struct timespec *ts) return 0; } -/* Copy of the version in kernel/time.c which we cannot directly access */ -notrace static void -vset_normalized_timespec(struct timespec *ts, long sec, long nsec) -{ - while (nsec >= NSEC_PER_SEC) { - nsec -= NSEC_PER_SEC; - ++sec; - } - while (nsec < 0) { - nsec += NSEC_PER_SEC; - --sec; - } - ts->tv_sec = sec; - ts->tv_nsec = nsec; -} - notrace static noinline int do_monotonic(struct timespec *ts) { unsigned long seq, ns, secs; @@ -82,7 +65,17 @@ notrace static noinline int do_monotonic(struct timespec *ts) secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; } while (unlikely(read_seqretry(>od->lock, seq))); - vset_normalized_timespec(ts, secs, ns); + + /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec + * are all guaranteed to be nonnegative. + */ + while (ns >= NSEC_PER_SEC) { + ns -= NSEC_PER_SEC; + ++secs; + } + ts->tv_sec = secs; + ts->tv_nsec = ns; + return 0; } @@ -107,7 +100,17 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts) secs += gtod->wall_to_monotonic.tv_sec; ns += gtod->wall_to_monotonic.tv_nsec; } while (unlikely(read_seqretry(>od->lock, seq))); - vset_normalized_timespec(ts, secs, ns); + + /* wall_time_nsec and wall_to_monotonic.tv_nsec are + * guaranteed to be between 0 and NSEC_PER_SEC. + */ + if (ns >= NSEC_PER_SEC) { + ns -= NSEC_PER_SEC; + ++secs; + } + ts->tv_sec = secs; + ts->tv_nsec = ns; + return 0; } @@ -157,3 +160,32 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) } int gettimeofday(struct timeval *, struct timezone *) __attribute__((weak, alias("__vdso_gettimeofday"))); + +/* This will break when the xtime seconds get inaccurate, but that is + * unlikely */ + +static __always_inline long time_syscall(long *t) +{ + long secs; + asm volatile("syscall" + : "=a" (secs) + : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory"); + return secs; +} + +notrace time_t __vdso_time(time_t *t) +{ + time_t result; + + if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled)) + return time_syscall(t); + + /* This is atomic on x86_64 so we don't need any locks. */ + result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec); + + if (t) + *t = result; + return result; +} +int time(time_t *t) + __attribute__((weak, alias("__vdso_time"))); diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index 4e5dd3b..b96b267 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -23,15 +23,10 @@ VERSION { __vdso_gettimeofday; getcpu; __vdso_getcpu; + time; + __vdso_time; local: *; }; } VDSO64_PRELINK = VDSO_PRELINK; - -/* - * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL. - */ -#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x; -#include "vextern.h" -#undef VEXTERN diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h deleted file mode 100644 index 1683ba2..0000000 --- a/arch/x86/vdso/vextern.h +++ /dev/null @@ -1,16 +0,0 @@ -#ifndef VEXTERN -#include <asm/vsyscall.h> -#define VEXTERN(x) \ - extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden"))); -#endif - -#define VMAGIC 0xfeedbabeabcdefabUL - -/* Any kernel variables used in the vDSO must be exported in the main - kernel's vmlinux.lds.S/vsyscall.h/proper __section and - put into vextern.h and be referenced as a pointer with vdso prefix. - The main kernel later fills in the values. */ - -VEXTERN(jiffies) -VEXTERN(vgetcpu_mode) -VEXTERN(vsyscall_gtod_data) diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c index 9fbc6b2..5463ad5 100644 --- a/arch/x86/vdso/vgetcpu.c +++ b/arch/x86/vdso/vgetcpu.c @@ -11,14 +11,13 @@ #include <linux/time.h> #include <asm/vsyscall.h> #include <asm/vgtod.h> -#include "vextern.h" notrace long __vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) { unsigned int p; - if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { + if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) { /* Load per CPU data from RDTSCP */ native_read_tscp(&p); } else { diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 4b5d26f..7abd2be 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -15,9 +15,6 @@ #include <asm/proto.h> #include <asm/vdso.h> -#include "vextern.h" /* Just for VMAGIC. */ -#undef VEXTERN - unsigned int __read_mostly vdso_enabled = 1; extern char vdso_start[], vdso_end[]; @@ -26,20 +23,10 @@ extern unsigned short vdso_sync_cpuid; static struct page **vdso_pages; static unsigned vdso_size; -static inline void *var_ref(void *p, char *name) -{ - if (*(void **)p != (void *)VMAGIC) { - printk("VDSO: variable %s broken\n", name); - vdso_enabled = 0; - } - return p; -} - static int __init init_vdso_vars(void) { int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; int i; - char *vbase; vdso_size = npages << PAGE_SHIFT; vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); @@ -54,20 +41,6 @@ static int __init init_vdso_vars(void) copy_page(page_address(p), vdso_start + i*PAGE_SIZE); } - vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL); - if (!vbase) - goto oom; - - if (memcmp(vbase, "\177ELF", 4)) { - printk("VDSO: I'm broken; not ELF\n"); - vdso_enabled = 0; - } - -#define VEXTERN(x) \ - *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x; -#include "vextern.h" -#undef VEXTERN - vunmap(vbase); return 0; oom: diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c deleted file mode 100644 index 1b7e703..0000000 --- a/arch/x86/vdso/vvar.c +++ /dev/null @@ -1,12 +0,0 @@ -/* Define pointer to external vDSO variables. - These are part of the vDSO. The kernel fills in the real addresses - at boot time. This is done because when the vdso is linked the - kernel isn't yet and we don't know the final addresses. */ -#include <linux/kernel.h> -#include <linux/time.h> -#include <asm/vsyscall.h> -#include <asm/timex.h> -#include <asm/vgtod.h> - -#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC; -#include "vextern.h" diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 02d7524..dc708dc 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -75,67 +75,12 @@ #include "mmu.h" #include "debugfs.h" -#define MMU_UPDATE_HISTO 30 - /* * Protects atomic reservation decrease/increase against concurrent increases. * Also protects non-atomic updates of current_pages and balloon lists. */ DEFINE_SPINLOCK(xen_reservation_lock); -#ifdef CONFIG_XEN_DEBUG_FS - -static struct { - u32 pgd_update; - u32 pgd_update_pinned; - u32 pgd_update_batched; - - u32 pud_update; - u32 pud_update_pinned; - u32 pud_update_batched; - - u32 pmd_update; - u32 pmd_update_pinned; - u32 pmd_update_batched; - - u32 pte_update; - u32 pte_update_pinned; - u32 pte_update_batched; - - u32 mmu_update; - u32 mmu_update_extended; - u32 mmu_update_histo[MMU_UPDATE_HISTO]; - - u32 prot_commit; - u32 prot_commit_batched; - - u32 set_pte_at; - u32 set_pte_at_batched; - u32 set_pte_at_pinned; - u32 set_pte_at_current; - u32 set_pte_at_kernel; -} mmu_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - if (unlikely(zero_stats)) { - memset(&mmu_stats, 0, sizeof(mmu_stats)); - zero_stats = 0; - } -} - -#define ADD_STATS(elem, val) \ - do { check_zero(); mmu_stats.elem += (val); } while(0) - -#else /* !CONFIG_XEN_DEBUG_FS */ - -#define ADD_STATS(elem, val) do { (void)(val); } while(0) - -#endif /* CONFIG_XEN_DEBUG_FS */ - - /* * Identity map, in addition to plain kernel map. This needs to be * large enough to allocate page table pages to allocate the rest. @@ -243,11 +188,6 @@ static bool xen_page_pinned(void *ptr) return PagePinned(page); } -static bool xen_iomap_pte(pte_t pte) -{ - return pte_flags(pte) & _PAGE_IOMAP; -} - void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) { struct multicall_space mcs; @@ -257,7 +197,7 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) u = mcs.args; /* ptep might be kmapped when using 32-bit HIGHPTE */ - u->ptr = arbitrary_virt_to_machine(ptep).maddr; + u->ptr = virt_to_machine(ptep).maddr; u->val = pte_val_ma(pteval); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); @@ -266,11 +206,6 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) } EXPORT_SYMBOL_GPL(xen_set_domain_pte); -static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval) -{ - xen_set_domain_pte(ptep, pteval, DOMID_IO); -} - static void xen_extend_mmu_update(const struct mmu_update *update) { struct multicall_space mcs; @@ -279,27 +214,17 @@ static void xen_extend_mmu_update(const struct mmu_update *update) mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); if (mcs.mc != NULL) { - ADD_STATS(mmu_update_extended, 1); - ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1); - mcs.mc->args[1]++; - - if (mcs.mc->args[1] < MMU_UPDATE_HISTO) - ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1); - else - ADD_STATS(mmu_update_histo[0], 1); } else { - ADD_STATS(mmu_update, 1); mcs = __xen_mc_entry(sizeof(*u)); MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); - ADD_STATS(mmu_update_histo[1], 1); } u = mcs.args; *u = *update; } -void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) +static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) { struct mmu_update u; @@ -312,17 +237,13 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) u.val = pmd_val_ma(val); xen_extend_mmu_update(&u); - ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); } -void xen_set_pmd(pmd_t *ptr, pmd_t val) +static void xen_set_pmd(pmd_t *ptr, pmd_t val) { - ADD_STATS(pmd_update, 1); - /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -330,8 +251,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val) return; } - ADD_STATS(pmd_update_pinned, 1); - xen_set_pmd_hyper(ptr, val); } @@ -344,35 +263,34 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); } -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval) +static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) { - if (xen_iomap_pte(pteval)) { - xen_set_iomap_pte(ptep, pteval); - goto out; - } + struct mmu_update u; - ADD_STATS(set_pte_at, 1); -// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); - ADD_STATS(set_pte_at_current, mm == current->mm); - ADD_STATS(set_pte_at_kernel, mm == &init_mm); + if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU) + return false; - if (mm == current->mm || mm == &init_mm) { - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - struct multicall_space mcs; - mcs = xen_mc_entry(0); + xen_mc_batch(); - MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); - ADD_STATS(set_pte_at_batched, 1); - xen_mc_issue(PARAVIRT_LAZY_MMU); - goto out; - } else - if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0) - goto out; - } - xen_set_pte(ptep, pteval); + u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; + u.val = pte_val_ma(pteval); + xen_extend_mmu_update(&u); + + xen_mc_issue(PARAVIRT_LAZY_MMU); -out: return; + return true; +} + +static void xen_set_pte(pte_t *ptep, pte_t pteval) +{ + if (!xen_batched_set_pte(ptep, pteval)) + native_set_pte(ptep, pteval); +} + +static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pteval) +{ + xen_set_pte(ptep, pteval); } pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, @@ -389,13 +307,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, xen_mc_batch(); - u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; + u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; u.val = pte_val_ma(pte); xen_extend_mmu_update(&u); - ADD_STATS(prot_commit, 1); - ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - xen_mc_issue(PARAVIRT_LAZY_MMU); } @@ -463,7 +378,7 @@ static pteval_t iomap_pte(pteval_t val) return val; } -pteval_t xen_pte_val(pte_t pte) +static pteval_t xen_pte_val(pte_t pte) { pteval_t pteval = pte.pte; @@ -480,7 +395,7 @@ pteval_t xen_pte_val(pte_t pte) } PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); -pgdval_t xen_pgd_val(pgd_t pgd) +static pgdval_t xen_pgd_val(pgd_t pgd) { return pte_mfn_to_pfn(pgd.pgd); } @@ -511,7 +426,7 @@ void xen_set_pat(u64 pat) WARN_ON(pat != 0x0007010600070106ull); } -pte_t xen_make_pte(pteval_t pte) +static pte_t xen_make_pte(pteval_t pte) { phys_addr_t addr = (pte & PTE_PFN_MASK); @@ -581,20 +496,20 @@ pte_t xen_make_pte_debug(pteval_t pte) PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); #endif -pgd_t xen_make_pgd(pgdval_t pgd) +static pgd_t xen_make_pgd(pgdval_t pgd) { pgd = pte_pfn_to_mfn(pgd); return native_make_pgd(pgd); } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); -pmdval_t xen_pmd_val(pmd_t pmd) +static pmdval_t xen_pmd_val(pmd_t pmd) { return pte_mfn_to_pfn(pmd.pmd); } PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); -void xen_set_pud_hyper(pud_t *ptr, pud_t val) +static void xen_set_pud_hyper(pud_t *ptr, pud_t val) { struct mmu_update u; @@ -607,17 +522,13 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val) u.val = pud_val_ma(val); xen_extend_mmu_update(&u); - ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - xen_mc_issue(PARAVIRT_LAZY_MMU); preempt_enable(); } -void xen_set_pud(pud_t *ptr, pud_t val) +static void xen_set_pud(pud_t *ptr, pud_t val) { - ADD_STATS(pud_update, 1); - /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -625,56 +536,28 @@ void xen_set_pud(pud_t *ptr, pud_t val) return; } - ADD_STATS(pud_update_pinned, 1); - xen_set_pud_hyper(ptr, val); } -void xen_set_pte(pte_t *ptep, pte_t pte) -{ - if (xen_iomap_pte(pte)) { - xen_set_iomap_pte(ptep, pte); - return; - } - - ADD_STATS(pte_update, 1); -// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep)); - ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - #ifdef CONFIG_X86_PAE - ptep->pte_high = pte.pte_high; - smp_wmb(); - ptep->pte_low = pte.pte_low; -#else - *ptep = pte; -#endif -} - -#ifdef CONFIG_X86_PAE -void xen_set_pte_atomic(pte_t *ptep, pte_t pte) +static void xen_set_pte_atomic(pte_t *ptep, pte_t pte) { - if (xen_iomap_pte(pte)) { - xen_set_iomap_pte(ptep, pte); - return; - } - set_64bit((u64 *)ptep, native_pte_val(pte)); } -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - ptep->pte_low = 0; - smp_wmb(); /* make sure low gets written first */ - ptep->pte_high = 0; + if (!xen_batched_set_pte(ptep, native_make_pte(0))) + native_pte_clear(mm, addr, ptep); } -void xen_pmd_clear(pmd_t *pmdp) +static void xen_pmd_clear(pmd_t *pmdp) { set_pmd(pmdp, __pmd(0)); } #endif /* CONFIG_X86_PAE */ -pmd_t xen_make_pmd(pmdval_t pmd) +static pmd_t xen_make_pmd(pmdval_t pmd) { pmd = pte_pfn_to_mfn(pmd); return native_make_pmd(pmd); @@ -682,13 +565,13 @@ pmd_t xen_make_pmd(pmdval_t pmd) PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); #if PAGETABLE_LEVELS == 4 -pudval_t xen_pud_val(pud_t pud) +static pudval_t xen_pud_val(pud_t pud) { return pte_mfn_to_pfn(pud.pud); } PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); -pud_t xen_make_pud(pudval_t pud) +static pud_t xen_make_pud(pudval_t pud) { pud = pte_pfn_to_mfn(pud); @@ -696,7 +579,7 @@ pud_t xen_make_pud(pudval_t pud) } PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); -pgd_t *xen_get_user_pgd(pgd_t *pgd) +static pgd_t *xen_get_user_pgd(pgd_t *pgd) { pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); unsigned offset = pgd - pgd_page; @@ -728,7 +611,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) * 2. It is always pinned * 3. It has no user pagetable attached to it */ -void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) +static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) { preempt_disable(); @@ -741,12 +624,10 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) preempt_enable(); } -void xen_set_pgd(pgd_t *ptr, pgd_t val) +static void xen_set_pgd(pgd_t *ptr, pgd_t val) { pgd_t *user_ptr = xen_get_user_pgd(ptr); - ADD_STATS(pgd_update, 1); - /* If page is not pinned, we can just update the entry directly */ if (!xen_page_pinned(ptr)) { @@ -758,9 +639,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val) return; } - ADD_STATS(pgd_update_pinned, 1); - ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU); - /* If it's pinned, then we can at least batch the kernel and user updates together. */ xen_mc_batch(); @@ -1162,14 +1040,14 @@ void xen_mm_unpin_all(void) spin_unlock(&pgd_lock); } -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) +static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) { spin_lock(&next->page_table_lock); xen_pgd_pin(next); spin_unlock(&next->page_table_lock); } -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { spin_lock(&mm->page_table_lock); xen_pgd_pin(mm); @@ -1256,7 +1134,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm) * pagetable because of lazy tlb flushing. This means we need need to * switch all CPUs off this pagetable before we can unpin it. */ -void xen_exit_mmap(struct mm_struct *mm) +static void xen_exit_mmap(struct mm_struct *mm) { get_cpu(); /* make sure we don't move around */ xen_drop_mm_ref(mm); @@ -2371,7 +2249,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token, struct remap_data *rmd = data; pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); - rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; + rmd->mmu_update->ptr = virt_to_machine(ptep).maddr; rmd->mmu_update->val = pte_val_ma(pte); rmd->mmu_update++; @@ -2425,7 +2303,6 @@ out: EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); #ifdef CONFIG_XEN_DEBUG_FS - static int p2m_dump_open(struct inode *inode, struct file *filp) { return single_open(filp, p2m_dump_show, NULL); @@ -2437,65 +2314,4 @@ static const struct file_operations p2m_dump_fops = { .llseek = seq_lseek, .release = single_release, }; - -static struct dentry *d_mmu_debug; - -static int __init xen_mmu_debugfs(void) -{ - struct dentry *d_xen = xen_init_debugfs(); - - if (d_xen == NULL) - return -ENOMEM; - - d_mmu_debug = debugfs_create_dir("mmu", d_xen); - - debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); - - debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update); - debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug, - &mmu_stats.pgd_update_pinned); - debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug, - &mmu_stats.pgd_update_pinned); - - debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update); - debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug, - &mmu_stats.pud_update_pinned); - debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug, - &mmu_stats.pud_update_pinned); - - debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update); - debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug, - &mmu_stats.pmd_update_pinned); - debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug, - &mmu_stats.pmd_update_pinned); - - debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update); -// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug, -// &mmu_stats.pte_update_pinned); - debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug, - &mmu_stats.pte_update_pinned); - - debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update); - debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug, - &mmu_stats.mmu_update_extended); - xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug, - mmu_stats.mmu_update_histo, 20); - - debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at); - debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug, - &mmu_stats.set_pte_at_batched); - debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug, - &mmu_stats.set_pte_at_current); - debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug, - &mmu_stats.set_pte_at_kernel); - - debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit); - debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug, - &mmu_stats.prot_commit_batched); - - debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops); - return 0; -} -fs_initcall(xen_mmu_debugfs); - -#endif /* CONFIG_XEN_DEBUG_FS */ +#endif /* CONFIG_XEN_DEBUG_FS */ diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 537bb9a..73809bb 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -15,43 +15,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); - -void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next); -void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); -void xen_exit_mmap(struct mm_struct *mm); - -pteval_t xen_pte_val(pte_t); -pmdval_t xen_pmd_val(pmd_t); -pgdval_t xen_pgd_val(pgd_t); - -pte_t xen_make_pte(pteval_t); -pmd_t xen_make_pmd(pmdval_t); -pgd_t xen_make_pgd(pgdval_t); - -void xen_set_pte(pte_t *ptep, pte_t pteval); -void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, - pte_t *ptep, pte_t pteval); - -#ifdef CONFIG_X86_PAE -void xen_set_pte_atomic(pte_t *ptep, pte_t pte); -void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep); -void xen_pmd_clear(pmd_t *pmdp); -#endif /* CONFIG_X86_PAE */ - -void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval); -void xen_set_pud(pud_t *ptr, pud_t val); -void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval); -void xen_set_pud_hyper(pud_t *ptr, pud_t val); - -#if PAGETABLE_LEVELS == 4 -pudval_t xen_pud_val(pud_t pud); -pud_t xen_make_pud(pudval_t pudval); -void xen_set_pgd(pgd_t *pgdp, pgd_t pgd); -void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd); -#endif - -pgd_t *xen_get_user_pgd(pgd_t *pgd); - pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte); diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c index bfd0632..b480d42 100644 --- a/arch/x86/xen/pci-swiotlb-xen.c +++ b/arch/x86/xen/pci-swiotlb-xen.c @@ -36,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void) /* If running as PV guest, either iommu=soft, or swiotlb=force will * activate this IOMMU. If running as PV privileged, activate it - * irregardlesss. + * irregardless. */ if ((xen_initial_domain() || swiotlb || swiotlb_force) && (xen_pv_domain())) |